diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 1befa3715c42e8db9179a1994e870f0d09aad9dd..20cdbc5e52a8b93ce352aad0c36874acdc91e501 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -12,6 +12,7 @@ set(SOURCES src/context.c # src/simple_decoder_parser.c src/simple_decoder_parser_arc_eager.c src/simple_decoder_tagparser_arc_eager.c + src/simple_decoder_parser_arc_eager_error_predictor.c # src/simple_decoder_forrest.c src/simple_decoder_tagger.c src/simple_decoder_tagger_error_predictor.c @@ -57,6 +58,12 @@ target_link_libraries(maca_error_predictor_tagger_mcf2cff transparse) target_link_libraries(maca_error_predictor_tagger_mcf2cff maca_common) install (TARGETS maca_error_predictor_tagger_mcf2cff DESTINATION bin) +add_executable(maca_error_predictor_parser_arc_eager_mcf2cff ./src/maca_error_predictor_parser_arc_eager_mcf2cff.c) +target_link_libraries(maca_error_predictor_parser_arc_eager_mcf2cff perceptron) +target_link_libraries(maca_error_predictor_parser_arc_eager_mcf2cff transparse) +target_link_libraries(maca_error_predictor_parser_arc_eager_mcf2cff maca_common) +install (TARGETS maca_error_predictor_parser_arc_eager_mcf2cff DESTINATION bin) + add_executable(maca_trans_morpho_mcf2cff ./src/maca_trans_morpho_mcf2cff.c) target_link_libraries(maca_trans_morpho_mcf2cff perceptron) target_link_libraries(maca_trans_morpho_mcf2cff transparse) @@ -105,6 +112,12 @@ target_link_libraries(maca_trans_parser transparse) target_link_libraries(maca_trans_parser maca_common) install (TARGETS maca_trans_parser DESTINATION bin) +add_executable(maca_error_predictor_parser ./src/maca_error_predictor_parser.c) +target_link_libraries(maca_error_predictor_parser perceptron) +target_link_libraries(maca_error_predictor_parser transparse) +target_link_libraries(maca_error_predictor_parser maca_common) +install (TARGETS maca_error_predictor_parser DESTINATION bin) + add_executable(maca_trans_tagparser ./src/maca_trans_tagparser.c) target_link_libraries(maca_trans_tagparser perceptron) target_link_libraries(maca_trans_tagparser transparse) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index ad4fb73ea78ddce14fee649b8bff03901677e6a2..547c862dd57d44998472ed0f8064a0eaa4080047 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -24,42 +24,39 @@ void context_free(context *ctx) if(ctx->root_label) free(ctx->root_label); if(ctx->vocabs_filename) free(ctx->vocabs_filename); if(ctx->fplm_filename) free(ctx->fplm_filename); - if(ctx->json_filename) free(ctx->json_filename); - if(ctx->dnn_model_filename) free(ctx->dnn_model_filename); - - if (ctx->mcd_struct) - mcd_free(ctx->mcd_struct); + if (ctx->mcd_struct) + mcd_free(ctx->mcd_struct); + if (ctx->mcd_struct_error) - mcd_free(ctx->mcd_struct_error); + mcd_free(ctx->mcd_struct_error); + if (ctx->vocabs) - dico_vec_free(ctx->vocabs); - + dico_vec_free(ctx->vocabs); + if (ctx->vocabs_error) - dico_vec_free(ctx->vocabs_error); - + dico_vec_free(ctx->vocabs_error); + if(ctx->d_perceptron_features) dico_free(ctx->d_perceptron_features); - - if(ctx->d_perceptron_features_error) + + if(ctx->d_perceptron_features_error) dico_free(ctx->d_perceptron_features_error); - /* - if(ctx->mcd_struct) + if(ctx->mcd_struct) mcd_free(ctx->mcd_struct); */ if(ctx->features_model) feat_model_free(ctx->features_model); if(ctx->features_model_error) - feat_model_free(ctx->features_model); + feat_model_free(ctx->features_model_error); if(ctx->f2p) form2pos_free(ctx->f2p); - free(ctx); } @@ -67,6 +64,7 @@ context *context_new(void) { context *ctx = (context *)memalloc(sizeof(context)); + ctx->force = 0; ctx->verbose = 0; ctx->program_name = NULL; ctx->input_filename = NULL; @@ -121,13 +119,13 @@ context *context_new(void) void context_general_help_message(context *ctx) { - fprintf(stderr, "usage: %s [options]\n", ctx->program_name); - fprintf(stderr, "Options:\n"); - fprintf(stderr, "\t-h --help : print this message\n"); - fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); - fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); - fprintf(stderr, "\t-D --maca_data_path <str> : path to the maca_data directory\n"); - fprintf(stderr, "\t-L --language <str> : identifier of the language to use (default is fr)\n"); + fprintf(stderr, "usage: %s [options]\n", ctx->program_name); + fprintf(stderr, "Options:\n"); + fprintf(stderr, "\t-h --help : print this message\n"); + fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); + fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); + fprintf(stderr, "\t-D --maca_data_path <str> : path to the maca_data directory\n"); + fprintf(stderr, "\t-L --language <str> : identifier of the language to use (default is fr)\n"); } void context_model_help_message(context *ctx){ @@ -207,9 +205,10 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[26] = + static struct option long_options[27] = { {"help", no_argument, 0, 'h'}, + {"force", no_argument, 0, 'K'}, {"verbose", no_argument, 0, 'v'}, {"debug", no_argument, 0, 'd'}, {"conll", no_argument, 0, 'c'}, @@ -240,93 +239,96 @@ context *context_read_options(int argc, char *argv[]) opterr = 0; - while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:J:N:w:l:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hKvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:J:N:w:l:", long_options, &option_index)) != -1){ switch (c) { case 'h': - ctx->help = 1; - break; + ctx->help = 1; + break; + case 'K' : + ctx->force = 1; + break; case 'v': - ctx->verbose = 1; - break; + ctx->verbose = 1; + break; case 'd': - ctx->debug_mode = 1; - break; + ctx->debug_mode = 1; + break; case 'c': - ctx->conll = 1; - break; + ctx->conll = 1; + break; case 'T': - ctx->trace_mode = 1; - break; + ctx->trace_mode = 1; + break; case 'm': - ctx->perc_model_filename = strdup(optarg); - break; + ctx->perc_model_filename = strdup(optarg); + break; case 'i': - ctx->input_filename = strdup(optarg); - break; + ctx->input_filename = strdup(optarg); + break; case 'n': - ctx->iteration_nb = atoi(optarg); - break; + ctx->iteration_nb = atoi(optarg); + break; case 'x': - ctx->cff_filename = strdup(optarg); - break; + ctx->cff_filename = strdup(optarg); + break; case 'w': - ctx->fplm_filename = strdup(optarg); - break; + ctx->fplm_filename = strdup(optarg); + break; case 'u': - ctx->feature_cutoff = atoi(optarg); - break; + ctx->feature_cutoff = atoi(optarg); + break; case 'r': - ctx->hash_ratio = atof(optarg); - break; + ctx->hash_ratio = atof(optarg); + break; case 'M': - ctx->mode = (!strcmp(optarg, "TEST"))? TEST_MODE : TRAIN_MODE; - break; + ctx->mode = (!strcmp(optarg, "TEST"))? TEST_MODE : TRAIN_MODE; + break; case 'b': - ctx->beam_width = atoi(optarg); - break; + ctx->beam_width = atoi(optarg); + break; case 'f': - ctx->fann_filename = strdup(optarg); - break; + ctx->fann_filename = strdup(optarg); + break; case 'l': - ctx->l_rules_filename = strdup(optarg); - break; + ctx->l_rules_filename = strdup(optarg); + break; case 's': - ctx->sent_nb = atoi(optarg); - break; + ctx->sent_nb = atoi(optarg); + break; case 'C': - ctx->mcd_filename = strdup(optarg); - break; + ctx->mcd_filename = strdup(optarg); + break; case 'F': - ctx->features_model_filename = strdup(optarg); - break; + ctx->features_model_filename = strdup(optarg); + break; case 'V': - ctx->vocabs_filename = strdup(optarg); - break; + ctx->vocabs_filename = strdup(optarg); + break; case 'L': - if (ctx->language) free(ctx->language); // libérer le default (strdup("fr") ) - ctx->language = strdup(optarg); - break; + if (ctx->language) free(ctx->language); // libérer le default (strdup("fr") ) + ctx->language = strdup(optarg); + break; case 'D': - ctx->maca_data_path = strdup(optarg); - break; + ctx->maca_data_path = strdup(optarg); + break; case 'R': - if (ctx->root_label) free(ctx->root_label); // libérer le default (strdup("root") ) - ctx->root_label = strdup(optarg); - break; + if (ctx->root_label) free(ctx->root_label); // libérer le default (strdup("root") ) + ctx->root_label = strdup(optarg); + break; case 'P': - ctx->f2p_filename = strdup(optarg); - if(!strcmp(ctx->f2p_filename, "_") || !strcmp(ctx->f2p_filename, "NULL")) - ctx->f2p = NULL; - else - ctx->f2p = form2pos_read(ctx->f2p_filename); - break; + ctx->f2p_filename = strdup(optarg); + if(!strcmp(ctx->f2p_filename, "_") || !strcmp(ctx->f2p_filename, "NULL")) + ctx->f2p = NULL; + else + ctx->f2p = form2pos_read(ctx->f2p_filename); + break; case 'N': - ctx->dnn_model_filename = strdup(optarg); - break; + ctx->dnn_model_filename = strdup(optarg); + break; case 'J': - ctx->json_filename = strdup(optarg); - break; + ctx->json_filename = strdup(optarg); + break; } } @@ -334,7 +336,7 @@ context *context_read_options(int argc, char *argv[]) ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); else ctx->mcd_struct = mcd_build_wpmlgfs(); - /* ctx->mcd_struct = mcd_build_wplgfs(); */ + /* ctx->mcd_struct = mcd_build_wplgfs(); */ /* initialize maca_data_path field */ @@ -346,12 +348,12 @@ context *context_read_options(int argc, char *argv[]) free(ctx->maca_data_path); } else { - char *e = getenv("MACAON_DIR"); - if (e != NULL) { + char *e = getenv("MACAON_DIR"); + if (e != NULL) { strcpy(absolute_path, e); - } else { + } else { fprintf(stderr, "WARNING: the environment variable MACAON_DIR is not defined\n"); - } + } } strcat(absolute_path, "/"); strcat(absolute_path, ctx->language); diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 179f0477871040e9dc20ce669fe02f85396f4ba4..e191bc3dd2fb9d7ffca144416ed03ff6717696f8 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -14,10 +14,16 @@ #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" +#define DEFAULT_MULTI_COL_DESC_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.mcd" #define DEFAULT_FEATURES_MODEL_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.fm" #define DEFAULT_VOCABS_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.vocab" #define DEFAULT_MODEL_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.model" +#define DEFAULT_MULTI_COL_DESC_PARSER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_parser.mcd" +#define DEFAULT_FEATURES_MODEL_PARSER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_parser.fm" +#define DEFAULT_VOCABS_PARSER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_parser.vocab" +#define DEFAULT_MODEL_PARSER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_parser.model" + #define DEFAULT_MULTI_COL_DESC_LEMMATIZER_FILENAME "maca_trans_lemmatizer.mcd" #define DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME "maca_trans_lemmatizer.fm" #define DEFAULT_VOCABS_LEMMATIZER_FILENAME "maca_trans_lemmatizer.vocab" @@ -57,6 +63,7 @@ typedef struct { int help; + int force; char *program_name; char *input_filename; char *perc_model_filename; diff --git a/maca_trans_parser/src/maca_error_predictor_parser.c b/maca_trans_parser/src/maca_error_predictor_parser.c new file mode 100644 index 0000000000000000000000000000000000000000..ac6cae4162c454637cc803981fa0daf3a0587020 --- /dev/null +++ b/maca_trans_parser/src/maca_error_predictor_parser.c @@ -0,0 +1,125 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"feat_fct.h" +#include"feature_table.h" +#include"dico.h" +#include"beam.h" +#include"form2pos.h" +#include"simple_decoder_parser_arc_eager_error_predictor.h" +#include"config2feat_vec.h" + +void decode_parser_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_beam_help_message(ctx); + context_conll_help_message(ctx); + fprintf(stderr, "INPUT\n"); + context_input_help_message(ctx); + context_mcd_help_message(ctx); + context_model_help_message(ctx); + context_vocabs_help_message(ctx); + context_features_model_help_message(ctx); + context_f2p_filename_help_message(ctx); +} + +void decode_parser_check_options(context *ctx){ + if(ctx->help + /*!ctx->conll_filename*/ + /* || !ctx->perc_model_filename + || !ctx->mcd_filename + || !ctx->vocabs_filename + || !ctx->features_model_filename*/ + ){ + decode_parser_help_message(ctx); + exit(1); + } +} + +void decode_parser_set_linguistic_resources_filenames(context *ctx) +{ + char absolute_filename[500]; + + if(!ctx->perc_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_MODEL_FILENAME); + ctx->perc_model_filename = strdup(absolute_filename); + } + + if(!ctx->vocabs_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_VOCABS_FILENAME); + ctx->vocabs_filename = strdup(absolute_filename); + } + + if(!ctx->l_rules_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_FEATURES_MODEL_PARSER_ERROR_PREDICTOR_FILENAME); + ctx->l_rules_filename = strdup(absolute_filename); + } + + if(!ctx->fann_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_VOCABS_PARSER_ERROR_PREDICTOR_FILENAME); + ctx->fann_filename = strdup(absolute_filename); + } + + if(!ctx->features_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_FEATURES_MODEL_FILENAME); + ctx->features_model_filename = strdup(absolute_filename); + } + + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + } +} + + +int main(int argc, char *argv[]) +{ + context *ctx = context_read_options(argc, argv); + decode_parser_check_options(ctx); + + decode_parser_set_linguistic_resources_filenames(ctx); + + ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); + + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + ctx->vocabs_error = dico_vec_read(ctx->fann_filename, ctx->hash_ratio); + + ctx->features_model_error = feat_model_read(ctx->l_rules_filename, feat_lib_build(), ctx->verbose); + ctx->mcd_struct_error = mcd_copy(ctx->mcd_struct); + + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + mcd_link_to_dico(ctx->mcd_struct_error, ctx->vocabs_error, ctx->verbose); + + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + ctx->d_perceptron_features_error = dico_vec_get_dico(ctx->vocabs_error, (char *)"d_perceptron_features"); + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + + if(ctx->dico_labels == NULL){ + fprintf(stderr, "cannot find label names\n"); + return 1; + } + + ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3; + + char perc_error_filename[500]; + strcpy(perc_error_filename, ctx->maca_data_path); + strcat(perc_error_filename, DEFAULT_MODEL_PARSER_ERROR_PREDICTOR_FILENAME); + + simple_decoder_parser_arc_eager_error_predictor(ctx, perc_error_filename); + + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/maca_error_predictor_parser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_error_predictor_parser_arc_eager_mcf2cff.c new file mode 100644 index 0000000000000000000000000000000000000000..0abe7af00df765c4165599e6eb207711bc8de322 --- /dev/null +++ b/maca_trans_parser/src/maca_error_predictor_parser_arc_eager_mcf2cff.c @@ -0,0 +1,337 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include<ctype.h> +#include"movement_parser_arc_eager.h" +#include"oracle_parser_arc_eager.h" +#include"feat_fct.h" +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"word_emb.h" +#include"config2feat_vec.h" +#include"feature_table.h" +#include"dico.h" +#include"mcd.h" + + +void maca_error_predictor_parser_mcf2cff_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_vocabs_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_vocabs_help_message(ctx); + +} + +void maca_error_predictor_parser_mcf2cff_check_options(context *ctx) +{ + if(!ctx->input_filename + || ctx->help + /* || !ctx->mcd_filename */ + /* || !(ctx->cff_filename || ctx->fann_filename) */ + ){ + maca_error_predictor_parser_mcf2cff_help_message(ctx); + exit(1); + } +} + +int config_is_equal_parser(config *c1, config *c2, int co1, int co2) // 3 for bm2p , 2 .. , 1 now, 0 no errors +{ + return ((co1==co2) ? 0 : 1); +} + +void generate_training_file_error(FILE *output_file, context *ctx) +{ + // oracle + config *config_oracle; + int mvt_code_oracle; + char mvt_type_oracle; + int mvt_label_oracle; + feat_vec *fv_oracle = feat_vec_new(feature_types_nb); + int sentence_nb = 0; + int root_label_oracle = dico_string2int(ctx->dico_labels, (char *) ctx->root_label); + word_buffer *ref_oracle = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct); + FILE *mcf_file_oracle = myfopen(ctx->input_filename, "r"); + + // prediction + config *config_predicted; + feat_vec *fv_predicted = feat_vec_new(feature_types_nb); + FILE *mcf_file_predicted = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int root_label_predicted; + int mvt_code_predicted; + int mvt_type_predicted; + int mvt_label_predicted; + float max; + int result; + int argmax1, argmax2; + float max1, max2; + //int index; + + //dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); + //dico *dico_pos_error = dico_vec_get_dico(ctx->vocabs_error, (char *)"POS"); + + + /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */ + /* the idea is to ignore syntax in the mcf file that will be read */ + /* it is ugly !!! */ + + mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_SENT_SEG); + + root_label_predicted = dico_string2int(ctx->dico_labels, ctx->root_label); + if(root_label_predicted == -1) root_label_predicted = 0; + config_predicted = config_new(mcf_file_predicted, ctx->mcd_struct, 5); + + config_oracle = config_new(mcf_file_oracle, mcd_struct_hyp, 5); + + while((!word_buffer_end(ref_oracle) && (sentence_nb < ctx->sent_nb)) || !config_is_terminal(config_predicted)){ + + mvt_code_oracle = oracle_parser_arc_eager(config_oracle, ref_oracle, root_label_oracle); + mvt_type_oracle = movement_parser_type(mvt_code_oracle); + mvt_label_oracle = movement_parser_label(mvt_code_oracle); + + + config2feat_vec_cff(ctx->features_model, config_oracle, ctx->d_perceptron_features, fv_oracle, LOOKUP_MODE); + + switch(mvt_type_oracle){ + case MVT_PARSER_EOS : + movement_parser_eos(config_oracle); + sentence_nb++; + if((sentence_nb % 100) == 0) + fprintf(stderr, "\rsentence %d", sentence_nb); + break; + case MVT_PARSER_LEFT : + movement_parser_left_arc(config_oracle, mvt_label_oracle); + break; + case MVT_PARSER_RIGHT : + movement_parser_right_arc(config_oracle, mvt_label_oracle); + word_buffer_move_right(ref_oracle); + break; + case MVT_PARSER_REDUCE : + movement_parser_reduce(config_oracle); + break; + case MVT_PARSER_ROOT : + movement_parser_root(config_oracle, root_label_oracle); + break; + case MVT_PARSER_SHIFT : + movement_parser_shift(config_oracle); + word_buffer_move_right(ref_oracle); + break; + } + + // predicted + + /* forced EOS (the element on the top of the stack is eos, but the preceding movement is not MVT_PARSER_EOS */ + /* which means that the top of the stack got its eos status from input */ + /* force the parser to finish parsing the sentence (perform all pending reduce actions) and determine root of the sentence */ + + if((word_get_sent_seg(stack_top(config_get_stack(config_predicted))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(config_predicted))) != MVT_PARSER_EOS)){ + word_set_sent_seg(stack_top(config_get_stack(config_predicted)), -1); + movement_parser_eos(config_predicted); + while(movement_parser_reduce(config_predicted)); + while(movement_parser_root(config_predicted, root_label_predicted)); + } + + /* normal behaviour, ask classifier what is the next movement to do and do it */ + else{ + config2feat_vec_cff(ctx->features_model_error, config_predicted, ctx->d_perceptron_features_error, fv_predicted, TRAIN_MODE); + + mvt_code_predicted = feature_table_argmax(fv_oracle, ft, &max); + mvt_type_predicted = movement_parser_type(mvt_code_predicted); + mvt_label_predicted = movement_parser_label(mvt_code_predicted); + + if((mvt_type_predicted == MVT_PARSER_EOS) && (word_get_sent_seg(stack_top(config_get_stack(config_predicted))) == 0)){ + feature_table_argmax_1_2(fv_oracle, ft, &argmax1, &max1, &argmax2, &max2); + mvt_code_predicted = argmax2; + mvt_type_predicted = movement_parser_type(mvt_code_predicted); + mvt_label_predicted = movement_parser_label(mvt_code_predicted); + } + + if(ctx->debug_mode){ + printf("Oracle : "); + movement_parser_print(stdout, mvt_code_oracle, ctx->dico_labels); + printf("\nPredicted : "); + movement_parser_print(stdout, mvt_code_predicted, ctx->dico_labels); + printf("\n"); + config_print(stdout,config_predicted); + if (mvt_code_oracle!=mvt_code_predicted) + fprintf(stdout, "**************** DIFFERENT CHOICE ***********\n\n"); + else + fprintf(stdout, "**************** EQUAL CHOICE ***********\n\n"); + } + + result = 0; + switch(mvt_type_predicted){ + case MVT_PARSER_LEFT : + result = movement_parser_left_arc(config_predicted, mvt_label_predicted); + break; + case MVT_PARSER_RIGHT: + result = movement_parser_right_arc(config_predicted, mvt_label_predicted); + break; + case MVT_PARSER_REDUCE: + result = movement_parser_reduce(config_predicted); + break; + case MVT_PARSER_ROOT: + result = movement_parser_root(config_predicted, root_label_predicted); + break; + case MVT_PARSER_EOS: + result = movement_parser_eos(config_predicted); + break; + case MVT_PARSER_SHIFT: + result = movement_parser_shift(config_predicted); + } + + if(result == 0){ + result = movement_parser_shift(config_predicted); + if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! */ + while(!stack_is_empty(config_get_stack(config_predicted))) + movement_parser_root(config_predicted, root_label_predicted); + } + } + } + + if(!ctx->debug_mode || output_file!=stdout) { + fprintf(output_file, "%d", ((config_is_equal_parser(config_oracle, config_predicted, mvt_code_oracle, mvt_code_predicted)))); + feat_vec_print(output_file, fv_predicted); + } + + + } + /* + config_free(c); + feat_vec_free(fv); + feature_table_free(ft); + if(ctx->input_filename) + fclose(f);*/ + + +} + +void error_parser_set_linguistic_resources_filename(context *ctx) +{ + char absolute_filename[500]; + + if(!ctx->perc_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_MODEL_FILENAME); + ctx->perc_model_filename = strdup(absolute_filename); + } + + if(!ctx->vocabs_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_VOCABS_FILENAME); + ctx->vocabs_filename = strdup(absolute_filename); + } + + if(!ctx->input_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_PATH_RELAT); + strcat(absolute_filename, DEFAULT_MCF_DEV); + ctx->input_filename = strdup(absolute_filename); + } + + if(!ctx->mcd_filename) { + ctx->mcd_struct = mcd_build_wpmlgfs(); + } + + if(!ctx->cff_filename){ + //printf("cff -> stdout\n") + } + + if(!ctx->features_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_FEATURES_MODEL_FILENAME); + ctx->features_model_filename = strdup(absolute_filename); + } + + if(!ctx->f2p_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_F2P_FILENAME); + ctx->f2p_filename = strdup(absolute_filename); + } + + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); + fprintf(stderr, "input_filename = %s\n", ctx->input_filename); + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + FILE *output_file; + + ctx = context_read_options(argc, argv); + //error_parser_set_linguistic_resources_filename(ctx); + ctx->f2p = form2pos_read(ctx->f2p_filename); + maca_error_predictor_parser_mcf2cff_check_options(ctx); + + ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); + ctx->mcd_struct_error = mcd_read(ctx->l_rules_filename, ctx->verbose); + + //error + mcd_extract_dico_from_corpus(ctx->mcd_struct_error, ctx->input_filename); + ctx->vocabs_error = mcd_build_dico_vec(ctx->mcd_struct_error); + + //parser + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + + + //error + ctx->d_perceptron_features_error = dico_new((char *)"d_perceptron_features", 10000000); + ctx->features_model_error = feat_model_read(ctx->fann_filename, feat_lib_build(), ctx->verbose); + + //parser + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + + if(ctx->dico_labels == NULL){ + fprintf(stderr, "cannot find label names\n"); + return 1; + } + + ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3; + + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); + + /* add the feature dictionnary to the dico vector */ + dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); + dico_vec_add(ctx->vocabs_error, ctx->d_perceptron_features_error); + + /* open output file */ + output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout; + + generate_training_file_error(output_file, ctx); + + dico_vec_print(ctx->dnn_model_filename, ctx->vocabs_error); + + if(ctx->cff_filename) + fclose(output_file); + + context_free(ctx); + + return 0; +} + diff --git a/maca_trans_parser/src/maca_error_predictor_tagger_mcf2cff.c b/maca_trans_parser/src/maca_error_predictor_tagger_mcf2cff.c index c7a83a8866e08b2855f613300de156d00426b44f..0fbc2454ef86160dc809a36985474f1a6c3fb27a 100644 --- a/maca_trans_parser/src/maca_error_predictor_tagger_mcf2cff.c +++ b/maca_trans_parser/src/maca_error_predictor_tagger_mcf2cff.c @@ -91,21 +91,21 @@ void generate_error_train(FILE *output_file, context *ctx) { config *config_oracle; feat_vec *fv_oracle = feat_vec_new(feature_types_nb); - FILE *conll_file_oracle = myfopen(ctx->input_filename, "r"); + FILE *mcf_file_oracle = myfopen(ctx->input_filename, "r"); int postag_oracle; word *b0; config *config_predicted; feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); feat_vec *fv_predicted = feat_vec_new(feature_types_nb); - FILE *conll_file_predicted = myfopen(ctx->input_filename, "r"); + FILE *mcf_file_predicted = myfopen(ctx->input_filename, "r"); int postag_predicted; float max; dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); dico *dico_pos_error = dico_vec_get_dico(ctx->vocabs_error, (char *)"POS"); - config_oracle = config_new(conll_file_oracle, ctx->mcd_struct, 5); - config_predicted = config_new(conll_file_predicted, ctx->mcd_struct, 5); + config_oracle = config_new(mcf_file_oracle, ctx->mcd_struct, 5); + config_predicted = config_new(mcf_file_predicted, ctx->mcd_struct, 5); while(!config_is_terminal(config_oracle)){ if(ctx->f2p){ @@ -166,8 +166,8 @@ void generate_error_train(FILE *output_file, context *ctx) config_free(config_oracle); config_free(config_predicted); - fclose(conll_file_oracle); - fclose(conll_file_predicted); + fclose(mcf_file_oracle); + fclose(mcf_file_predicted); } @@ -273,7 +273,7 @@ int main(int argc, char *argv[]) if(ctx->cff_filename) fclose(output_file); - //context_free(ctx); + context_free(ctx); return 0; } diff --git a/maca_trans_parser/src/maca_trans_parser.c b/maca_trans_parser/src/maca_trans_parser.c index 390ac1c49391e76e25511d9ec81e76173fbc2ff3..cdb1c998e8404fa81786be3c8a331c7841e41c74 100644 --- a/maca_trans_parser/src/maca_trans_parser.c +++ b/maca_trans_parser/src/maca_trans_parser.c @@ -41,8 +41,6 @@ void maca_trans_parser_check_options(context *ctx){ } } - - void set_linguistic_resources_filenames_parser(context *ctx) { char absolute_filename[500]; diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager_error_predictor.c b/maca_trans_parser/src/simple_decoder_parser_arc_eager_error_predictor.c new file mode 100644 index 0000000000000000000000000000000000000000..9adb4e659063422db8de1316a441305c7c15eb6f --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager_error_predictor.c @@ -0,0 +1,228 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"movement_parser_arc_eager.h" +#include"feat_fct.h" +#include"config2feat_vec.h" +#include"feature_table.h" +#include"dico.h" + + +void print_word_buffer(config *c, dico *dico_labels, mcd *mcd_struct) +{ + int i; + word *w; + char *label; + char *buffer = NULL; + char *token = NULL; + int col_nb = 0; + + + for(i=0; i < config_get_buffer(c)->nbelem; i++){ + w = word_buffer_get_word_n(config_get_buffer(c), i); + + if((mcd_get_gov_col(mcd_struct) == -1) + && (mcd_get_label_col(mcd_struct) == -1) + && (mcd_get_sent_seg_col(mcd_struct) == -1)){ + printf("%s\t", word_get_input(w)); + printf("%d\t", word_get_gov(w)); + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + printf("%s\t", label) ; + else + printf("_\t"); + if(word_get_sent_seg(w) == 1) + printf("1\n") ; + else + printf("0\n"); + } + else{ + buffer = strdup(w->input); + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) printf("\t"); + if(col_nb == mcd_get_gov_col(mcd_struct)){ + printf("%d", word_get_gov(w)); + } + else + if(col_nb == mcd_get_label_col(mcd_struct)){ + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + printf("%s", label) ; + else + printf("_"); + } + else + if(col_nb == mcd_get_sent_seg_col(mcd_struct)){ + if(word_get_sent_seg(w) == 1) + printf("1") ; + else + printf("0"); + } + else{ + word_print_col_n(stdout, w, col_nb); + } + col_nb++; + token = strtok(NULL, "\t"); + } + if((col_nb <= mcd_get_gov_col(mcd_struct)) || (mcd_get_gov_col(mcd_struct) == -1)){ + printf("\t%d", word_get_gov(w)); + } + if((col_nb <= mcd_get_label_col(mcd_struct)) || (mcd_get_label_col(mcd_struct) == -1)){ + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + printf("\t%s", label) ; + else + printf("\t_"); + } + if((col_nb <= mcd_get_sent_seg_col(mcd_struct)) || (mcd_get_sent_seg_col(mcd_struct) == -1)){ + if(word_get_sent_seg(w) == 1) + printf("\t1") ; + else + printf("\t0"); + } + printf("\n"); + free(buffer); + } + } +} + +void simple_decoder_parser_arc_eager_error_predictor(context *ctx, char *perc_error_filename) +{ + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + feature_table *ft_error = feature_table_load(perc_error_filename, ctx->verbose); + + feat_vec *fv = feat_vec_new(feature_types_nb); + feat_vec *fv_error = feat_vec_new(feature_types_nb); + + int root_label; + int mvt_code; + int mvt_type; + int mvt_label; + + float max; + float max_err; + + int error_detect; + + config *c = NULL; + int result; + + int argmax1, argmax2; + float max1, max2; + int index; + + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); + if(root_label == -1) root_label = 0; + + c = config_new(f, ctx->mcd_struct, 5); + while(!config_is_terminal(c)){ + + /* forced EOS (the element on the top of the stack is eos, but the preceding movement is not MVT_PARSER_EOS */ + /* which means that the top of the stack got its eos status from input */ + /* force the parser to finish parsing the sentence (perform all pending reduce actions) and determine root of the sentence */ + + if((word_get_sent_seg(stack_top(config_get_stack(c))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(c))) != MVT_PARSER_EOS)){ + word_set_sent_seg(stack_top(config_get_stack(c)), -1); + movement_parser_eos(c); + while(movement_parser_reduce(c)); + while(movement_parser_root(c, root_label)); + if(ctx->debug_mode) printf("force EOS\n"); + } + + /* normal behaviour, ask classifier what is the next movement to do and do it */ + else{ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + mvt_code = feature_table_argmax(fv, ft, &max); + + if(ctx->debug_mode){ + fprintf(stdout, "***********************************\n"); + config_print(stdout, c); + } + + if(ctx->debug_mode){ + fprintf(stdout, " ***Parser choice***\n"); + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + for(int i=0; i < 3; i++){ + printf(" %d\t", i); + movement_parser_print(stdout, vcode_array[i].class_code, ctx->dico_labels); + printf("\t%.4f\n", vcode_array[i].score); + } + free(vcode_array); + } + + mvt_type = movement_parser_type(mvt_code); + mvt_label = movement_parser_label(mvt_code); + + + config2feat_vec_cff(ctx->features_model_error, c, ctx->d_perceptron_features_error, fv_error, LOOKUP_MODE); + error_detect = feature_table_argmax(fv_error, ft_error, &max_err); + + if(ctx->debug_mode){ + fprintf(stdout, " ***Error detection***\n"); + vcode *vcode_array_err = feature_table_get_vcode_array(fv_error, ft_error); + for(int i=0; i < 2; i++){ + fprintf(stdout, " %d\t", i); + fprintf(stdout, "%d\t%.4f\n", vcode_array_err[i].class_code, vcode_array_err[i].score); + } + free(vcode_array_err); + } + + if((mvt_type == MVT_PARSER_EOS) && (word_get_sent_seg(stack_top(config_get_stack(c))) == 0)){ + if(ctx->verbose) + fprintf(stderr, "the classifier did predict EOS but this is not the case\n"); + feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2); + mvt_code = argmax2; + mvt_type = movement_parser_type(mvt_code); + mvt_label = movement_parser_label(mvt_code); + + } + + result = 0; + switch(mvt_type){ + case MVT_PARSER_LEFT : + result = movement_parser_left_arc(c, mvt_label); + break; + case MVT_PARSER_RIGHT: + result = movement_parser_right_arc(c, mvt_label); + break; + case MVT_PARSER_REDUCE: + result = movement_parser_reduce(c); + break; + case MVT_PARSER_ROOT: + result = movement_parser_root(c, root_label); + break; + case MVT_PARSER_EOS: + result = movement_parser_eos(c); + break; + case MVT_PARSER_SHIFT: + result = movement_parser_shift(c); + } + + if(result == 0){ + if(ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n"); + result = movement_parser_shift(c); + if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! */ + if(ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n"); + while(!stack_is_empty(config_get_stack(c))) + movement_parser_root(c, root_label); + } + } + } + } + + if(!ctx->trace_mode) + print_word_buffer(c, ctx->dico_labels, ctx->mcd_struct); + + config_free(c); + feat_vec_free(fv); + feature_table_free(ft); + if(ctx->input_filename) + fclose(f); +} diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager_error_predictor.h b/maca_trans_parser/src/simple_decoder_parser_arc_eager_error_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..ab10b4056dc4f275f13f6ec7d06d554f48bdc77a --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager_error_predictor.h @@ -0,0 +1,7 @@ +#ifndef __SIMPLE_DECODER_PARSER_ARC_EAGER_ERROR_PREDICTOR__ +#define __SIMPLE_DECODER_PARSER_ARC_EAGER_ERROR_PREDICTOR__ +#include"context.h" + +void simple_decoder_parser_arc_eager_error_predictor(context *ctx, char *perc_error_filename); + +#endif diff --git a/maca_trans_parser/src/simple_decoder_tagger_error_predictor.c b/maca_trans_parser/src/simple_decoder_tagger_error_predictor.c index 01dd5fa4179f53db0369f75c7d2a2a1a90678007..bc864f38efba80d0b399ee55c78184b2fbfeb5ab 100644 --- a/maca_trans_parser/src/simple_decoder_tagger_error_predictor.c +++ b/maca_trans_parser/src/simple_decoder_tagger_error_predictor.c @@ -173,7 +173,7 @@ void simple_decoder_tagger_error_predictor(context *ctx, char *perc_error_filena if(ctx->debug_mode) { switch (error_detect) { case 0 : // No errors detected - sprintf(impr[nb]+strlen(impr[nb]),"\t✔\t0"); + sprintf(impr[nb]+strlen(impr[nb]),"\t✔\t_"); break; case 1 : @@ -191,18 +191,19 @@ void simple_decoder_tagger_error_predictor(context *ctx, char *perc_error_filena } sprintf(impr[nb]+strlen(impr[nb]),"\n"); nb +=1; - + + vcode *vcode_array_err = feature_table_get_vcode_array(fv_error, ft_error); + if(ctx->debug_mode){ fprintf(stdout, " ***Error detection***\n"); - vcode *vcode_array_err = feature_table_get_vcode_array(fv_error, ft_error); for(int i=0; i < 4; i++){ fprintf(stdout, " %d\t", i); fprintf(stdout, " %d\t%.4f\n", vcode_array_err[i].class_code, vcode_array_err[i].score); } - free(vcode_array_err); } + free(vcode_array_err); - if (error_detect == 3) { + if (error_detect == 3){// && (vcode_array_err[0].score-vcode_array_err[1].score)>2.5) { backward(c); backward(c); nb -= 3; @@ -235,7 +236,61 @@ void simple_decoder_tagger_error_predictor(context *ctx, char *perc_error_filena if(ctx->debug_mode){ vcode *vcode_arraye = feature_table_get_vcode_array(fv, ft); - for(int i=debug_choice; i < debug_choice+3; i++){//postag_err+3; i++){ + for(int i=debug_choice-1; i < debug_choice+2; i++){//postag_err+3; i++){ + fprintf(stdout, "%d\t", i); + fprintf(stdout, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_arraye[i].class_code), vcode_arraye[i].score); + } + free(vcode_arraye); + fprintf(stderr, "Ancien pos : %s, nouveau : %s\n", dico_int2string(dico_pos, postag_err), dico_int2string(dico_pos, postag)); + } + + if(postag==postag_err) + { + printf("ERROR PREDICTOR, NO CHOICE LEFT\n"); + exit(1); + } + word_set_pos(word_buffer_b0(c->bf), postag); + string_print_word(word_buffer_b0(c->bf), ctx->mcd_struct, dico_pos, postag,&impr[nb]); + if(ctx->debug_mode) + sprintf(impr[nb]+strlen(impr[nb]),"\t✐\t_\n"); + else + sprintf(impr[nb]+strlen(impr[nb]),"\n"); + nb += 1; + } + + else if (error_detect == 2 && ctx->force) { + backward(c); + nb -= 2; + + if(ctx->f2p) + add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); + + postag_err = word_get_pos(word_buffer_b0(c->bf)); + + postag = postag_err; + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + + int debug_choice; + for(int i=0; i < ft->classes_nb-1; i++){ + if (postag_err == vcode_array[i].class_code) { + postag = vcode_array[i+1].class_code; + debug_choice = i+1; + break; + } + } + + free(vcode_array); + + if(ctx->debug_mode){ + fprintf(stderr, "***********************************\n"); + config_print(stderr, c); + } + + if(ctx->debug_mode){ + vcode *vcode_arraye = feature_table_get_vcode_array(fv, ft); + for(int i=debug_choice-1; i < debug_choice+2; i++){//postag_err+3; i++){ fprintf(stdout, "%d\t", i); fprintf(stdout, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_arraye[i].class_code), vcode_arraye[i].score); }