Make maca_error_predictor_tagger configurable instead of hard-coded.

* context.h: add macros for the default treebank directory and the
  default conll07 dev/train/test file names.
* maca_error_predictor_tagger.c:
  - add error_tagger_set_linguistic_resources_filename(), which fills in
    every resource file name that was not given on the command line,
    using ctx->maca_data_path as prefix;
  - drop the hard-coded /home/... paths from main() and re-enable
    context_free();
  - show the help message when ctx->help is set;
  - print the oracle/predicted diagnostics only when ctx->debug_mode is set.

Review fixes applied to the added lines (hunk line counts are unchanged):
  - the verbose "input_filename" line printed ctx->f2p_filename; it now
    prints ctx->input_filename;
  - ctx->mcd_filename is given no default by the new helper, so the verbose
    print no longer passes a possibly-NULL pointer to "%s";
  - the help check now runs before form2pos_read(), so asking for help does
    not first load the f2p resource.

NOTE(review): this patch text was recovered from a whitespace-collapsed
copy. Indentation, the position of blank context lines and whitespace-only
changes are reconstructed; regenerate the patch from the tree if it does
not apply cleanly.

diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h
index 9b1b6fed317f1f07e9aab9c762dbd9d1d6e153b1..13cd87663a44951f1e8e3900d4328ae79b2cfb6a 100644
--- a/maca_trans_parser/src/context.h
+++ b/maca_trans_parser/src/context.h
@@ -37,7 +37,10 @@
 #define DEFAULT_MODEL_PARSER_NN_FILENAME "maca_trans_parser_nn.weights"
 #define DEFAULT_JSON_PARSER_NN_FILENAME "maca_trans_parser_nn.json"
 
-
+#define DEFAULT_PATH_RELAT "../data/treebank/"
+#define DEFAULT_CONLL07_DEV "dev.conll07"
+#define DEFAULT_CONLL07_TRAIN "train.conll07"
+#define DEFAULT_CONLL07_TEST "test.conll07"
 #define DEFAULT_F2P_FILENAME "fP"
 #define DEFAULT_FPLM_FILENAME "fplm"
 
diff --git a/maca_trans_parser/src/maca_error_predictor_tagger.c b/maca_trans_parser/src/maca_error_predictor_tagger.c
index f0ce5148463743607d876639df2a712a6a911e2a..68b4fb6a3903166018d576e5d3c265b1769e9fef 100644
--- a/maca_trans_parser/src/maca_error_predictor_tagger.c
+++ b/maca_trans_parser/src/maca_error_predictor_tagger.c
@@ -54,7 +54,7 @@ void add_signature_to_words_in_word_buffer_tagger(word_buffer *bf, form2pos *f2p
   int i;
   word *w;
   char lower_form[1000];
-
+
   for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){
     w = word_buffer_get_word_n(bf, i);
     if(word_get_signature(w) != -1) break;
@@ -72,29 +72,24 @@ void add_signature_to_words_in_word_buffer_tagger(word_buffer *bf, form2pos *f2p
 void maca_error_predictor_help_message(context *ctx)
 {
   context_general_help_message(ctx);
-  context_mode_help_message(ctx);
+  //context_mode_help_message(ctx);
   context_sent_nb_help_message(ctx);
-  context_mcd_help_message(ctx);
-
+  //context_mcd_help_message(ctx);
   fprintf(stderr, "INPUT\n");
-  context_conll_help_message(ctx);
-  fprintf(stderr, "IN TEST MODE\n");
+  fprintf(stderr, "\t-i --input <file> : input is in conll07 format (default is dev.conll07)\n");
+  //fprintf(stderr, "IN TEST MODE\n");
   context_vocabs_help_message(ctx);
 
   fprintf(stderr, "OUTPUT\n");
-  context_cff_help_message(ctx);
-  fprintf(stderr, "IN TRAIN MODE\n");
-  context_vocabs_help_message(ctx);
+  fprintf(stderr, "\t-x --cff <file> : CFF format file name (default is stdout)\n");
+  //fprintf(stderr, "IN TRAIN MODE\n");
+  //context_vocabs_help_message(ctx);
 }
 
 
 void maca_error_predictor_check_options(context *ctx)
 {
-  if(0 /*!ctx->input_filename
-     || ctx->help
-     / || !ctx->mcd_filename /
-     || !(ctx->cff_filename || ctx->fann_filename)
-     */){
+  if(ctx->help){
     maca_error_predictor_help_message(ctx);
     exit(1);
   }
@@ -106,71 +101,75 @@ int config_is_equal_tagger(config *c1, config *c2)
   return ((bm1p(c1)==bm1p(c2))&&(bm2p(c1)==bm2p(c2))&&(bm3p(c1)==bm3p(c2)));
 }
 
+
 void generate_error_train(FILE *output_file, context *ctx)
 {
   config *config_oracle;
-  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
   feat_vec *fv_oracle = feat_vec_new(feature_types_nb);
   FILE *conll_file_oracle = myfopen(ctx->input_filename, "r");
   int postag_oracle;
-  float max;
   word *b0;
-  dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
 
   config *config_predicted;
+  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
   feat_vec *fv_predicted = feat_vec_new(feature_types_nb);
   FILE *conll_file_predicted = myfopen(ctx->input_filename, "r");
   int postag_predicted;
-
+  float max;
+  dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
+
   config_oracle = config_new(conll_file_oracle, ctx->mcd_struct, 5);
   config_predicted = config_new(conll_file_predicted, ctx->mcd_struct, 5);
 
   while(!config_is_terminal(config_oracle)){
-    /*if(ctx->f2p){
-      add_signature_to_words_in_word_buffer_tagger(config_oracle->bf, ctx->f2p);
+    if(ctx->f2p){
       add_signature_to_words_in_word_buffer_tagger(config_predicted->bf, ctx->f2p);
-    }*/
+      add_signature_to_words_in_word_buffer_tagger(config_oracle->bf, ctx->f2p);
+    }
 
     // oracle
     config2feat_vec_cff(ctx->features_model, config_oracle, ctx->d_perceptron_features, fv_oracle, LOOKUP_MODE);
     postag_oracle = oracle_tagger(config_oracle);
-    printf("Oracle : ");
-    print_word_simple(word_buffer_b0(config_oracle->bf), ctx->mcd_struct, dico_pos, postag_oracle);
+
+    if(ctx->debug_mode){
+      printf("Oracle : ");
+      print_word_simple(word_buffer_b0(config_oracle->bf), ctx->mcd_struct, dico_pos, postag_oracle);
+    }
 
     // predicted
     b0 = word_buffer_b0(config_predicted->bf);
     config2feat_vec_cff(ctx->features_model, config_predicted, ctx->d_perceptron_features, fv_predicted, LOOKUP_MODE);
     postag_predicted = feature_table_argmax(fv_predicted, ft, &max);
-
-    printf("Predicted : ");
-    print_word_simple(b0, ctx->mcd_struct, dico_pos, postag_predicted);
-
-    if(1){
+
+    if(ctx->debug_mode){
+      printf("Predicted : ");
+      print_word_simple(b0, ctx->mcd_struct, dico_pos, postag_predicted);
+    }
+
+    if(ctx->debug_mode){
       vcode *vcode_array = feature_table_get_vcode_array(fv_predicted, ft);
       for(int i=0; i < 3; i++){
-        fprintf(stdout, "%d\t", i);
-        fprintf(stdout, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
+        printf("%d\t", i);
+        printf("%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
       }
       free(vcode_array);
-    }
+
+      if (postag_oracle!=postag_predicted)
+        fprintf(stdout, "**************** DIFFERENT CHOICE ***********\n\n");
 
-
-
-    if (postag_oracle!=postag_predicted)
-      fprintf(stdout, "**************** DIFFERENT CHOICE ***********\n\n");
+      else
+        fprintf(stdout, "**************** EQUAL CHOICE ***********\n\n");
 
-    else
-      fprintf(stdout, "**************** EQUAL CHOICE ***********\n\n");
-
+    }
     movement_tagger(config_oracle, postag_oracle);
     movement_tagger(config_predicted, postag_predicted);
 
-    fprintf(output_file, "%d", ((config_is_equal_tagger(config_oracle, config_predicted)) ? 1 : 0));
-    fprintf(output_file, " or : %d, pred : %d", postag_oracle,postag_predicted);
-    feat_vec_print(output_file, fv_predicted);
-    //word_set_pos(word_buffer_bm1(config_predicted->bf), postag_oracle);
-
+    if(!ctx->debug_mode || output_file!=stdout) {
+      fprintf(output_file, "%d", ((config_is_equal_tagger(config_oracle, config_predicted)) ? 1 : 0));
+      feat_vec_print(output_file, fv_predicted);
+    }
   }
+
   feat_vec_free(fv_oracle);
   feat_vec_free(fv_predicted);
 
@@ -183,6 +182,54 @@ void generate_error_train(FILE *output_file, context *ctx)
 
 }
 
+void error_tagger_set_linguistic_resources_filename(context *ctx)
+{
+  char absolute_filename[500];
+
+  if(!ctx->perc_model_filename){
+    strcpy(absolute_filename, ctx->maca_data_path);
+    strcat(absolute_filename, DEFAULT_MODEL_TAGGER_FILENAME);
+    ctx->perc_model_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->vocabs_filename){
+    strcpy(absolute_filename, ctx->maca_data_path);
+    strcat(absolute_filename, DEFAULT_VOCABS_TAGGER_FILENAME);
+    ctx->vocabs_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->input_filename){
+    strcpy(absolute_filename, ctx->maca_data_path);
+    strcat(absolute_filename, DEFAULT_PATH_RELAT);
+    strcat(absolute_filename, DEFAULT_CONLL07_DEV);
+    ctx->input_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->cff_filename){
+    //printf("cff -> stdout\n")
+  }
+
+  if(!ctx->features_model_filename){
+    strcpy(absolute_filename, ctx->maca_data_path);
+    strcat(absolute_filename, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);
+    ctx->features_model_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->f2p_filename){
+    strcpy(absolute_filename, ctx->maca_data_path);
+    strcat(absolute_filename, DEFAULT_F2P_FILENAME);
+    ctx->f2p_filename = strdup(absolute_filename);
+  }
+
+  if(ctx->verbose){
+    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
+    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
+    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(none)");
+    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
+    fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
+    fprintf(stderr, "input_filename = %s\n", ctx->input_filename);
+  }
+}
 
 
 int main(int argc, char *argv[])
@@ -191,20 +238,13 @@ int main(int argc, char *argv[])
   FILE *output_file;
 
   ctx = context_read_options(argc, argv);
-  //maca_error_predictor_check_options(ctx);
-
-  ctx->perc_model_filename = "/home/mathis/maca_data2/fr/bin/maca_trans_tagger.model" ;
-  ctx->features_model_filename = "/home/mathis/maca_data2/fr/bin/maca_trans_tagger.fm" ;
-  ctx->input_filename = "/home/mathis/maca_data2/fr/data/treebank/dev.conll07";
-  ctx->f2p_filename = "/home/mathis/maca_data2/fr/bin/fP";
-  ctx->vocabs_filename = "/home/mathis/maca_data2/fr/bin/maca_trans_tagger.vocab";
-  ctx->cff_filename = "/home/mathis/test/stage/error.cff";
-
+  error_tagger_set_linguistic_resources_filename(ctx);
+  maca_error_predictor_check_options(ctx);
+  ctx->f2p = form2pos_read(ctx->f2p_filename);
+
+  /* load ctx */
   ctx->mcd_struct = mcd_build_conll07();
-
-  //decode_tagger_set_linguistic_resources_filenames(ctx);
-  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), 1);//ctx->verbose);
-
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
   ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
   mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
   ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
@@ -219,13 +259,13 @@ int main(int argc, char *argv[])
     output_file = myfopen(ctx->cff_filename, "w");
   else
     output_file = stdout;
-
+
   generate_error_train(output_file,ctx);
 
   if(ctx->cff_filename)
     fclose(output_file);
 
-  //context_free(ctx);
+  context_free(ctx);
 
   return 0;
 }