From 568cf30a95f4e02efeefe9dd272aceebf355245d Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Sat, 29 Jul 2017 11:24:59 +0200 Subject: [PATCH] modified maca_trans_lemmatizer so that it works when no morphological rules are supplied, in which case it only uses the exception lexicon --- maca_trans_parser/src/maca_trans_lemmatizer.c | 75 +++++++++++-------- .../src/maca_trans_lemmatizer_mcf2cff.c | 25 ++++--- perceptron/exec/perceptron_train.c | 12 +-- perceptron/lib/src/cf_file.c | 3 + 4 files changed, 68 insertions(+), 47 deletions(-) diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c index 8970bf5..645be94 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer.c @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) int l_rule_code; char *l_rule; float max; - + feature_table *ft = NULL; maca_lemmatizer_check_options(ctx); maca_lemmatizer_set_linguistic_resources_filenames(ctx); @@ -131,8 +131,17 @@ int main(int argc, char *argv[]) ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); - ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); - feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + if(d_l_rules->nbelem){ + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + } + else{ + if(ctx->verbose) + fprintf(stderr, "no morphological rules loaded\n"); + ctx->d_perceptron_features = NULL; + ft = NULL; + } + c = config_new(f, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ @@ -153,37 +162,41 @@ int main(int argc, char *argv[]) } // if lemma is not found in exception file, predict an l_rule else{ - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - // feat_vec_print_string(fv, ctx->d_perceptron_features); - // feat_vec_print(stdout, fv); - - vcode *vcode_array = feature_table_get_vcode_array(fv, ft); - if(ctx->debug_mode){ - for(int i=0; i < 10; i++){ + if(ft == NULL){ /* no rule model just print the form as a lemma */ + print_word(b0, ctx->mcd_struct, form); + } + else{ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + // feat_vec_print_string(fv, ctx->d_perceptron_features); + // feat_vec_print(stdout, fv); + + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + if(ctx->debug_mode){ + for(int i=0; i < 10; i++){ + l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code); + fprintf(stderr, "%d", i); + if(l_rule_is_applicable(form, l_rule)) fprintf(stderr, "*"); + fprintf(stderr, "\t%s\t%.4f\n", l_rule, vcode_array[i].score); + } + } + int i; + for(i=0; i < 10; i++){ l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code); - fprintf(stderr, "%d", i); - if(l_rule_is_applicable(form, l_rule)) fprintf(stderr, "*"); - fprintf(stderr, "\t%s\t%.4f\n", l_rule, vcode_array[i].score); + if(l_rule_is_applicable(form, l_rule)){ + char *transformed_lemma = apply_l_rule(form, l_rule); + // printf("transformed_lemma = %s\n", transformed_lemma); + // print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma)); + print_word(b0, ctx->mcd_struct, transformed_lemma); + free(transformed_lemma); + break; + } } - } - int i; - for(i=0; i < 10; i++){ - l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code); - if(l_rule_is_applicable(form, l_rule)){ - char *transformed_lemma = apply_l_rule(form, l_rule); - // printf("transformed_lemma = %s\n", transformed_lemma); - // print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma)); - print_word(b0, ctx->mcd_struct, transformed_lemma); - free(transformed_lemma); - break; + /* no rule applied */ + if(i == 10){ + print_word(b0, ctx->mcd_struct, form); } + free(vcode_array); } - /* no rule applied */ - if(i == 10){ - print_word(b0, ctx->mcd_struct, form); - } - - free(vcode_array); } } word_buffer_move_right(c->bf); @@ -192,7 +205,7 @@ int main(int argc, char *argv[]) if (ctx->input_filename) fclose(f); context_free(ctx); fplm_free(exceptions); - feature_table_free(ft); + if(ft) feature_table_free(ft); return 0; } diff --git a/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c index 8458683..1b4dd5f 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c @@ -171,15 +171,17 @@ int main(int argc, char *argv[]) fplm_struct *exceptions; ctx = context_read_options(argc, argv); - - - - // decode_lemmatizer_set_linguistic_resources_filenames(ctx); maca_trans_lemmatizer_mcf2cff_check_options(ctx); exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose); d_l_rules = dico_read(ctx->l_rules_filename, 0.5); + if(d_l_rules->nbelem == 0){ + /* do not produce cff file when the rule file is empty */ + /* exit(1);*/ + } + + ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); if(ctx->mode == TRAIN_MODE){ @@ -203,22 +205,25 @@ int main(int argc, char *argv[]) /* add the feature dictionnary to the dico vector */ dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); + /* open output file */ if(ctx->cff_filename) output_file = myfopen(ctx->cff_filename, "w"); else output_file = stdout; - generate_training_file(output_file, ctx, d_l_rules, exceptions); - + if(d_l_rules->nbelem) + generate_training_file(output_file, ctx, d_l_rules, exceptions); + + + if(ctx->cff_filename) + fclose(output_file); + if(ctx->mode == TRAIN_MODE){ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ dico_vec_print(ctx->vocabs_filename, ctx->vocabs); - } - - if(ctx->cff_filename) - fclose(output_file); + context_free(ctx); return 0; } diff --git a/perceptron/exec/perceptron_train.c b/perceptron/exec/perceptron_train.c index ba5b918..d4db2e9 100644 --- a/perceptron/exec/perceptron_train.c +++ b/perceptron/exec/perceptron_train.c @@ -35,12 +35,12 @@ int main(int argc, char *argv[]) train_check_options(ctx); look_for_number_of_features_and_classes(ctx->cff_filename, &nb_feat, &nb_class); - - ft = feature_table_new(nb_feat, nb_class); - fprintf(stderr, "table allocated (%d x %d)\n", nb_feat, nb_class); - perceptron_avg(ctx->cff_filename, ft, ctx->iteration_nb); - feature_table_dump(ctx->perc_model_filename, ft); - + if(nb_class > 1){ + ft = feature_table_new(nb_feat, nb_class); + fprintf(stderr, "table allocated (%d x %d)\n", nb_feat, nb_class); + perceptron_avg(ctx->cff_filename, ft, ctx->iteration_nb); + feature_table_dump(ctx->perc_model_filename, ft); + } perceptron_context_free(ctx); return 0; diff --git a/perceptron/lib/src/cf_file.c b/perceptron/lib/src/cf_file.c index 0114a3d..908d405 100644 --- a/perceptron/lib/src/cf_file.c +++ b/perceptron/lib/src/cf_file.c @@ -56,8 +56,11 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int FILE *f = fopen(filename, "r"); char *token; int nb; + *max_feat = 0; *max_class = 0; + if(f == NULL) + return; while(fgets(buffer, 10000, f)){ buffer[strlen(buffer) - 1] = '\0'; token = strtok(buffer, "\t"); -- GitLab