diff --git a/maca_common/src/word.c b/maca_common/src/word.c index d4c01a61d27af0ee0ad48cecca8a00290bf6c0d6..5890e28e9722d7c68f5ee43f69a841dff980bbc2 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -35,7 +35,7 @@ word *word_read(FILE *f, mcd *mcd_struct) /* look for a valid word */ while(fgets(buffer, 10000, f)){ - /* printf("buffer = %s\n", buffer); */ + // printf("buffer = %s\n", buffer); /* ignore empty lines */ if((buffer[0] == '\n')) continue; /* lines beginning with ## are comments */ @@ -63,6 +63,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) /* if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){ */ if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1)){ w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col); + // printf("col = %d val = %d\n", col, w->wf_array[mcd_struct->wf[col]]); } if(mcd_struct->wf[col] == MCD_WF_FORM){ w->form = strdup(token); diff --git a/maca_trans_parser/src/config2feat_vec.c b/maca_trans_parser/src/config2feat_vec.c index 362125b7c06a3751e5fcf665b3d0423f7228c877..07165c41e451908c8494b9fb50d24eee925ee798 100644 --- a/maca_trans_parser/src/config2feat_vec.c +++ b/maca_trans_parser/src/config2feat_vec.c @@ -28,9 +28,10 @@ int get_feat_value_cff(feat_model *fm, config *c, dico *dico_features, int feat_ } if(mode == LOOKUP_MODE){ - if(fm->string) - /* printf("fmstring = %s\n", fm->string); */ + if(fm->string){ + // printf("fmstring = %s\n", fm->string); return dico_string2int(dico_features, fm->string); + } } return dico_add(dico_features, fm->string); } @@ -50,8 +51,9 @@ feat_vec *config2feat_vec_cff(feat_model *fm, config *c, dico *dico_features, fe { int i; feat_vec_empty(fv); - for(i=0; i < fm->nbelem; i++) + for(i=0; i < fm->nbelem; i++){ feat_vec_add(fv, get_feat_value_cff(fm, c, dico_features, i, mode)); + } return fv; } diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 2b8b165bd14e28565a1efe46fce3be0500e6f6e0..f375287d31ec4bdd9737fa0573ecdb93633aa024 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -101,7 +101,7 @@ context *context_new(void) ctx->json_filename = NULL; ctx->dnn_model_filename = NULL; - + ctx->l_rules_filename = NULL; return ctx; } @@ -194,7 +194,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[24] = + static struct option long_options[26] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -219,13 +219,15 @@ context *context_read_options(int argc, char *argv[]) {"f2p", required_argument, 0, 'P'}, {"traces", required_argument, 0, 'T'}, {"json", required_argument, 0, 'J'}, - {"dnn_model", required_argument, 0, 'N'} + {"dnn_model", required_argument, 0, 'N'}, + {"l_rules", required_argument, 0, 'l'}, + {"fplm", required_argument, 0, 'w'} }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:J:N:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:J:N:w:l:", long_options, &option_index)) != -1){ switch (c) { case 'h': @@ -255,6 +257,9 @@ context *context_read_options(int argc, char *argv[]) case 'x': ctx->cff_filename = strdup(optarg); break; + case 'w': + ctx->fplm_filename = strdup(optarg); + break; case 'u': ctx->feature_cutoff = atoi(optarg); break; @@ -270,6 +275,9 @@ context *context_read_options(int argc, char *argv[]) case 'f': ctx->fann_filename = strdup(optarg); break; + case 'l': + ctx->l_rules_filename = strdup(optarg); + break; case 's': ctx->sent_nb = atoi(optarg); break; diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 752d7604488818ef4a6aa109065c88317ebbfa0a..9b1b6fed317f1f07e9aab9c762dbd9d1d6e153b1 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -14,6 +14,13 @@ #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" +#define DEFAULT_MULTI_COL_DESC_LEMMATIZER_FILENAME "maca_trans_lemmatizer.mcd" +#define DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME "maca_trans_lemmatizer.fm" +#define DEFAULT_VOCABS_LEMMATIZER_FILENAME "maca_trans_lemmatizer.vocab" +#define DEFAULT_MODEL_LEMMATIZER_FILENAME "maca_trans_lemmatizer.model" +#define DEFAULT_RULES_LEMMATIZER_FILENAME "maca_trans_lemmatizer_rules.txt" +#define DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME "maca_trans_lemmatizer_exceptions.fplm" + #define DEFAULT_MULTI_COL_DESC_MORPHO_FILENAME "maca_trans_morpho.mcd" #define DEFAULT_FEATURES_MODEL_MORPHO_FILENAME "maca_trans_morpho.fm" #define DEFAULT_VOCABS_MORPHO_FILENAME "maca_trans_morpho.vocab" @@ -82,6 +89,7 @@ typedef struct { char *json_filename; char *dnn_model_filename; + char *l_rules_filename; } context; diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c index c8097da1195fda0d09d8101e878506cbf52418c9..f6e9803cc28a5cd38ba6cf2b6c18ad176dcaf44a 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer.c @@ -9,6 +9,8 @@ #include"dico.h" #include"config.h" #include"fplm.h" +#include"l_rule.h" +#include"config2feat_vec.h" void maca_lemmatizer_help_message(context *ctx) { @@ -20,29 +22,58 @@ void maca_lemmatizer_help_message(context *ctx) context_mcd_help_message(ctx); } -void maca_lemmatizer_check_options(context *ctx){ - if(ctx->help - ){ - maca_lemmatizer_help_message(ctx); - exit(1); - } -} - void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) { char absolute_filename[500]; + + if(!ctx->perc_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_MODEL_LEMMATIZER_FILENAME); + ctx->perc_model_filename = strdup(absolute_filename); + } + + if(!ctx->vocabs_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_VOCABS_LEMMATIZER_FILENAME); + ctx->vocabs_filename = strdup(absolute_filename); + } + + if(!ctx->features_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME); + ctx->features_model_filename = strdup(absolute_filename); + } if(!ctx->fplm_filename){ strcpy(absolute_filename, ctx->maca_data_path); - strcat(absolute_filename, DEFAULT_FPLM_FILENAME); + strcat(absolute_filename, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME); ctx->fplm_filename = strdup(absolute_filename); } + if(!ctx->l_rules_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_RULES_LEMMATIZER_FILENAME); + ctx->l_rules_filename = strdup(absolute_filename); + } + + if(ctx->verbose){ - fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename); + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "rules filename = %s\n", ctx->l_rules_filename); + fprintf(stderr, "exceptions filename = %s\n", ctx->fplm_filename); } } +void maca_lemmatizer_check_options(context *ctx){ + if(ctx->help + ){ + maca_lemmatizer_help_message(ctx); + exit(1); + } +} /* a bit messy */ void print_word(word *w, mcd *mcd_struct, char *lemma) @@ -76,6 +107,90 @@ void print_word(word *w, mcd *mcd_struct, char *lemma) int main(int argc, char *argv[]) +{ + context *ctx = context_read_options(argc, argv); + feat_vec *fv = feat_vec_new(10); + word *b0; + char lemma[200]; + char form[200]; + char pos[200]; + char *lemma_from_fplm; + config *c; + int l_rule_code; + char *l_rule; + float max; + + + maca_lemmatizer_check_options(ctx); + maca_lemmatizer_set_linguistic_resources_filenames(ctx); + + dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5); + fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode); + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + c = config_new(f, ctx->mcd_struct, 5); + + while(!config_is_terminal(c)){ + b0 = word_buffer_b0(c->bf); + word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct)); + word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct)); + word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct)); + // fprintf(stderr, "form = %s pos = %s lemma = %s\n", b0->form, pos, lemma); + + // if lemma is not specified in input it is looked up in exceptions file + if(strlen(lemma) && strcmp(lemma, "_")) + print_word(b0, ctx->mcd_struct, lemma); + else{ + lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose); + if(lemma_from_fplm){ + // printf("lemma %s found in exceptions file\n", lemma_from_fplm); + print_word(b0, ctx->mcd_struct, lemma_from_fplm); + } + // if lemma is not found in exception file, predict an l_rule + else{ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + // feat_vec_print_string(fv, ctx->d_perceptron_features); + // feat_vec_print(stdout, fv); + + l_rule_code = feature_table_argmax(fv, ft, &max); + //fprintf(stderr, "lrule code %d predicted\n", l_rule_code); + l_rule = dico_int2string(d_l_rules, l_rule_code); + // printf("lrule %s predicted\n", l_rule); + char *transformed_lemma = apply_l_rule(form, l_rule); + + // printf("transformed_lemma = %s\n", transformed_lemma); + print_word(b0, ctx->mcd_struct, transformed_lemma); + + free(transformed_lemma); + if(ctx->debug_mode){ + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + for(int i=0; i < 10; i++){ + fprintf(stderr, "%d\t", i); + fprintf(stderr, "%s\t%.4f\n", dico_int2string(d_l_rules, vcode_array[i].class_code), vcode_array[i].score); + } + free(vcode_array); + } + } + } + word_buffer_move_right(c->bf); + } + config_free(c); + if (ctx->input_filename) fclose(f); + context_free(ctx); + fplm_free(exceptions); + feature_table_free(ft); + + + return 0; +} + +#if 0 + int main(int argc, char *argv[]) { context *ctx = context_read_options(argc, argv); word *b0; @@ -118,3 +233,4 @@ int main(int argc, char *argv[]) return 0; } +#endif diff --git a/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c index c90e3f3aeab8847445b3cd83b64d37c6dd6fab02..296476e412753dd46649f209d877517aa3e52c77 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c @@ -37,6 +37,54 @@ int movement_lemmatizer(config *c, int feats) return 1; } +void decode_lemmatizer_set_linguistic_resources_filenames(context *ctx) +{ + char absolute_filename[500]; + + if(!ctx->perc_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_MODEL_LEMMATIZER_FILENAME); + ctx->perc_model_filename = strdup(absolute_filename); + } + + if(!ctx->vocabs_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_VOCABS_LEMMATIZER_FILENAME); + ctx->vocabs_filename = strdup(absolute_filename); + } + + if(!ctx->features_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_FEATURES_MODEL_LEMMATIZER_FILENAME); + ctx->features_model_filename = strdup(absolute_filename); + } + + if(!ctx->fplm_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_EXCEPTIONS_LEMMATIZER_FILENAME); + ctx->fplm_filename = strdup(absolute_filename); + } + + if(!ctx->l_rules_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_RULES_LEMMATIZER_FILENAME); + ctx->l_rules_filename = strdup(absolute_filename); + } + + + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + + fprintf(stderr, "rules filename = %s\n", ctx->l_rules_filename); + fprintf(stderr, "exceptions filename = %s\n", ctx->fplm_filename); + + + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); + } +} void maca_trans_lemmatizer_mcf2cff_help_message(context *ctx) { @@ -90,9 +138,9 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct)); word_sprint_col_n(pos, word_buffer_b0(c->bf), mcd_get_pos_col(ctx->mcd_struct)); - // printf("form = %s pos = %s lemma = %s\n", form, pos, lemma); + //printf("form = %s pos = %s lemma = %s\n", form, pos, lemma); - lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, 0); + lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose); if(lemma_from_fplm){ // printf("exception\n"); @@ -102,17 +150,18 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp l_rule = compute_l_rule(lemma, form, strict); - //printf("rule = %s\n", l_rule); + // printf("rule = %s\n", l_rule); l_rule_code = dico_string2int(d_l_rules, l_rule); - free(l_rule); if(l_rule_code != -1){ - // fprintf(stdout, "rule exists\n"); - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - fprintf(output_file, "%d", l_rule_code); - feat_vec_print(output_file, fv); + // if(strcmp(l_rule, "@@")){ + // fprintf(stdout, "rule exists\n"); + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + fprintf(output_file, "%d", l_rule_code); + feat_vec_print(output_file, fv); + // } } else{ // fprintf(stdout, "rule does not exist\n"); @@ -120,6 +169,8 @@ void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fp } word_buffer_move_right(c->bf); // movement_lemmatizer(c, l_rule); + free(l_rule); + } } @@ -130,11 +181,11 @@ int main(int argc, char *argv[]) dico *d_l_rules; fplm_struct *exceptions; ctx = context_read_options(argc, argv); - maca_trans_lemmatizer_mcf2cff_check_options(ctx); - // exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose); - exceptions = fplm_load_file((char *)"exceptions.fplm", ctx->verbose); - d_l_rules = dico_read((char *)"rules", 0.5); + // decode_lemmatizer_set_linguistic_resources_filenames(ctx); + maca_trans_lemmatizer_mcf2cff_check_options(ctx); + exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose); + d_l_rules = dico_read(ctx->l_rules_filename, 0.5); ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);