diff --git a/maca_common/include/l_rule.h b/maca_common/include/l_rule.h
index 6a74b082c5b45f7597ab223d4a201e23bd2ff5e9..efeb8509ff9c4b70a0fd3f856d5b53f1cfc60aac 100644
--- a/maca_common/include/l_rule.h
+++ b/maca_common/include/l_rule.h
@@ -3,5 +3,6 @@
 
 char *apply_l_rule(char *form, char *l_rule);
 char *compute_l_rule(char *lemma, char *form, int strict);
+int l_rule_is_applicable(char *form, char *l_rule);
 
 #endif
diff --git a/maca_common/src/l_rule.c b/maca_common/src/l_rule.c
index ee494398ab5381f645db0276bf8db9ee5534cd9b..0388f859b691645ad34e9d8a03e1a19baf41e94d 100644
--- a/maca_common/src/l_rule.c
+++ b/maca_common/src/l_rule.c
@@ -9,22 +9,27 @@ int l_rule_is_applicable(char *form, char *l_rule)
 {
   int i,j;
   int sep_index;
-  int lemma_suffix_length;
   int form_suffix_length;
   int form_length = strlen(form);
   int l_rule_length = strlen(l_rule);
 
   for(sep_index=1; sep_index < l_rule_length; sep_index++)
     if(l_rule[sep_index] == '@')
       break;
 
-  lemma_suffix_length = l_rule_length - 1 - sep_index;
   form_suffix_length = sep_index - 1;
 
-  for(j=0, i=form_length - form_suffix_length; j < sep_index; i++, j++)
+  /* A form shorter than the suffix the rule expects can never match;
+     returning here also keeps i below from starting negative. */
+  if(form_suffix_length > form_length)
+    return 0;
+
+  /* l_rule[0] is skipped: the form suffix occupies l_rule[1..sep_index-1] */
+  for(j=1, i=form_length - form_suffix_length; j < sep_index; i++, j++){
     if((l_rule[j] != '*') && (l_rule[j] != form[i]))
       return 0;
+  }
   return 1;
 }
 
 char *apply_l_rule(char *form, char *l_rule)
diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c
index f6e9803cc28a5cd38ba6cf2b6c18ad176dcaf44a..8970bf5e75e320cf5ed3cb4309fbb169d9646074 100644
--- a/maca_trans_parser/src/maca_trans_lemmatizer.c
+++ b/maca_trans_parser/src/maca_trans_lemmatizer.c
@@ -121,6 +121,97 @@ int main(int argc, char *argv[])
   float max;
 
 
+  maca_lemmatizer_check_options(ctx);
+  maca_lemmatizer_set_linguistic_resources_filenames(ctx);
+
+  dico *d_l_rules = dico_read(ctx->l_rules_filename, 0.5);
+  fplm_struct *exceptions = fplm_load_file(ctx->fplm_filename, ctx->debug_mode);
+  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+
+  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  c = config_new(f, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    b0 = word_buffer_b0(c->bf);
+    word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
+    word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
+    word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
+    // fprintf(stderr, "form = %s pos = %s lemma = %s\n", b0->form, pos, lemma);
+
+    // if lemma is not specified in input it is looked up in exceptions file
+    if(strlen(lemma) && strcmp(lemma, "_"))
+      print_word(b0, ctx->mcd_struct, lemma);
+    else{
+      lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
+      if(lemma_from_fplm){
+        // printf("lemma %s found in exceptions file\n", lemma_from_fplm);
+        print_word(b0, ctx->mcd_struct, lemma_from_fplm);
+      }
+      // if lemma is not found in exception file, predict an l_rule
+      else{
+        config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+        // feat_vec_print_string(fv, ctx->d_perceptron_features);
+        // feat_vec_print(stdout, fv);
+
+        vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+        if(ctx->debug_mode){
+          for(int i=0; i < 10; i++){
+            l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code);
+            fprintf(stderr, "%d", i);
+            if(l_rule_is_applicable(form, l_rule)) fprintf(stderr, "*");
+            fprintf(stderr, "\t%s\t%.4f\n", l_rule, vcode_array[i].score);
+          }
+        }
+        int i;
+        for(i=0; i < 10; i++){
+          l_rule = dico_int2string(d_l_rules, vcode_array[i].class_code);
+          if(l_rule_is_applicable(form, l_rule)){
+            char *transformed_lemma = apply_l_rule(form, l_rule);
+            // printf("transformed_lemma = %s\n", transformed_lemma);
+            // print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma));
+            print_word(b0, ctx->mcd_struct, transformed_lemma);
+            free(transformed_lemma);
+            break;
+          }
+        }
+        /* no rule applied */
+        if(i == 10){
+          print_word(b0, ctx->mcd_struct, form);
+        }
+
+        free(vcode_array);
+      }
+    }
+    word_buffer_move_right(c->bf);
+  }
+  config_free(c);
+  if (ctx->input_filename) fclose(f);
+  context_free(ctx);
+  fplm_free(exceptions);
+  feature_table_free(ft);
+  return 0;
+}
+
+#if 0
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  feat_vec *fv = feat_vec_new(10);
+  word *b0;
+  char lemma[200];
+  char form[200];
+  char pos[200];
+  char *lemma_from_fplm;
+  config *c;
+  int l_rule_code;
+  char *l_rule;
+  float max;
+
+
   maca_lemmatizer_check_options(ctx);
   maca_lemmatizer_set_linguistic_resources_filenames(ctx);
 
@@ -189,6 +280,8 @@ int main(int argc, char *argv[])
   return 0;
 }
+#endif
+
 
 #if 0
 int main(int argc, char *argv[])
 {
diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c
index f6c966c6abd824d68fd82397ff7af136e9f70993..d709f702b9ed113714bbac742b3634277e0b3f10 100644
--- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c
+++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c
@@ -17,16 +17,19 @@ void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
 {
   int i;
   word *w;
-  char lower_form[100];
+  char lower_form[1000];
 
   for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){
     w = word_buffer_get_word_n(bf, i);
     if(word_get_signature(w) != -1) break;
     w->signature = form2pos_get_signature(f2p, w->form);
     if(w->signature == -1){
-      strcpy(lower_form, w->form);
-      to_lower_string(lower_form);
-      w->signature = form2pos_get_signature(f2p, lower_form);
+      /* skip forms that are missing or too long to fit in lower_form */
+      if(w->form && strlen(w->form) < sizeof(lower_form)){
+        strcpy(lower_form, w->form);
+        to_lower_string(lower_form);
+        w->signature = form2pos_get_signature(f2p, lower_form);
+      }
     }
   }
 }