diff --git a/maca_common/src/fplm.c b/maca_common/src/fplm.c index a64af843e65d372c730276dd4543ced99085f4e5..6e146738ca65f807cb61e5f4d0cfb15890564f8a 100644 --- a/maca_common/src/fplm.c +++ b/maca_common/src/fplm.c @@ -108,5 +108,6 @@ char *fplm_lookup_lemma(fplm_struct *fplm, char *form, char *pos, int verbose) if(verbose) fprintf(stderr, "cannot find an entry for %s %s\n", form, pos); - return form; + return NULL; + // return form; } diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 409b89371366e720c8702aaa01f385521171d7da..1ffc335cc55b5988b509537b5e7527d6e3bbb46e 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -38,6 +38,12 @@ target_link_libraries(transparse perceptron) #target_link_libraries(maca_trans_parser_nn maca_common) #install (TARGETS maca_trans_parser_nn DESTINATION bin) +add_executable(maca_trans_lemmatizer_mcf2cff ./src/maca_trans_lemmatizer_mcf2cff.c) +target_link_libraries(maca_trans_lemmatizer_mcf2cff perceptron) +target_link_libraries(maca_trans_lemmatizer_mcf2cff transparse) +target_link_libraries(maca_trans_lemmatizer_mcf2cff maca_common) +install (TARGETS maca_trans_lemmatizer_mcf2cff DESTINATION bin) + add_executable(maca_trans_tagger_mcf2cff ./src/maca_trans_tagger_mcf2cff.c) target_link_libraries(maca_trans_tagger_mcf2cff perceptron) target_link_libraries(maca_trans_tagger_mcf2cff transparse) diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c index a6c732d37fa0d975dbb1a113f1c70873a2534fd5..c8097da1195fda0d09d8101e878506cbf52418c9 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer.c @@ -82,6 +82,7 @@ int main(int argc, char *argv[]) char lemma[200]; char form[200]; char pos[200]; + char *lemma_from_fplm; config *c; fplm_struct *fplm; FILE *f; @@ -101,8 +102,13 @@ int main(int argc, char *argv[]) /* if lemma is not specified in input it is looked up */ if(strlen(lemma) && strcmp(lemma, "_")) print_word(b0, ctx->mcd_struct, lemma); - else - print_word(b0, ctx->mcd_struct, fplm_lookup_lemma(fplm, form, pos, ctx->verbose)); + else{ + lemma_from_fplm = fplm_lookup_lemma(fplm, form, pos, ctx->verbose); + if(lemma_from_fplm) + print_word(b0, ctx->mcd_struct, lemma_from_fplm); + else + print_word(b0, ctx->mcd_struct, form); + } word_buffer_move_right(c->bf); } config_free(c); diff --git a/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c index 91aa5a57efd02bbae6c45673e3a61feaa3c4b7ee..c90e3f3aeab8847445b3cd83b64d37c6dd6fab02 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer_mcf2cff.c @@ -11,8 +11,10 @@ #include"dico_vec.h" #include"word_emb.h" #include"config2feat_vec.h" +#include"fplm.h" +#include"l_rule.h" -int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules) +/*int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules, fplm_struct *exceptions) { char lemma[200]; char form[200]; @@ -26,7 +28,7 @@ int oracle_lemmatizer(config *c, context *ctx, dico *d_l_rules) l_rule_code = dico_string2int(d_l_rule, l_rule); free(l_rule); return l_rule_code; -} + }*/ int movement_lemmatizer(config *c, int feats) { @@ -66,28 +68,58 @@ void maca_trans_lemmatizer_mcf2cff_check_options(context *ctx) } } -void generate_training_file(FILE *output_file, context *ctx) +void generate_training_file(FILE *output_file, context *ctx, dico *d_l_rules, fplm_struct *exceptions) { config *c; feat_vec *fv = feat_vec_new(feature_types_nb); FILE *conll_file = myfopen(ctx->input_filename, "r"); - int l_rule; + char *l_rule; + int l_rule_code; + char lemma[200]; + char form[200]; + char pos[200]; + char *lemma_from_fplm; + int strict = 1; + /* dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); */ c = config_new(conll_file, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ + word_sprint_col_n(lemma, word_buffer_b0(c->bf), mcd_get_lemma_col(ctx->mcd_struct)); + word_sprint_col_n(form, word_buffer_b0(c->bf), mcd_get_form_col(ctx->mcd_struct)); + word_sprint_col_n(pos, word_buffer_b0(c->bf), mcd_get_pos_col(ctx->mcd_struct)); - - l_rule = oracle_lemmatizer(c); - + // printf("form = %s pos = %s lemma = %s\n", form, pos, lemma); - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, 0); + + if(lemma_from_fplm){ + // printf("exception\n"); + word_buffer_move_right(c->bf); + continue; + } + + l_rule = compute_l_rule(lemma, form, strict); + + //printf("rule = %s\n", l_rule); - fprintf(output_file, "%d", l_rule); - feat_vec_print(output_file, fv); - movement_lemmatizer(c, l_rule); + l_rule_code = dico_string2int(d_l_rules, l_rule); + free(l_rule); + + if(l_rule_code != -1){ + // fprintf(stdout, "rule exists\n"); + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + fprintf(output_file, "%d", l_rule_code); + feat_vec_print(output_file, fv); + } + else{ + // fprintf(stdout, "rule does not exist\n"); + + } + word_buffer_move_right(c->bf); + // movement_lemmatizer(c, l_rule); } } @@ -96,10 +128,14 @@ int main(int argc, char *argv[]) context *ctx; FILE *output_file; dico *d_l_rules; + fplm_struct *exceptions; ctx = context_read_options(argc, argv); maca_trans_lemmatizer_mcf2cff_check_options(ctx); - d_l_rules = dico_read("l_rules", "r"); + // exceptions = fplm_load_file(ctx->fplm_filename, ctx->verbose); + exceptions = fplm_load_file((char *)"exceptions.fplm", ctx->verbose); + d_l_rules = dico_read((char *)"rules", 0.5); + ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); if(ctx->mode == TRAIN_MODE){ @@ -128,7 +164,7 @@ int main(int argc, char *argv[]) else output_file = stdout; - generate_training_file(output_file, ctx); + generate_training_file(output_file, ctx, d_l_rules, exceptions); if(ctx->mode == TRAIN_MODE){ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */