From c1b334fd249325f49c31ed5a8db6bb2cd00df2f1 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Tue, 11 Apr 2017 21:07:16 +0200 Subject: [PATCH] implemented a simple morphological analyzer --- CMakeLists.txt | 2 +- maca_common/include/mcd.h | 1 + maca_common/src/mcd.c | 57 ++++++ maca_lemmatizer/src/maca_lemmatizer.c | 14 +- maca_trans_parser/CMakeLists.txt | 12 ++ maca_trans_parser/src/context.c | 3 +- maca_trans_parser/src/context.h | 5 + maca_trans_parser/src/maca_trans_lemmatizer.c | 1 + maca_trans_parser/src/maca_trans_morpho.c | 177 ++++++++++++++++++ .../src/maca_trans_morpho_mcf2cff.c | 129 +++++++++++++ 10 files changed, 398 insertions(+), 3 deletions(-) create mode 100644 maca_trans_parser/src/maca_trans_morpho.c create mode 100644 maca_trans_parser/src/maca_trans_morpho_mcf2cff.c diff --git a/CMakeLists.txt b/CMakeLists.txt index e155629..7ac8c72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ add_subdirectory(maca_common) add_subdirectory(maca_tools) add_subdirectory(perceptron) #add_subdirectory(maca_lemmatizer) -add_subdirectory(maca_morpho) +#add_subdirectory(maca_morpho) add_subdirectory(maca_tokenizer) add_subdirectory(maca_lexer) add_subdirectory(maca_trans_parser) diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 392faf9..0d7761d 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -199,6 +199,7 @@ mcd *mcd_build_conll07(void); mcd *mcd_build_ifpls(void); mcd *mcd_build_wplgf(void); mcd *mcd_build_wplgfs(void); +mcd *mcd_build_wpmlgfs(void); mcd *mcd_read(char *mcd_filename, int verbose); void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose); diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index ca924f2..6911740 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -422,6 +422,63 @@ mcd *mcd_build_wplgfs(void) return m; } +mcd *mcd_build_wpmlgfs(void) /* Builds a 7-column description: 0 FORM, 1 POS, 2 FEATS, 3 LEMMA, 4 GOV, 5 LABEL, 6 SENT_SEG. GOV and SENT_SEG use MCD_REPRESENTATION_INT, the other columns MCD_REPRESENTATION_VOCAB; every filename is "_". Returns the mcd obtained from mcd_new(7). The letters w-p-m-l-g-f-s of the name presumably mirror that column order. */ +{ + mcd *m = mcd_new(7); + int col; + + col = 0; + m->wf[col]=MCD_WF_FORM; 
+ m->wf_str[col]=strdup("FORM"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_FORM] = col; /* wf2col is the reverse table: field id to column index */ + + col = 1; + m->wf[col]=MCD_WF_POS; + m->wf_str[col]=strdup("POS"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_POS] = col; + + col = 2; + m->wf[col]=MCD_WF_FEATS; + m->wf_str[col]=strdup("FEATS"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_FEATS] = col; + + col = 3; + m->wf[col]=MCD_WF_LEMMA; + m->wf_str[col]=strdup("LEMMA"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_LEMMA] = col; + + col = 4; + m->wf[col]=MCD_WF_GOV; + m->wf_str[col]=strdup("GOV"); + m->representation[col]= MCD_REPRESENTATION_INT; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_GOV] = col; + + col = 5; + m->wf[col]=MCD_WF_LABEL; + m->wf_str[col]=strdup("LABEL"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_LABEL] = col; + + col = 6; + m->wf[col]=MCD_WF_SENT_SEG; + m->wf_str[col]=strdup("SENT_SEG"); + m->representation[col]= MCD_REPRESENTATION_INT; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_SENT_SEG] = col; + + return m; +} + /* returns a dico_vec containing the different dictionnaries found in an mcd structure */ diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c index 5d9cacd..e8aeecd 100644 --- a/maca_lemmatizer/src/maca_lemmatizer.c +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -125,6 +125,7 @@ int main(int argc, char *argv[]) char *buffer_copy; char *form; char *pos; + char *feats; char *token; int column_nb; @@ -136,11 +137,16 @@ int main(int argc, char *argv[]) int form_column; int pos_column; int lemma_column; + int feats_column; FILE *f = NULL; ctx = context_read_options(argc, argv); maca_lemmatizer_check_options(ctx); + + feats_column = 
ctx->mcd_struct->wf2col[MCD_WF_FEATS]; + + if(ctx->pos_column != -1) pos_column = ctx->pos_column; else @@ -177,6 +183,7 @@ int main(int argc, char *argv[]) form = NULL; pos = NULL; lemma = NULL; + feats = NULL; do{ if(column_nb == lemma_column) /* lemma is present in the input file */ if(strcmp(token, "_")) /* and it is not an underscore */ @@ -188,6 +195,9 @@ int main(int argc, char *argv[]) if(column_nb == pos_column){ pos = strdup(token); } + if(column_nb == feats_column){ + feats = strdup(token); + } column_nb++; } while((token = strtok(NULL , "\t"))); @@ -215,11 +225,13 @@ int main(int argc, char *argv[]) /* print_word(buffer, ctx->mcd_struct, lemma); */ - /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma); */ + printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma); + printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma); printf("\t%s\n", lemma); if(pos)free(pos); if(form)free(form); + if(feats)free(feats); } free(buffer_copy); free(lemma_array); diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 04cb203..91bb573 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -40,6 +40,12 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse) target_link_libraries(maca_trans_tagger_mcf2cff maca_common) install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin) +add_executable(maca_trans_morpho_mcf2cff ./src/maca_trans_morpho_mcf2cff.c) +target_link_libraries(maca_trans_morpho_mcf2cff perceptron) +target_link_libraries(maca_trans_morpho_mcf2cff transparse) +target_link_libraries(maca_trans_morpho_mcf2cff maca_common) +install (TARGETS maca_trans_morpho_mcf2cff DESTINATION bin) + #add_executable(maca_trans_tagger_mcf2cff_bt ./src/maca_trans_tagger_mcf2cff_bt.c) #target_link_libraries(maca_trans_tagger_mcf2cff_bt perceptron) #target_link_libraries(maca_trans_tagger_mcf2cff_bt transparse) @@ 
-100,6 +106,12 @@ target_link_libraries(maca_trans_tagger transparse) target_link_libraries(maca_trans_tagger maca_common) install (TARGETS maca_trans_tagger DESTINATION bin) +add_executable(maca_trans_morpho ./src/maca_trans_morpho.c) +target_link_libraries(maca_trans_morpho perceptron) +target_link_libraries(maca_trans_morpho transparse) +target_link_libraries(maca_trans_morpho maca_common) +install (TARGETS maca_trans_morpho DESTINATION bin) + #add_executable(maca_trans_tagger_bt ./src/maca_trans_tagger_bt.c) #target_link_libraries(maca_trans_tagger_bt perceptron) #target_link_libraries(maca_trans_tagger_bt transparse) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 72c2d61..bc75f11 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -283,7 +283,8 @@ context *context_read_options(int argc, char *argv[]) if(ctx->mcd_filename) ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); else - ctx->mcd_struct = mcd_build_wplgfs(); + ctx->mcd_struct = mcd_build_wpmlgfs(); + /* ctx->mcd_struct = mcd_build_wplgfs(); */ /* initialize maca_data_path field */ diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 932e671..82db602 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -14,6 +14,11 @@ #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" +#define DEFAULT_MULTI_COL_DESC_MORPHO_FILENAME "maca_trans_morpho.mcd" +#define DEFAULT_FEATURES_MODEL_MORPHO_FILENAME "maca_trans_morpho.fm" +#define DEFAULT_VOCABS_MORPHO_FILENAME "maca_trans_morpho.vocab" +#define DEFAULT_MODEL_MORPHO_FILENAME "maca_trans_morpho.model" + #define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd" #define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm" #define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab" diff --git 
a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c index 351305c..ee38e7a 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer.c @@ -46,6 +46,7 @@ char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, in { char form[1000]; char pos[1000]; + char lemma[1000]; char morpho[1000]; int num = 0;
diff --git a/maca_trans_parser/src/maca_trans_morpho.c b/maca_trans_parser/src/maca_trans_morpho.c
new file mode 100644
index 0000000..be9db3a
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_morpho.c
@@ -0,0 +1,215 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"config2feat_vec.h"
+
+/* Print the usage message: the help sections provided by the context
+   module, with an "INPUT" header written on stderr. */
+void decode_morpho_help_message(context *ctx);
+void decode_morpho_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_beam_help_message(ctx);
+  context_conll_help_message(ctx);
+  fprintf(stderr, "INPUT\n");
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_model_help_message(ctx);
+  context_vocabs_help_message(ctx);
+  context_features_model_help_message(ctx);
+  context_f2p_filename_help_message(ctx);
+}
+
+/* Show the help message and exit with status 1 when ctx->help is set.
+   The tests on the filenames are commented out. */
+void decode_morpho_check_options(context *ctx){
+  if(ctx->help
+     /*!ctx->conll_filename*/
+     /* || !ctx->perc_model_filename
+     || !ctx->mcd_filename
+     || !ctx->vocabs_filename
+     || !ctx->features_model_filename*/
+     ){
+    decode_morpho_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* Return a newly allocated string made of dir followed by name.
+   The buffer is sized from both strings, so a long data path cannot
+   overflow it. Exits with status 1 if the allocation fails. */
+static char *morpho_default_resource_path(const char *dir, const char *name)
+{
+  size_t size = strlen(dir) + strlen(name) + 1;
+  char *path = malloc(size);
+
+  if(!path){
+    fprintf(stderr, "maca_trans_morpho: out of memory while building a resource path\n");
+    exit(1);
+  }
+  snprintf(path, size, "%s%s", dir, name);
+  return path;
+}
+
+/* Give the model, vocabulary and feature model filenames their default
+   value (ctx->maca_data_path followed by the matching
+   DEFAULT_*_MORPHO_FILENAME) when they are not set, then, in verbose
+   mode, write the filenames in use on stderr. */
+void decode_morpho_set_linguistic_resources_filenames(context *ctx)
+{
+  if(!ctx->perc_model_filename)
+    ctx->perc_model_filename = morpho_default_resource_path(ctx->maca_data_path, DEFAULT_MODEL_MORPHO_FILENAME);
+
+  if(!ctx->vocabs_filename)
+    ctx->vocabs_filename = morpho_default_resource_path(ctx->maca_data_path, DEFAULT_VOCABS_MORPHO_FILENAME);
+
+  if(!ctx->features_model_filename)
+    ctx->features_model_filename = morpho_default_resource_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_MORPHO_FILENAME);
+
+  if(ctx->verbose){
+    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
+    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
+    /* mcd_filename may be NULL and passing NULL for %s is undefined */
+    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
+    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
+  }
+}
+
+/* Print word w on stdout, its FEATS column holding the string that
+   dico_morph gives for code postag.
+   When mcd_struct has no FEATS column, w->input is printed followed by
+   a tab and that string. Otherwise the tab separated columns of
+   w->input are printed with the FEATS one replaced; when the input has
+   no more than mcd_get_feats_col() columns, a tab and the string are
+   appended instead.
+   The first case used to test the POS column, so a description with
+   POS but without FEATS never printed the feats string. */
+void print_word(word *w, mcd *mcd_struct, dico *dico_morph, int postag)
+{
+  int feats_col = mcd_get_feats_col(mcd_struct);
+  char *buffer = NULL;
+  char *token = NULL;
+  int col_nb = 0;
+
+  if(feats_col == -1){
+    printf("%s\t%s\n", w->input, dico_int2string(dico_morph, postag));
+    return;
+  }
+
+  /* strtok modifies its argument, so the columns are counted on a copy */
+  buffer = strdup(w->input);
+  if(!buffer){
+    fprintf(stderr, "maca_trans_morpho: out of memory while printing a word\n");
+    exit(1);
+  }
+  for(token = strtok(buffer, "\t"); token; token = strtok(NULL, "\t")){
+    if(col_nb != 0) printf("\t");
+    if(col_nb == feats_col)
+      printf("%s", dico_int2string(dico_morph, postag));
+    else
+      word_print_col_n(stdout, w, col_nb);
+    col_nb++;
+  }
+  if(col_nb <= feats_col)
+    printf("\t%s", dico_int2string(dico_morph, postag));
+  printf("\n");
+  free(buffer);
+}
+
+/* Set feats on the word returned by word_buffer_b0(c->bf), then move
+   the buffer one position to the right. Always returns 1. */
+int movement_morpho(config *c, int feats)
+{
+  word_set_feats(word_buffer_b0(c->bf), feats);
+  word_buffer_move_right(c->bf);
+
+  return 1;
+}
+
+/* Greedy decoder. Words are read from ctx->input_filename, or from
+   stdin when it is not set. A word whose feats code is -1 receives the
+   class returned by feature_table_argmax; any other word keeps its
+   code. Each word is printed with print_word before moving on. */
+void simple_decoder_morpho(context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
+  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  int feats;
+  float max;
+  word *b0;
+  dico *dico_feats = dico_vec_get_dico(ctx->vocabs, (char *)"FEATS");
+
+  c = config_new(f, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    b0 = word_buffer_b0(c->bf);
+    feats = word_get_feats(b0);
+
+    if(ctx->debug_mode){
+      fprintf(stderr, "***********************************\n");
+      config_print(stderr, c);
+    }
+
+    /* if feats is not specified in input it is predicted */
+    if(feats == -1){
+      /* config_print(stdout, c); */
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+
+      /* feat_vec_print(stdout, fv); */
+      feats = feature_table_argmax(fv, ft, &max);
+      /* printf("feats = %d\n", feats); */
+
+      if(ctx->debug_mode){
+        vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+        /* NOTE(review): reads three entries, assumes the model has at least three classes - confirm */
+        for(int i=0; i < 3; i++){
+          fprintf(stderr, "%d\t", i);
+          fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_feats, vcode_array[i].class_code), vcode_array[i].score);
+        }
+        free(vcode_array);
+      }
+    }
+
+    print_word(b0, ctx->mcd_struct, dico_feats, feats);
+
+    movement_morpho(c, feats);
+
+  }
+  /* config_print(stdout, c); */
+  feat_vec_free(fv);
+  feature_table_free(ft);
+  config_free(c);
+  if (ctx->input_filename) fclose(f);
+}
+
+
+/* Read the options, load the feature model and the vocabularies, link
+   the mcd to the vocabularies and run the greedy decoder.
+   NOTE(review): nothing is decoded when ctx->beam_width is not 1 - confirm this is intended. */
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  decode_morpho_check_options(ctx);
+
+  decode_morpho_set_linguistic_resources_filenames(ctx);
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+
+  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  if(ctx->beam_width == 1)
+    simple_decoder_morpho(ctx);
+
+  context_free(ctx);
+  return 0;
+}
+
diff --git 
a/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c b/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c
new file mode 100644
index 0000000..a821863
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c
@@ -0,0 +1,152 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"feat_fct.h"
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"config2feat_vec.h"
+
+
+/* Oracle: return the feats code stored in the word given by
+   word_buffer_b0 for the buffer of c. */
+int oracle_morpho(config *c)
+{
+  return word_get_feats(word_buffer_b0(config_get_buffer(c)));
+}
+
+
+/* Set feats on the word returned by word_buffer_b0(c->bf), then move
+   the buffer one position to the right. Always returns 1. */
+int movement_morpho(config *c, int feats)
+{
+  word_set_feats(word_buffer_b0(c->bf), feats);
+  word_buffer_move_right(c->bf);
+
+  return 1;
+}
+
+/* Print the usage message: help sections of the context module, grouped
+   under the INPUT and OUTPUT headers written on stderr. */
+void maca_trans_morpho_mcf2cff_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+  context_mcd_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_vocabs_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_vocabs_help_message(ctx);
+
+
+}
+
+/* Show the help message and exit with status 1 when help is requested,
+   when no input file is given, or when neither a cff nor a fann output
+   filename is given. */
+void maca_trans_morpho_mcf2cff_check_options(context *ctx)
+{
+  if(!ctx->input_filename
+     || ctx->help
+     /* || !ctx->mcd_filename */
+     || !(ctx->cff_filename || ctx->fann_filename)
+     ){
+    maca_trans_morpho_mcf2cff_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* For every word of ctx->input_filename, write on output_file the feats
+   code given by oracle_morpho followed by the output of feat_vec_print.
+   The feature vector, the configuration and the input file are released
+   before returning; they used to be leaked. */
+void morpho_generate_training_file(FILE *output_file, context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *conll_file = myfopen(ctx->input_filename, "r");
+  int feats;
+  /* dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); */
+
+  c = config_new(conll_file, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
+    feats = oracle_morpho(c);
+
+    fprintf(output_file, "%d", feats);
+    feat_vec_print(output_file, fv);
+    movement_morpho(c, feats);
+  }
+
+  /* same release sequence as in simple_decoder_morpho (maca_trans_morpho.c) */
+  feat_vec_free(fv);
+  config_free(c);
+  fclose(conll_file);
+}
+
+/* Read the options and the feature model, prepare the vocabularies
+   (extracted from the corpus in train mode, read from
+   ctx->vocabs_filename in test mode), write the training file and, in
+   train mode, save the vocabularies in ctx->vocabs_filename. */
+int main(int argc, char *argv[])
+{
+  context *ctx;
+  FILE *output_file;
+
+  ctx = context_read_options(argc, argv);
+  maca_trans_morpho_mcf2cff_check_options(ctx);
+
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+
+
+  if(ctx->mode == TRAIN_MODE){
+    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
+    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
+  }
+  else if(ctx->mode == TEST_MODE){
+    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+  }
+
+  /* in train mode create feature dictionary for perceptron */
+  if(ctx->mode == TRAIN_MODE)
+    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
+
+  /* in test mode read feature dictionary for perceptron */
+  if(ctx->mode == TEST_MODE)
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  /* add the feature dictionary to the dico vector */
+  /* NOTE(review): in test mode the dictionary comes from ctx->vocabs and is added to it again - confirm */
+  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
+
+  /* open output file */
+  if(ctx->cff_filename)
+    output_file = myfopen(ctx->cff_filename, "w");
+  else
+    output_file = stdout;
+
+  morpho_generate_training_file(output_file, ctx);
+
+  if(ctx->mode == TRAIN_MODE){
+    /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
+    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
+
+  }
+
+  if(ctx->cff_filename)
+    fclose(output_file);
+  context_free(ctx);
+  return 0;
+}
+
-- GitLab