From bbb0919cadd8e77ad9caf6639495879d75f3ef1f Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Tue, 12 Jul 2016 14:58:38 -0400 Subject: [PATCH] code refactoring --- INSTALL | 3 ++ maca_common/include/util.h | 1 + maca_common/src/form2pos.c | 4 +- maca_common/src/util.c | 9 +++++ maca_lemmatizer/src/maca_lemmatizer.c | 2 +- maca_trans_parser/src/context.c | 37 +++++++++---------- maca_trans_parser/src/context.h | 7 ++++ maca_trans_parser/src/decode.c | 3 ++ maca_trans_parser/src/decode_tagger.c | 3 +- maca_trans_parser/src/depset.c | 3 +- .../src/maca_trans_parser_conll2cff.c | 4 ++ .../src/maca_trans_parser_conll2cff_tagger.c | 6 +-- maca_trans_parser/src/queue.c | 2 +- maca_trans_parser/src/simple_decoder_tagger.c | 25 ++----------- maca_trans_parser/src/word.c | 1 + 15 files changed, 62 insertions(+), 48 deletions(-) diff --git a/INSTALL b/INSTALL index 8306709..b6e0cf2 100644 --- a/INSTALL +++ b/INSTALL @@ -10,6 +10,9 @@ The basic procedure to build and install macaon from sources is the following. - Launch the cmake command: cmake .. + If you want to compile macaon with debugging options type: + cmake -DCMAKE_BUILD_TYPE=Debug .. + If you want to install macaon locally, you can specify the install path with : cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir diff --git a/maca_common/include/util.h b/maca_common/include/util.h index 1700e95..26c0952 100644 --- a/maca_common/include/util.h +++ b/maca_common/include/util.h @@ -5,4 +5,5 @@ void myfree(void *ptr); void *memalloc(size_t s); FILE *myfopen(const char *path, const char *mode); +FILE *myfopen_no_exit(const char *path, const char *mode); #endif diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c index 610ccf4..1a98ad3 100644 --- a/maca_common/src/form2pos.c +++ b/maca_common/src/form2pos.c @@ -31,7 +31,7 @@ void form2pos_free(form2pos *f2p) form2pos *form2pos_read(char *filename) { - FILE *f = myfopen(filename, "r"); + FILE *f = myfopen_no_exit(filename, "r"); int nbelem; int pos_nb; char pos_list[10000]; @@ -39,6 +39,8 @@ form2pos *form2pos_read(char *filename) char signature[200]; form2pos *f2p = NULL; + if(f == NULL) return NULL; + /* read number of forms */ fscanf(f, "%d\n", &nbelem); diff --git a/maca_common/src/util.c b/maca_common/src/util.c index 84a1ba7..4ff0352 100644 --- a/maca_common/src/util.c +++ b/maca_common/src/util.c @@ -25,3 +25,12 @@ FILE *myfopen(const char *path, const char *mode) } return f; } + +FILE *myfopen_no_exit(const char *path, const char *mode) +{ + FILE *f = fopen(path, mode); + if(f == NULL){ + fprintf(stderr, "cannot open file %s\n", path); + } + return f; +} diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c index d705599..b748e7b 100644 --- a/maca_lemmatizer/src/maca_lemmatizer.c +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) /* look for a valid word */ while(fgets(buffer, 10000, f)){ if(feof(f)) return 0; /* no more words to read */ - if((buffer[0] == '\n') || (buffer[0] == ' ')){ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){ printf("\n"); continue; } diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 76dff3d..4300e4a 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -7,9 +7,6 @@ #include "context.h" #include "util.h" - -void context_set_linguistic_resources_filenames(context *ctx); - void context_free(context *ctx) { if(ctx->program_name) free(ctx->program_name); @@ -306,13 +303,8 @@ context *context_read_options(int argc, char *argv[]) } } - context_set_linguistic_resources_filenames(ctx); - if(ctx->features_model_filename){ - ctx->features_model = feat_model_read(ctx->features_model_filename); - } - /* if(ctx->mcd_filename && ctx->conll_filename){ ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->conll_filename); ctx->mvt_nb = ctx->mcd_struct->dico_array[ctx->mcd_struct->type2col[FEAT_TYPE_LABEL]]->nbelem * 2 + 1; @@ -341,7 +333,7 @@ context *context_read_options(int argc, char *argv[]) return ctx; } -void context_set_linguistic_resources_filenames(context *ctx) +void context_set_linguistic_resources_filenames_parser(context *ctx) { char absolute_path[500]; char absolute_filename[500]; @@ -382,11 +374,10 @@ void context_set_linguistic_resources_filenames(context *ctx) ctx->features_model_filename = strdup(absolute_filename); } - /* fprintf(stdout, "perc_model_filename = %s\n", ctx->perc_model_filename); - fprintf(stdout, "vocabs_filename = %s\n", ctx->vocabs_filename); - fprintf(stdout, "mcd_filename = %s\n", ctx->mcd_filename); - fprintf(stdout, "perc_features_model_filename = %s\n", ctx->features_model_filename);*/ - + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); } void context_set_linguistic_resources_filenames_tagger(context *ctx) @@ -405,7 +396,7 @@ void context_set_linguistic_resources_filenames_tagger(context *ctx) strcat(absolute_path, ctx->language); strcat(absolute_path, "/bin/"); - + if(!ctx->perc_model_filename){ strcpy(absolute_filename, absolute_path); strcat(absolute_filename, DEFAULT_MODEL_TAGGER_FILENAME); @@ -430,9 +421,17 @@ void context_set_linguistic_resources_filenames_tagger(context *ctx) ctx->features_model_filename = strdup(absolute_filename); } - /* fprintf(stdout, "perc_model_filename = %s\n", ctx->perc_model_filename); - fprintf(stdout, "vocabs_filename = %s\n", ctx->vocabs_filename); - fprintf(stdout, "mcd_filename = %s\n", ctx->mcd_filename); - fprintf(stdout, "perc_features_model_filename = %s\n", ctx->features_model_filename);*/ + if(!ctx->f2p_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_F2P_FILENAME); + ctx->f2p_filename = strdup(absolute_filename); + ctx->f2p = form2pos_read(ctx->f2p_filename); + } + + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); } diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 83d859b..ff17413 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -14,6 +14,7 @@ #define DEFAULT_FEATURES_MODEL_TAGGER_FILENAME "maca_trans_tagger.fm" #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" +#define DEFAULT_F2P_FILENAME "fP" #include "dico_vec.h" #include "feat_model.h" @@ -92,4 +93,10 @@ void context_maca_data_path_help_message(context *ctx); void context_f2p_filename_help_message(context *ctx); +void context_set_linguistic_resources_filenames_tagger(context *ctx); +void context_set_linguistic_resources_filenames_parser(context *ctx); + + + + #endif diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c index fcd0e66..aabe26a 100644 --- a/maca_trans_parser/src/decode.c +++ b/maca_trans_parser/src/decode.c @@ -53,6 +53,9 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); decode_check_options(ctx); + context_set_linguistic_resources_filenames_parser(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename); + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); diff --git a/maca_trans_parser/src/decode_tagger.c b/maca_trans_parser/src/decode_tagger.c index f2d92c8..722cdd7 100644 --- a/maca_trans_parser/src/decode_tagger.c +++ b/maca_trans_parser/src/decode_tagger.c @@ -46,11 +46,12 @@ int main(int argc, char *argv[]) { FILE *conll_file = NULL; context *ctx; - /* struct fann *ann; */ ctx = context_read_options(argc, argv); decode_check_options(ctx); + context_set_linguistic_resources_filenames_tagger(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename); ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); diff --git a/maca_trans_parser/src/depset.c b/maca_trans_parser/src/depset.c index 4f7b8a5..a71bba8 100644 --- a/maca_trans_parser/src/depset.c +++ b/maca_trans_parser/src/depset.c @@ -77,7 +77,8 @@ void depset_print2(FILE *f, depset *d, dico *dico_labels) for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label)); + /* fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label ));*/ + fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); } } fprintf(f, "\n"); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c index 8193933..d42ad86 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff.c @@ -169,6 +169,10 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_trans_parser_conll2cff_check_options(ctx); + + + ctx->features_model = feat_model_read(ctx->features_model_filename); + if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c index cfd7965..85f0f4e 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c @@ -82,7 +82,6 @@ void generate_training_file_stream(FILE *output_file, context *ctx) fprintf(output_file, "%d", postag); feat_vec_print(output_file, fv); - if(postag != -1) movement_tagger(c, postag, 0, 1); } @@ -111,7 +110,6 @@ void generate_training_file_buffer(FILE *output_file, context *ctx) if(ctx->f2p) add_signature_to_words_in_queue(c->bf, ctx->f2p); - while(!config_is_terminal(c)){ /* config_print(stdout, c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); @@ -136,6 +134,9 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_trans_parser_conll2cff_check_options(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename); + + if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); @@ -146,7 +147,6 @@ int main(int argc, char *argv[]) } feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); - /* in train mode create feature dictionnary for perceptron */ if(ctx->mode == TRAIN_MODE) diff --git a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c index 3b268ef..9d1315f 100644 --- a/maca_trans_parser/src/queue.c +++ b/maca_trans_parser/src/queue.c @@ -22,7 +22,7 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct) while(fgets(buffer, 10000, f)){ if(feof(f)) break; /* fprintf(stderr, "%s", buffer); */ - if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); if(word_get_index(w) == -1){ w->feat_array[FEAT_TYPE_INDEX] = index++; diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c index 6eb1cce..d312fdd 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -21,25 +21,17 @@ void add_signature_to_words_in_queue(queue *bf, form2pos *f2p) } } - void simple_decoder_buffer(context *ctx) { - FILE *f = NULL; dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); feature_table *ft = feature_table_load(ctx->perc_model_filename); - config *c = NULL; int postag; feat_vec *fv = feat_vec_new(feature_types_nb); float max; int i; - word *w; - - if(ctx->conll_filename) - f= myfopen(ctx->conll_filename, "r"); - else - f= stdin; - - c = config_initial(f, ctx->mcd_struct, 1000, 0); + word *w = NULL; + FILE *f = (ctx->conll_filename)? myfopen(ctx->conll_filename, "r") : stdin; + config *c = config_initial(f, ctx->mcd_struct, 1000, 0); /* read a sentence and put it in the buffer */ while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){ @@ -59,6 +51,7 @@ void simple_decoder_buffer(context *ctx) w = stack_elt_n(c->st, i); printf("%s\t%s\n", w->input, dico_int2string(dico_pos, word_get_pos(w))); } + printf("\n"); /* config_free(c); */ c = config_initial(f, ctx->mcd_struct, 1000, 0); @@ -74,7 +67,6 @@ void simple_decoder_stream(context *ctx) feat_vec *fv = feat_vec_new(feature_types_nb); FILE *f = NULL; - /* when in stream mode, force to renumber the tokens (ugly !) */ ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; @@ -82,23 +74,14 @@ void simple_decoder_stream(context *ctx) while(!config_is_terminal(c)){ config_print(stdout, c); config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - - } - /* config_print(stdout, c); */ - /* config_free(c); */ - } void simple_decoder_tagger(context *ctx) -/* (FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose, int stream_mode)*/ { - - /*conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, ctx->stream_mode);*/ - ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); if(ctx->stream_mode) diff --git a/maca_trans_parser/src/word.c b/maca_trans_parser/src/word.c index d2a28f7..d163f13 100644 --- a/maca_trans_parser/src/word.c +++ b/maca_trans_parser/src/word.c @@ -31,6 +31,7 @@ word *word_read(FILE *f, mcd *mcd_struct) while(fgets(buffer, 10000, f)){ if(feof(f)) return NULL; /* no more words to read */ if((buffer[0] != '\n') && (buffer[0] != ' ')){ + /* printf("word = %s\n", buffer); */ return word_parse_buffer(buffer, mcd_struct); } } -- GitLab