diff --git a/INSTALL b/INSTALL index 83067099ddb933c288004adc6350c8100061a32a..b6e0cf27d4a76ae5457feb20ce44741336ee5860 100644 --- a/INSTALL +++ b/INSTALL @@ -10,6 +10,9 @@ The basic procedure to build and install macaon from sources is the following. - Launch the cmake command: cmake .. + If you want to compile macaon with debugging options type: + cmake -DCMAKE_BUILD_TYPE=Debug .. + If you want to install macaon locally, you can specify the install path with : cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir diff --git a/maca_common/include/util.h b/maca_common/include/util.h index 1700e95e657497f369d092cd526d3adc59da44f8..26c0952c1798dc24103f20a5e9e6a6619f3a8cce 100644 --- a/maca_common/include/util.h +++ b/maca_common/include/util.h @@ -5,4 +5,5 @@ void myfree(void *ptr); void *memalloc(size_t s); FILE *myfopen(const char *path, const char *mode); +FILE *myfopen_no_exit(const char *path, const char *mode); #endif diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c index 610ccf484380740a2f231d74baa87178dba510d2..1a98ad310005f57ab8207aa2fc6499065e4097eb 100644 --- a/maca_common/src/form2pos.c +++ b/maca_common/src/form2pos.c @@ -31,7 +31,7 @@ void form2pos_free(form2pos *f2p) form2pos *form2pos_read(char *filename) { - FILE *f = myfopen(filename, "r"); + FILE *f = myfopen_no_exit(filename, "r"); int nbelem; int pos_nb; char pos_list[10000]; @@ -39,6 +39,8 @@ form2pos *form2pos_read(char *filename) char signature[200]; form2pos *f2p = NULL; + if(f == NULL) return NULL; + /* read number of forms */ fscanf(f, "%d\n", &nbelem); diff --git a/maca_common/src/util.c b/maca_common/src/util.c index 84a1ba7ab2f6638faf60751b242ad8b91b83c745..4ff03522692bd20ddba0020216d2c1cf44bec0d4 100644 --- a/maca_common/src/util.c +++ b/maca_common/src/util.c @@ -25,3 +25,12 @@ FILE *myfopen(const char *path, const char *mode) } return f; } + +FILE *myfopen_no_exit(const char *path, const char *mode) +{ + FILE *f = fopen(path, mode); + if(f == NULL){ + fprintf(stderr, "cannot open file %s\n", path); + } + return f; +} diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c index d7055995fbea59931bd93334194b3fa5835e241a..b748e7b2ecc1d2f0fba81b4fafe90844f71cf5f7 100644 --- a/maca_lemmatizer/src/maca_lemmatizer.c +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -123,7 +123,7 @@ int main(int argc, char *argv[]) /* look for a valid word */ while(fgets(buffer, 10000, f)){ if(feof(f)) return 0; /* no more words to read */ - if((buffer[0] == '\n') || (buffer[0] == ' ')){ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){ printf("\n"); continue; } diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 76dff3db872da51e308bc45283963fa5529b9c57..4300e4acf9e4be79bc0598dde057b715e6a2d88d 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -7,9 +7,6 @@ #include "context.h" #include "util.h" - -void context_set_linguistic_resources_filenames(context *ctx); - void context_free(context *ctx) { if(ctx->program_name) free(ctx->program_name); @@ -306,13 +303,8 @@ context *context_read_options(int argc, char *argv[]) } } - context_set_linguistic_resources_filenames(ctx); - if(ctx->features_model_filename){ - ctx->features_model = feat_model_read(ctx->features_model_filename); - } - /* if(ctx->mcd_filename && ctx->conll_filename){ ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->conll_filename); ctx->mvt_nb = ctx->mcd_struct->dico_array[ctx->mcd_struct->type2col[FEAT_TYPE_LABEL]]->nbelem * 2 + 1; @@ -341,7 +333,7 @@ context *context_read_options(int argc, char *argv[]) return ctx; } -void context_set_linguistic_resources_filenames(context *ctx) +void context_set_linguistic_resources_filenames_parser(context *ctx) { char absolute_path[500]; char absolute_filename[500]; @@ -382,11 +374,10 @@ void context_set_linguistic_resources_filenames(context *ctx) ctx->features_model_filename = strdup(absolute_filename); } - /* fprintf(stdout, "perc_model_filename = %s\n", ctx->perc_model_filename); - fprintf(stdout, "vocabs_filename = %s\n", ctx->vocabs_filename); - fprintf(stdout, "mcd_filename = %s\n", ctx->mcd_filename); - fprintf(stdout, "perc_features_model_filename = %s\n", ctx->features_model_filename);*/ - + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); } void context_set_linguistic_resources_filenames_tagger(context *ctx) @@ -405,7 +396,7 @@ void context_set_linguistic_resources_filenames_tagger(context *ctx) strcat(absolute_path, ctx->language); strcat(absolute_path, "/bin/"); - + if(!ctx->perc_model_filename){ strcpy(absolute_filename, absolute_path); strcat(absolute_filename, DEFAULT_MODEL_TAGGER_FILENAME); @@ -430,9 +421,17 @@ void context_set_linguistic_resources_filenames_tagger(context *ctx) ctx->features_model_filename = strdup(absolute_filename); } - /* fprintf(stdout, "perc_model_filename = %s\n", ctx->perc_model_filename); - fprintf(stdout, "vocabs_filename = %s\n", ctx->vocabs_filename); - fprintf(stdout, "mcd_filename = %s\n", ctx->mcd_filename); - fprintf(stdout, "perc_features_model_filename = %s\n", ctx->features_model_filename);*/ + if(!ctx->f2p_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_F2P_FILENAME); + ctx->f2p_filename = strdup(absolute_filename); + ctx->f2p = form2pos_read(ctx->f2p_filename); + } + + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); } diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 83d859bf4cc8ba54493b095d738e7be700971bf2..ff174133009135a6feac56049d62fa49748fa087 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -14,6 +14,7 @@ #define DEFAULT_FEATURES_MODEL_TAGGER_FILENAME "maca_trans_tagger.fm" #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" +#define DEFAULT_F2P_FILENAME "fP" #include "dico_vec.h" #include "feat_model.h" @@ -92,4 +93,10 @@ void context_maca_data_path_help_message(context *ctx); void context_f2p_filename_help_message(context *ctx); +void context_set_linguistic_resources_filenames_tagger(context *ctx); +void context_set_linguistic_resources_filenames_parser(context *ctx); + + + + #endif diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c index fcd0e66ba6f49fdee01ba14717ec70647b66dd01..aabe26a6db463d362eae0cb536ccc15faa9ed3ba 100644 --- a/maca_trans_parser/src/decode.c +++ b/maca_trans_parser/src/decode.c @@ -53,6 +53,9 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); decode_check_options(ctx); + context_set_linguistic_resources_filenames_parser(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename); + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); diff --git a/maca_trans_parser/src/decode_tagger.c b/maca_trans_parser/src/decode_tagger.c index f2d92c876b3833eb9002979afc73313251e0ffa5..722cdd773b3c42399b649a1c8b2113b0a2948912 100644 --- a/maca_trans_parser/src/decode_tagger.c +++ b/maca_trans_parser/src/decode_tagger.c @@ -46,11 +46,12 @@ int main(int argc, char *argv[]) { FILE *conll_file = NULL; context *ctx; - /* struct fann *ann; */ ctx = context_read_options(argc, argv); decode_check_options(ctx); + context_set_linguistic_resources_filenames_tagger(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename); ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); diff --git a/maca_trans_parser/src/depset.c b/maca_trans_parser/src/depset.c index 4f7b8a56b5eeb09ed43f8a8b5e91a73dc3341693..a71bba8d19aed0727fa982474eef064634ff004a 100644 --- a/maca_trans_parser/src/depset.c +++ b/maca_trans_parser/src/depset.c @@ -77,7 +77,8 @@ void depset_print2(FILE *f, depset *d, dico *dico_labels) for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label)); + /* fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label ));*/ + fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); } } fprintf(f, "\n"); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c index 81939332a90d226839fca522a5d7f2135e7a47d1..d42ad86ec901ce2a0c1c527b0db64c379b9a14e3 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff.c @@ -169,6 +169,10 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_trans_parser_conll2cff_check_options(ctx); + + + ctx->features_model = feat_model_read(ctx->features_model_filename); + if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c index cfd79657fe15d1f44fa55b535266f3d674ebf5f6..85f0f4e92927ece2bf7f9c67fd585836a513fda3 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c @@ -82,7 +82,6 @@ void generate_training_file_stream(FILE *output_file, context *ctx) fprintf(output_file, "%d", postag); feat_vec_print(output_file, fv); - if(postag != -1) movement_tagger(c, postag, 0, 1); } @@ -111,7 +110,6 @@ void generate_training_file_buffer(FILE *output_file, context *ctx) if(ctx->f2p) add_signature_to_words_in_queue(c->bf, ctx->f2p); - while(!config_is_terminal(c)){ /* config_print(stdout, c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); @@ -136,6 +134,9 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_trans_parser_conll2cff_check_options(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename); + + if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); @@ -146,7 +147,6 @@ int main(int argc, char *argv[]) } feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); - /* in train mode create feature dictionnary for perceptron */ if(ctx->mode == TRAIN_MODE) diff --git a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c index 3b268ef3f8543dd77dd9c0d4a1660a2c1a4100ac..9d1315f748a9f23fe7512bf94a57a0c6ff65228b 100644 --- a/maca_trans_parser/src/queue.c +++ b/maca_trans_parser/src/queue.c @@ -22,7 +22,7 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct) while(fgets(buffer, 10000, f)){ if(feof(f)) break; /* fprintf(stderr, "%s", buffer); */ - if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */ + if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); if(word_get_index(w) == -1){ w->feat_array[FEAT_TYPE_INDEX] = index++; diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c index 6eb1cce8f7c16d915f5ffc93ad044c65e8c63b92..d312fdd6f6ea2eaa402eca9ad00b2bc92fbfb815 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -21,25 +21,17 @@ void add_signature_to_words_in_queue(queue *bf, form2pos *f2p) } } - void simple_decoder_buffer(context *ctx) { - FILE *f = NULL; dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); feature_table *ft = feature_table_load(ctx->perc_model_filename); - config *c = NULL; int postag; feat_vec *fv = feat_vec_new(feature_types_nb); float max; int i; - word *w; - - if(ctx->conll_filename) - f= myfopen(ctx->conll_filename, "r"); - else - f= stdin; - - c = config_initial(f, ctx->mcd_struct, 1000, 0); + word *w = NULL; + FILE *f = (ctx->conll_filename)? myfopen(ctx->conll_filename, "r") : stdin; + config *c = config_initial(f, ctx->mcd_struct, 1000, 0); /* read a sentence and put it in the buffer */ while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){ @@ -59,6 +51,7 @@ void simple_decoder_buffer(context *ctx) w = stack_elt_n(c->st, i); printf("%s\t%s\n", w->input, dico_int2string(dico_pos, word_get_pos(w))); } + printf("\n"); /* config_free(c); */ c = config_initial(f, ctx->mcd_struct, 1000, 0); @@ -74,7 +67,6 @@ void simple_decoder_stream(context *ctx) feat_vec *fv = feat_vec_new(feature_types_nb); FILE *f = NULL; - /* when in stream mode, force to renumber the tokens (ugly !) */ ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; @@ -82,23 +74,14 @@ void simple_decoder_stream(context *ctx) while(!config_is_terminal(c)){ config_print(stdout, c); config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - - } - /* config_print(stdout, c); */ - /* config_free(c); */ - } void simple_decoder_tagger(context *ctx) -/* (FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose, int stream_mode)*/ { - - /*conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, ctx->stream_mode);*/ - ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); if(ctx->stream_mode) diff --git a/maca_trans_parser/src/word.c b/maca_trans_parser/src/word.c index d2a28f71b0d2eca8ec7d6196f7d1c5e3b8483783..d163f133dcd353c3b86ca150a3baff3753064cc9 100644 --- a/maca_trans_parser/src/word.c +++ b/maca_trans_parser/src/word.c @@ -31,6 +31,7 @@ word *word_read(FILE *f, mcd *mcd_struct) while(fgets(buffer, 10000, f)){ if(feof(f)) return NULL; /* no more words to read */ if((buffer[0] != '\n') && (buffer[0] != ' ')){ + /* printf("word = %s\n", buffer); */ return word_parse_buffer(buffer, mcd_struct); } }