From 64d47ec0a02987ca57d04a3e13318234777ef82e Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Thu, 14 Jul 2016 10:04:03 -0400 Subject: [PATCH] changed option names --- maca_common/include/mcd.h | 6 +- maca_common/src/form2pos.c | 5 +- maca_common/src/mcd.c | 65 +++++- maca_lemmatizer/src/context.c | 10 +- maca_trans_parser/src/cff_cutoff.c | 1 - maca_trans_parser/src/context.c | 193 ++++++------------ maca_trans_parser/src/context.h | 15 +- maca_trans_parser/src/decode.c | 72 ++----- maca_trans_parser/src/decode_tagger.c | 45 ++-- maca_trans_parser/src/feat_model.c | 8 +- maca_trans_parser/src/feat_model.h | 2 +- maca_trans_parser/src/feature_table.c | 6 +- maca_trans_parser/src/feature_table.h | 2 +- .../src/maca_trans_parser_conll2cff.c | 20 +- .../src/maca_trans_parser_conll2cff_tagger.c | 30 ++- .../src/maca_trans_parser_conll2fann.c | 48 ++--- maca_trans_parser/src/simple_decoder.c | 81 ++++---- maca_trans_parser/src/simple_decoder.h | 3 +- maca_trans_parser/src/simple_decoder_tagger.c | 8 +- maca_trans_parser/src/train_perceptron.c | 14 +- 20 files changed, 284 insertions(+), 350 deletions(-) diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 07f29af..f86fcc4 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -33,8 +33,10 @@ typedef struct { } mcd; mcd *mcd_build_conll07(void); -mcd *mcd_read(char *mcd_filename); -void mcd_link_to_dico(mcd *m, dico_vec *vocabs); +mcd *mcd_build_ifpls(void); + +mcd *mcd_read(char *mcd_filename, int verbose); +void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose); void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename); void mcd_free(mcd *m); int mcd_get_code(mcd *m, char *str, int col); diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c index 1a98ad3..ab86d12 100644 --- a/maca_common/src/form2pos.c +++ b/maca_common/src/form2pos.c @@ -65,7 +65,10 @@ form2pos *form2pos_read(char *filename) int form2pos_get_signature(form2pos *f2p, char *form) { - return hash_get_val(f2p->h_form2signature, form); + /* if(form == NULL) + return -1; + else*/ + return hash_get_val(f2p->h_form2signature, form); } int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos) diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 7806270..39f22b6 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -110,7 +110,7 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename) /* takes as argument an mcd structure (m) and a dictionary vector (vocabs) */ /* links the vocabularies of m to vocabularies of vocabs (based on their names) */ -void mcd_link_to_dico(mcd *m, dico_vec *vocabs) +void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose) { int column; for(column=0; column < m->nb_col; column++){ @@ -118,14 +118,14 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs) && (!strcmp(m->filename[column], "_")) && (m->dico_array[column] == NULL)){ m->dico_array[column] = dico_vec_get_dico(vocabs, m->type_str[column]); - fprintf(stderr, "linking to dico %s\n", m->type_str[column]); + if(verbose) fprintf(stderr, "linking to dico %s\n", m->type_str[column]); } } } /* read an multi column description file and produces an mcd structure */ -mcd *mcd_read(char *mcd_filename) +mcd *mcd_read(char *mcd_filename, int verbose) { int column; char type[100]; @@ -148,7 +148,7 @@ mcd *mcd_read(char *mcd_filename) /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ continue; } - fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); + if(verbose) fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); m->type[column] = feat_type_string2int(type); m->type_str[column] = strdup(type); if(m->type[column] == -1){ @@ -170,11 +170,11 @@ mcd *mcd_read(char *mcd_filename) if(strcmp(m->filename[column], "_")){ if(m->representation[column] == MCD_REPRESENTATION_EMB){ - fprintf(stderr, "loading word embedding %s\n", m->filename[column]); + if(verbose) fprintf(stderr, "loading word embedding %s\n", m->filename[column]); m->word_emb_array[column] = word_emb_load(m->filename[column]); } else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){ - fprintf(stderr, "loading dico %s\n", m->filename[column]); + if(verbose) fprintf(stderr, "loading dico %s\n", m->filename[column]); m->dico_array[column] = dico_read(m->filename[column], 0.5); } } @@ -190,46 +190,97 @@ mcd *mcd_build_conll07(void) m->type[0]=FEAT_TYPE_INDEX; m->type_str[0]=strdup("INDEX"); m->representation[0]= MCD_REPRESENTATION_INT; + m->filename[0] = strdup("_"); m->type2col[FEAT_TYPE_INDEX] = 0; - + m->type[1]=FEAT_TYPE_FORM; m->type_str[1]=strdup("FORM"); m->representation[1]= MCD_REPRESENTATION_VOCAB; + m->filename[1] = strdup("_"); m->type2col[FEAT_TYPE_FORM] = 1; m->type[2]=FEAT_TYPE_LEMMA; m->type_str[2]=strdup("LEMMA"); m->representation[2]= MCD_REPRESENTATION_VOCAB; + m->filename[2] = strdup("_"); m->type2col[FEAT_TYPE_LEMMA] = 2; m->type[3]=FEAT_TYPE_CPOS; m->type_str[3]=strdup("CPOS"); m->representation[3]= MCD_REPRESENTATION_VOCAB; + m->filename[3] = strdup("_"); m->type2col[FEAT_TYPE_CPOS] = 3; m->type[4]=FEAT_TYPE_POS; m->type_str[4]=strdup("POS"); m->representation[4]= MCD_REPRESENTATION_VOCAB; + m->filename[4] = strdup("_"); m->type2col[FEAT_TYPE_POS] = 4; m->type[5]=FEAT_TYPE_FEATS; m->type_str[5]=strdup("FEATS"); m->representation[5]= MCD_REPRESENTATION_VOCAB; + m->filename[5] = strdup("_"); m->type2col[FEAT_TYPE_FEATS] = 5; m->type[6]=FEAT_TYPE_GOV; m->type_str[6]=strdup("GOV"); m->representation[6]= MCD_REPRESENTATION_INT; + m->filename[6] = strdup("_"); m->type2col[FEAT_TYPE_GOV] = 6; m->type[7]=FEAT_TYPE_LABEL; m->type_str[7]=strdup("LABEL"); m->representation[7]= MCD_REPRESENTATION_VOCAB; + m->filename[7] = strdup("_"); m->type2col[FEAT_TYPE_LABEL] = 7; return m; } +mcd *mcd_build_ifpls(void) +{ + mcd *m = mcd_new(6); + + m->type[0]=FEAT_TYPE_INDEX; + m->type_str[0]=strdup("INDEX"); + m->representation[0]= MCD_REPRESENTATION_INT; + m->filename[0] = strdup("_"); + m->type2col[FEAT_TYPE_INDEX] = 0; + + m->type[1]=FEAT_TYPE_FORM; + m->type_str[1]=strdup("FORM"); + m->representation[1]= MCD_REPRESENTATION_VOCAB; + m->filename[1] = strdup("_"); + m->type2col[FEAT_TYPE_FORM] = 1; + + m->type[2]=FEAT_TYPE_POS; + m->type_str[2]=strdup("POS"); + m->representation[2]= MCD_REPRESENTATION_VOCAB; + m->filename[2] = strdup("_"); + m->type2col[FEAT_TYPE_POS] = 2; + + m->type[3]=FEAT_TYPE_LEMMA; + m->type_str[3]=strdup("LEMMA"); + m->representation[3]= MCD_REPRESENTATION_VOCAB; + m->filename[3] = strdup("_"); + m->type2col[FEAT_TYPE_LEMMA] = 3; + + m->type[4]=FEAT_TYPE_GOV; + m->type_str[4]=strdup("GOV"); + m->representation[4]= MCD_REPRESENTATION_INT; + m->filename[4] = strdup("_"); + m->type2col[FEAT_TYPE_GOV] = 4; + + m->type[5]=FEAT_TYPE_LABEL; + m->type_str[5]=strdup("LABEL"); + m->representation[5]= MCD_REPRESENTATION_VOCAB; + m->filename[5] = strdup("_"); + m->type2col[FEAT_TYPE_LABEL] = 5; + + return m; +} + mcd *mcd_read_old(char *mcd_filename, char *corpus_filename, dico_vec *vocabs) { int column; diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c index 1bc694b..220a1c6 100644 --- a/maca_lemmatizer/src/context.c +++ b/maca_lemmatizer/src/context.c @@ -33,8 +33,8 @@ context *context_new(void) ctx->mcd_struct = NULL; ctx->language = strdup("fr"); ctx->maca_data_path = NULL; - ctx->form_column = -1; - ctx->pos_column = -1; + ctx->form_column = 0; + ctx->pos_column = 1; return ctx; } @@ -123,7 +123,6 @@ context *context_read_options(int argc, char *argv[]) break; case 'm': ctx->mcd_filename = strdup(optarg); - ctx->mcd_struct = mcd_read(ctx->mcd_filename); break; case 'C': ctx->language = strdup(optarg); @@ -136,6 +135,11 @@ context *context_read_options(int argc, char *argv[]) context_set_linguistic_resources_filenames(ctx); + + if(ctx->mcd_filename) + ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); + + if((ctx->mcd_filename == NULL) && ((ctx->form_column == -1) || (ctx->pos_column == -1))) ctx->mcd_struct = mcd_build_conll07(); diff --git a/maca_trans_parser/src/cff_cutoff.c b/maca_trans_parser/src/cff_cutoff.c index 6ea94b8..818cd26 100644 --- a/maca_trans_parser/src/cff_cutoff.c +++ b/maca_trans_parser/src/cff_cutoff.c @@ -18,7 +18,6 @@ void cff_cutoff_help_message(context *ctx) context_cutoff_help_message(ctx); context_cff_help_message(ctx); fprintf(stderr, "INPUT/OUTPUT\n"); - context_alphabet_help_message(ctx); } void cff_cutoff_check_options(context *ctx) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 7ab956f..6f7f226 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -10,11 +10,9 @@ void context_free(context *ctx) { if(ctx->program_name) free(ctx->program_name); - if(ctx->conll_filename) free(ctx->conll_filename); + if(ctx->input_filename) free(ctx->input_filename); if(ctx->perc_model_filename) free(ctx->perc_model_filename); if(ctx->dnn_model_filename) free(ctx->dnn_model_filename); - if(ctx->dico_features_filename) free(ctx->dico_features_filename); - if(ctx->dico_classes_filename) free(ctx->dico_classes_filename); if(ctx->cff_filename) free(ctx->cff_filename); if(ctx->fann_filename) free(ctx->fann_filename); if(ctx->mcd_filename) free(ctx->mcd_filename); @@ -46,11 +44,9 @@ context *context_new(void) ctx->verbose = 0; ctx->program_name = NULL; - ctx->conll_filename = NULL; + ctx->input_filename = NULL; ctx->perc_model_filename = NULL; ctx->dnn_model_filename = NULL; - ctx->dico_features_filename = NULL; - ctx->dico_classes_filename = NULL; ctx->cff_filename = NULL; ctx->fann_filename = NULL; ctx->stag_desc_filename = NULL; @@ -82,8 +78,8 @@ context *context_new(void) ctx->hidden_neurons_nb = 100; ctx->stream_mode = 0; - ctx->form_column = -1; - + ctx->conll = 0; + ctx->ifpls = 1; return ctx; } @@ -92,84 +88,66 @@ void context_general_help_message(context *ctx) { fprintf(stderr, "usage: %s [options]\n", ctx->program_name); fprintf(stderr, "Options:\n"); - fprintf(stderr, "\t-h --help : print this message\n"); - fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); - fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); + fprintf(stderr, "\t-h --help : print this message\n"); + fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); + fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); + fprintf(stderr, "\t-D --maca_data_path <str> : path to the maca_data directory\n"); + fprintf(stderr, "\t-L --language <str> : identifier of the language to use (default is fr)\n"); } void context_model_help_message(context *ctx){ - fprintf(stderr, "\t-m --model <file> : model file name\n"); + fprintf(stderr, "\t-m --model <file> : model file name\n"); +} +void context_input_help_message(context *ctx){ + fprintf(stderr, "\t-i --input <file> : input file name\n"); } void context_iterations_help_message(context *ctx){ - fprintf(stderr, "\t-n --iter <int> : number of iterations (default is 4)\n"); + fprintf(stderr, "\t-n --iter <int> : number of iterations (default is 4)\n"); } void context_cff_help_message(context *ctx){ - fprintf(stderr, "\t-x --cff <file> : CFF format file name\n"); + fprintf(stderr, "\t-x --cff <file> : CFF format file name\n"); } void context_fann_help_message(context *ctx){ - fprintf(stderr, "\t-y --fann <file> : FANN format file name\n"); -} -void context_d_features_help_message(context *ctx){ - fprintf(stderr, "\t-f --df <file> : features dictionnary file name\n"); -} -void context_d_classes_help_message(context *ctx){ - fprintf(stderr, "\t-c --dc <file> : classes dictionnary file name\n"); + fprintf(stderr, "\t-f --fann <file> : FANN format file name\n"); } void context_conll_help_message(context *ctx){ - fprintf(stderr, "\t-i --conll <file> : conll file name\n"); + fprintf(stderr, "\t-c --conll : input is in conll07 format\n"); } void context_cutoff_help_message(context *ctx){ - fprintf(stderr, "\t-u --cutoff <int> : cutoff value\n"); + fprintf(stderr, "\t-u --cutoff <int> : cutoff value\n"); } void context_mode_help_message(context *ctx){ - fprintf(stderr, "\t-o --mode TEST|TRAIN\n"); + fprintf(stderr, "\t-M --mode : TEST|TRAIN\n"); } void context_beam_help_message(context *ctx){ - fprintf(stderr, "\t-b --beam <int> : beam width (default is 1)\n"); + fprintf(stderr, "\t-b --beam <int> : beam width (default is 1)\n"); } void context_sent_nb_help_message(context *ctx){ - fprintf(stderr, "\t-s --sent_nb <int> : number of sentences to process (default is 1000000)\n"); -} -void context_alphabet_help_message(context *ctx){ - fprintf(stderr, "\t-a --alphabet <file> : name of the file containing the different dictionaries\n"); -} -void context_dnn_model_help_message(context *ctx){ - fprintf(stderr, "\t-M --dnn_model <file> : FANN model file\n"); -} -void context_hidden_neurons_nb_help_message(context *ctx){ - fprintf(stderr, "\t-H --hidden_neurons_nb <int> : number of neurons in the hidden layer (default is 100)\n"); -} -void context_stag_desc_filename_help_message(context *ctx){ - fprintf(stderr, "\t-S --stag_file <file> : name of the file containing the stag description\n"); + fprintf(stderr, "\t-s --sent_nb <int> : number of sentences to process (default is 1000000)\n"); } void context_mcd_help_message(context *ctx){ - fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n"); + fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n"); } void context_features_model_help_message(context *ctx){ - fprintf(stderr, "\t-F --feat_model <file> : feature model file name\n"); + fprintf(stderr, "\t-F --feat_model <file> : feature model file name\n"); } void context_stream_help_message(context *ctx){ - fprintf(stderr, "\t-T --stream (0|1) : steam mode\n"); + fprintf(stderr, "\t-S --stream : steam mode\n"); } - void context_vocabs_help_message(context *ctx){ - fprintf(stderr, "\t-V --vocabs : vocabularies file\n"); + fprintf(stderr, "\t-V --vocabs <file> : vocabularies file\n"); } - void context_language_help_message(context *ctx){ - fprintf(stderr, "\t-X --language : identifier of the language to use\n"); + fprintf(stderr, "\t-L --language : identifier of the language to use\n"); } - void context_maca_data_path_help_message(context *ctx){ - fprintf(stderr, "\t-Y --maca_data_path : path to the maca_data directory\n"); + fprintf(stderr, "\t-D --maca_data_path : path to maca_data directory\n"); } - void context_root_label_help_message(context *ctx){ - fprintf(stderr, "\t-R --root_label : name of the root label (default is \"root\")\n"); + fprintf(stderr, "\t-R --root_label : name of the root label (default is \"root\")\n"); } - void context_f2p_filename_help_message(context *ctx){ - fprintf(stderr, "\t-P --f2p : form to pos (f2p) filename\n"); + fprintf(stderr, "\t-P --f2p <file> : form to pos (f2p) filename\n"); } context *context_read_options(int argc, char *argv[]) @@ -180,75 +158,58 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[28] = + static struct option long_options[21] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, - {"debug", no_argument, 0, 'd'}, + {"conll", no_argument, 0, 'c'}, + {"stream", no_argument, 0, 'S'}, {"model", required_argument, 0, 'm'}, - {"df", required_argument, 0, 'f'}, - {"dc", required_argument, 0, 'c'}, - {"conll", required_argument, 0, 'i'}, + {"input", required_argument, 0, 'i'}, {"iter", required_argument, 0, 'n'}, {"cff", required_argument, 0, 'x'}, {"cutoff", required_argument, 0, 'u'}, {"hratio", required_argument, 0, 'r'}, - {"mode", required_argument, 0, 'o'}, + {"mode", required_argument, 0, 'M'}, {"beam", required_argument, 0, 'b'}, - {"fann", required_argument, 0, 'y'}, + {"fann", required_argument, 0, 'f'}, {"sent_nb", required_argument, 0, 's'}, - /* {"alphabet", required_argument, 0, 'a'}, */ - {"dnn_model", required_argument, 0, 'M'}, - {"hidden_neurons_nb", required_argument, 0, 'H'}, - {"stag_file", required_argument, 0, 'S'}, {"mcd", required_argument, 0, 'C'}, {"feat_model", required_argument, 0, 'F'}, {"vocabs", required_argument, 0, 'V'}, - {"stream", required_argument, 0, 'T'}, - {"language", required_argument, 0, 'X'}, - {"maca_data_path", required_argument, 0, 'Y'}, + {"language", required_argument, 0, 'L'}, + {"maca_data_path", required_argument, 0, 'D'}, {"root_label", required_argument, 0, 'R'}, - {"form_col", required_argument, 0, 'O'}, {"f2p", required_argument, 0, 'P'} }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:R:O:P:", long_options, &option_index)) != -1){ + + while ((c = getopt_long (argc, argv, "hvcSm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:", long_options, &option_index)) != -1){ switch (c) { - case 'd': - ctx->debug_mode = 1; - break; case 'h': ctx->help = 1; break; case 'v': ctx->verbose = 1; break; - case 'T': - ctx->stream_mode = atoi(optarg); - break; - case 'y': - ctx->fann_filename = strdup(optarg); - break; - case 'n': - ctx->iteration_nb = atoi(optarg); + case 'c': + ctx->conll = 1; break; - case 'i': - ctx->conll_filename = strdup(optarg); + case 'S': + ctx->stream_mode = 1; break; + case 'm': ctx->perc_model_filename = strdup(optarg); break; - case 'M': - ctx->dnn_model_filename = strdup(optarg); - break; - case 'f': - ctx->dico_features_filename = strdup(optarg); + case 'i': + ctx->input_filename = strdup(optarg); break; - case 'c': - ctx->dico_classes_filename = strdup(optarg); + case 'n': + ctx->iteration_nb = atoi(optarg); break; case 'x': ctx->cff_filename = strdup(optarg); @@ -256,27 +217,23 @@ context *context_read_options(int argc, char *argv[]) case 'u': ctx->feature_cutoff = atoi(optarg); break; - case 'b': - ctx->beam_width = atoi(optarg); - break; case 'r': ctx->hash_ratio = atof(optarg); break; - case 'o': + case 'M': ctx->mode = (!strcmp(optarg, "TEST"))? TEST_MODE : TRAIN_MODE; break; - case 's': - ctx->sent_nb = atoi(optarg); + case 'b': + ctx->beam_width = atoi(optarg); break; - case 'H': - ctx->hidden_neurons_nb = atoi(optarg); + case 'f': + ctx->fann_filename = strdup(optarg); break; - case 'S': - ctx->stag_desc_filename = strdup(optarg); + case 's': + ctx->sent_nb = atoi(optarg); break; case 'C': ctx->mcd_filename = strdup(optarg); - ctx->mcd_struct = mcd_read(ctx->mcd_filename); break; case 'F': ctx->features_model_filename = strdup(optarg); @@ -284,18 +241,15 @@ context *context_read_options(int argc, char *argv[]) case 'V': ctx->vocabs_filename = strdup(optarg); break; - case 'X': + case 'L': ctx->language = strdup(optarg); break; - case 'Y': + case 'D': ctx->maca_data_path = strdup(optarg); break; case 'R': ctx->root_label = strdup(optarg); break; - case 'O': - ctx->form_column = atoi(optarg); - break; case 'P': ctx->f2p_filename = strdup(optarg); ctx->f2p = form2pos_read(ctx->f2p_filename); @@ -303,33 +257,14 @@ context *context_read_options(int argc, char *argv[]) } } - - - /* if(ctx->mcd_filename && ctx->conll_filename){ - ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->conll_filename); - ctx->mvt_nb = ctx->mcd_struct->dico_array[ctx->mcd_struct->type2col[FEAT_TYPE_LABEL]]->nbelem * 2 + 1; - }*/ + if(ctx->mcd_filename) + ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); + else + if(ctx->conll) + ctx->mcd_struct = mcd_build_conll07(); + else + ctx->mcd_struct = mcd_build_ifpls(); - /* - if(ctx->features_model && ctx->mcd_struct) - feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); - */ - - /* if the form column has been set by user, change it in the mcd file */ - /* if(ctx->form_column != -1){ - ctx->mcd_struct = mcd_new(ctx->form_column + 1); - mcd_set_form_col(ctx->mcd_struct, ctx->form_column); - ctx->mcd_struct->representation[ctx->form_column] = MCD_REPRESENTATION_VOCAB; - ctx->mcd_struct->filename[ctx->form_column] = strdup("_"); - ctx->mcd_struct->dico_array[ctx->form_column] = NULL; - ctx->mcd_struct->type_str[ctx->form_column] = strdup("FORM"); - - }*/ - - if(ctx->mcd_struct == NULL){ - ctx->mcd_struct = mcd_build_conll07(); - } - return ctx; } diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 2b8bdb0..1d1d1bd 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -25,11 +25,9 @@ typedef struct { int help; char *program_name; - char *conll_filename; + char *input_filename; char *perc_model_filename; char *dnn_model_filename; - char *dico_features_filename; - char *dico_classes_filename; char *cff_filename; char *fann_filename; char *stag_desc_filename; @@ -57,8 +55,9 @@ typedef struct { char *maca_data_path; char *language; char *root_label; - int form_column; form2pos *f2p; + int conll; + int ifpls; } context; context *context_new(void); @@ -76,21 +75,21 @@ void context_cutoff_help_message(context *ctx); void context_mode_help_message(context *ctx); void context_beam_help_message(context *ctx); void context_sent_nb_help_message(context *ctx); -void context_alphabet_help_message(context *ctx); void context_dnn_model_help_message(context *ctx); void context_hidden_neurons_nb_help_message(context *ctx); void context_stag_desc_filename_help_message(context *ctx); +void context_input_filename_help_message(context *ctx); void context_mcd_help_message(context *ctx); void context_features_model_help_message(context *ctx); void context_vocabs_help_message(context *ctx); -void context_load_alphabets(context *ctx); -void context_print_alphabets(context *ctx); - void context_language_help_message(context *ctx); void context_maca_data_path_help_message(context *ctx); void context_f2p_filename_help_message(context *ctx); +void context_conll_help_message(context *ctx); +void context_ifpls_help_message(context *ctx); +void context_input_help_message(context *ctx); #endif diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c index 299a640..225aab9 100644 --- a/maca_trans_parser/src/decode.c +++ b/maca_trans_parser/src/decode.c @@ -18,15 +18,13 @@ void decode_help_message(context *ctx) { context_general_help_message(ctx); context_beam_help_message(ctx); - fprintf(stderr, "INPUT\n"); context_conll_help_message(ctx); + fprintf(stderr, "INPUT\n"); + context_input_help_message(ctx); context_mcd_help_message(ctx); context_model_help_message(ctx); context_vocabs_help_message(ctx); context_features_model_help_message(ctx); - context_language_help_message(ctx); - context_maca_data_path_help_message(ctx); - } void decode_check_options(context *ctx){ @@ -83,29 +81,25 @@ void set_linguistic_resources_filenames_parser(context *ctx) ctx->features_model_filename = strdup(absolute_filename); } - fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); - fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); - fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); - fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + } } - int main(int argc, char *argv[]) { - FILE *conll_file = NULL; context *ctx; - feature_table *ft; - /* struct fann *ann; */ - int root_label; ctx = context_read_options(argc, argv); decode_check_options(ctx); set_linguistic_resources_filenames_parser(ctx); - ctx->features_model = feat_model_read(ctx->features_model_filename); - + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); @@ -115,50 +109,18 @@ int main(int argc, char *argv[]) } ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; - root_label = dico_string2int(ctx->dico_labels, ctx->root_label); - - if(root_label == -1) root_label = 0; - - /* when in stream mode, force to renumber the tokens (ugly !) */ - if(ctx->stream_mode){ - ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; - } - /* load models */ - if(ctx->perc_model_filename){ - /* ctx->d_perceptron_features = dico_read(ctx->perceptron_features_filename, ctx->hash_ratio); */ - ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); - ft = feature_table_load(ctx->perc_model_filename); - /* hash_stats(dico_features->htable); */ - } - - /* else if(ctx->dnn_model_filename){ - ann = fann_create_from_file(ctx->dnn_model_filename); - if(!ann){ - fprintf(stderr, "Error creating ann --- ABORTING.\n"); - return -1; - } - } - else{*/ + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); - if(ctx->conll_filename) - conll_file= myfopen(ctx->conll_filename, "r"); - else - conll_file = stdin; - - if(ctx->perc_model_filename){ - if(ctx->beam_width == 1){ - simple_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, ctx->dico_labels, ft, ctx->features_model, ctx->verbose, root_label, ctx->stream_mode); - } - else - beam_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, ctx->dico_labels, ft, ctx->features_model, ctx->verbose, root_label, ctx->beam_width, ctx->mvt_nb); - } + if(ctx->beam_width == 1){ + simple_decoder(ctx); + } + /* else + beam_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, ctx->dico_labels, ft, ctx->features_model, ctx->verbose, root_label, ctx->beam_width, ctx->mvt_nb); + }*/ - /* else if(ctx->dnn_model_filename){ - dnn_decoder(conll_file, ctx->mcd_struct, ann, ctx->features_model, ctx->verbose, root_label, ctx->stream_mode); - }*/ context_free(ctx); return 0; } diff --git a/maca_trans_parser/src/decode_tagger.c b/maca_trans_parser/src/decode_tagger.c index 99112ba..db91da6 100644 --- a/maca_trans_parser/src/decode_tagger.c +++ b/maca_trans_parser/src/decode_tagger.c @@ -13,23 +13,21 @@ /*#include"dnn_decoder.h"*/ #include"config2feat_vec.h" -void decode_help_message(context *ctx) +void decode_tagger_help_message(context *ctx) { context_general_help_message(ctx); context_beam_help_message(ctx); - fprintf(stderr, "INPUT\n"); context_conll_help_message(ctx); + fprintf(stderr, "INPUT\n"); + context_input_help_message(ctx); context_mcd_help_message(ctx); context_model_help_message(ctx); context_vocabs_help_message(ctx); context_features_model_help_message(ctx); - context_features_model_help_message(ctx); - context_language_help_message(ctx); - context_maca_data_path_help_message(ctx); context_f2p_filename_help_message(ctx); } -void decode_check_options(context *ctx){ +void decode_tagger_check_options(context *ctx){ if(ctx->help /*!ctx->conll_filename*/ /* || !ctx->perc_model_filename @@ -37,13 +35,12 @@ void decode_check_options(context *ctx){ || !ctx->vocabs_filename || !ctx->features_model_filename*/ ){ - decode_help_message(ctx); + decode_tagger_help_message(ctx); exit(1); } } - -void set_linguistic_resources_filenames_tagger(context *ctx) +void decode_tagger_set_linguistic_resources_filenames(context *ctx) { char absolute_path[500]; char absolute_filename[500]; @@ -91,28 +88,28 @@ void set_linguistic_resources_filenames_tagger(context *ctx) ctx->f2p = form2pos_read(ctx->f2p_filename); } - fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); - fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); - fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); - fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); - fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); - + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); + } } int main(int argc, char *argv[]) { - FILE *conll_file = NULL; - context *ctx; - - ctx = context_read_options(argc, argv); - decode_check_options(ctx); + context *ctx = context_read_options(argc, argv); + decode_tagger_check_options(ctx); - set_linguistic_resources_filenames_tagger(ctx); - ctx->features_model = feat_model_read(ctx->features_model_filename); + decode_tagger_set_linguistic_resources_filenames(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); - + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + if(ctx->beam_width == 1) simple_decoder_tagger(ctx); diff --git a/maca_trans_parser/src/feat_model.c b/maca_trans_parser/src/feat_model.c index 486e306..267a662 100644 --- a/maca_trans_parser/src/feat_model.c +++ b/maca_trans_parser/src/feat_model.c @@ -17,7 +17,7 @@ void feat_model_free(feat_model *fm) free(fm); } -feat_model *feat_model_read(char *filename) +feat_model *feat_model_read(char *filename, int verbose) { FILE *f = myfopen(filename, "r"); feat_model *fm = feat_model_new(filename); @@ -31,16 +31,16 @@ feat_model *feat_model_read(char *filename) while(fgets(buffer, 1000, f)){ if(feof(f)) break; if((buffer[0] == '\n') || (buffer[0] == '#')) continue; - fprintf(stderr, "%d", feature_number + 1); + if(verbose) fprintf(stderr, "%d", feature_number + 1); fd = feat_desc_new(); feat_name = strtok(buffer, " \n"); do{ - fprintf(stderr, "\t%s", feat_name); + if(verbose) fprintf(stderr, "\t%s", feat_name); sfd = feat_lib_get_simple_feat_desc(fl, feat_name); if(sfd) feat_desc_add(fd, sfd); }while((feat_name = strtok(NULL, " \n"))); - fprintf(stderr, "\n"); + if(verbose) fprintf(stderr, "\n"); feat_model_add(fm, fd); feature_number++; } diff --git a/maca_trans_parser/src/feat_model.h b/maca_trans_parser/src/feat_model.h index ecd1831..fc7dc59 100644 --- a/maca_trans_parser/src/feat_model.h +++ b/maca_trans_parser/src/feat_model.h @@ -22,6 +22,6 @@ feat_model *feat_model_new(char *name); feat_desc *feat_model_add(feat_model *fm, feat_desc *fd); int feat_model_get_feat_value_fann(feat_model *fm, config *c, int feat_nb); int feat_model_get_feat_value_cff(feat_model *fm, config *c, dico *dico_features, int feat_nb, int mode); -feat_model *feat_model_read(char *filename); +feat_model *feat_model_read(char *filename, int verbose); void feat_model_compute_ranges(feat_model *fm, mcd *m, int mvt_nb); #endif diff --git a/maca_trans_parser/src/feature_table.c b/maca_trans_parser/src/feature_table.c index db8363a..7450eb1 100644 --- a/maca_trans_parser/src/feature_table.c +++ b/maca_trans_parser/src/feature_table.c @@ -5,7 +5,7 @@ #include"feature_table.h" #include"util.h" -feature_table *feature_table_load(char *filename) +feature_table *feature_table_load(char *filename, int verbose) { int i; feature_table *ft = NULL; @@ -17,9 +17,9 @@ feature_table *feature_table_load(char *filename) exit(1); } fread(&features_nb, sizeof(int), 1, f); - fprintf(stderr, "features_nb = %d\n", features_nb); + if(verbose)fprintf(stderr, "features_nb = %d\n", features_nb); fread(&classes_nb, sizeof(int), 1, f); - fprintf(stderr, "classes_nb = %d\n", classes_nb); + if(verbose)fprintf(stderr, "classes_nb = %d\n", classes_nb); ft = (feature_table *)memalloc(sizeof(feature_table)); ft->features_nb = features_nb; diff --git a/maca_trans_parser/src/feature_table.h b/maca_trans_parser/src/feature_table.h index 0aa1656..ff2ed6c 100644 --- a/maca_trans_parser/src/feature_table.h +++ b/maca_trans_parser/src/feature_table.h @@ -16,7 +16,7 @@ typedef struct { float score; } vcode; -feature_table *feature_table_load(char *filename); +feature_table *feature_table_load(char *filename, int verbose); void feature_table_dump(char *filename, feature_table *ft); feature_table *feature_table_new(int features_nb, int classes_nb); void feature_table_print(char *filename, feature_table *ft); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c index d42ad86..96533a1 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff.c @@ -22,18 +22,18 @@ void maca_trans_parser_conll2cff_help_message(context *ctx) fprintf(stderr, "INPUT\n"); context_conll_help_message(ctx); fprintf(stderr, "IN TEST MODE\n"); - context_alphabet_help_message(ctx); + context_vocabs_help_message(ctx); fprintf(stderr, "OUTPUT\n"); context_cff_help_message(ctx); fprintf(stderr, "IN TRAIN MODE\n"); - context_alphabet_help_message(ctx); + context_vocabs_help_message(ctx); } void maca_trans_parser_conll2cff_check_options(context *ctx) { - if(!ctx->conll_filename + if(!ctx->input_filename || ctx->help /* || !ctx->mcd_filename */ || !(ctx->cff_filename || ctx->fann_filename) @@ -53,8 +53,8 @@ void generate_training_file_stream(FILE *output_file, context *ctx) sentence *ref = NULL; int sentence_nb = 0; int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), ctx->root_label); - FILE *conll_file = myfopen(ctx->conll_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + FILE *conll_file = myfopen(ctx->input_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); c = config_initial(conll_file, ctx->mcd_struct, 10, 5); @@ -120,8 +120,8 @@ void generate_training_file_buffer(FILE *output_file, context *ctx) feat_vec *fv = feat_vec_new(feature_types_nb); sentence *ref = NULL; int sentence_nb = 0; - FILE *conll_file = myfopen(ctx->conll_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + FILE *conll_file = myfopen(ctx->input_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); c = config_initial(conll_file, ctx->mcd_struct, 1000, 0); @@ -171,16 +171,16 @@ int main(int argc, char *argv[]) maca_trans_parser_conll2cff_check_options(ctx); - ctx->features_model = feat_model_read(ctx->features_model_filename); + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); if(ctx->mode == TRAIN_MODE){ - mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); } else if(ctx->mode == TEST_MODE){ ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); } ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c index 85f0f4e..11d4760 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c @@ -4,7 +4,7 @@ #include<unistd.h> #include<getopt.h> #include"movement_tagger.h" -#include"oracle.h" +#include"oracle_tagger.h" #include"feat_fct.h" #include"context.h" #include"feat_vec.h" @@ -35,18 +35,19 @@ void maca_trans_parser_conll2cff_help_message(context *ctx) fprintf(stderr, "INPUT\n"); context_conll_help_message(ctx); fprintf(stderr, "IN TEST MODE\n"); - context_alphabet_help_message(ctx); + context_vocabs_help_message(ctx); fprintf(stderr, "OUTPUT\n"); context_cff_help_message(ctx); fprintf(stderr, "IN TRAIN MODE\n"); - context_alphabet_help_message(ctx); + context_vocabs_help_message(ctx); + } void maca_trans_parser_conll2cff_check_options(context *ctx) { - if(!ctx->conll_filename + if(!ctx->input_filename || ctx->help /* || !ctx->mcd_filename */ || !(ctx->cff_filename || ctx->fann_filename) @@ -59,15 +60,11 @@ void maca_trans_parser_conll2cff_check_options(context *ctx) void generate_training_file_stream(FILE *output_file, context *ctx) { config *c; - int mvt_code; - char mvt_type; - int mvt_label; feat_vec *fv = feat_vec_new(feature_types_nb); sentence *ref = NULL; int sentence_nb = 0; - int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), ctx->root_label); - FILE *conll_file = myfopen(ctx->conll_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + FILE *conll_file = myfopen(ctx->input_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); int postag; c = config_initial(conll_file, ctx->mcd_struct, 10, 5); @@ -91,14 +88,11 @@ void generate_training_file_stream(FILE *output_file, context *ctx) void generate_training_file_buffer(FILE *output_file, context *ctx) { config *c; - int mvt_code; - char mvt_type; - int mvt_label; feat_vec *fv = feat_vec_new(feature_types_nb); sentence *ref = NULL; int sentence_nb = 0; - FILE *conll_file = myfopen(ctx->conll_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + FILE *conll_file = myfopen(ctx->input_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); int postag; c = config_initial(conll_file, ctx->mcd_struct, 1000, 0); @@ -134,16 +128,16 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_trans_parser_conll2cff_check_options(ctx); - ctx->features_model = feat_model_read(ctx->features_model_filename); + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); if(ctx->mode == TRAIN_MODE){ - mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); } else if(ctx->mode == TEST_MODE){ ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); } feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2fann.c b/maca_trans_parser/src/maca_trans_parser_conll2fann.c index 73ad593..940a9ca 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2fann.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2fann.c @@ -39,7 +39,7 @@ void transform_treebank_help_message(context *ctx) void transform_treebank_check_options(context *ctx) { - if(!ctx->conll_filename + if(!ctx->input_filename || ctx->help /* || !ctx->mcd_filename */ || !(ctx->cff_filename || ctx->fann_filename) @@ -75,8 +75,8 @@ int generate_training_file_buffer(FILE *output_file, context *ctx) sentence *ref = NULL; int nb_trans = 0; int sentence_nb = 0; - FILE *conll_file = myfopen(ctx->conll_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + FILE *conll_file = myfopen(ctx->input_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); c = config_initial(conll_file, ctx->mcd_struct, 1000, 0); @@ -85,41 +85,25 @@ int generate_training_file_buffer(FILE *output_file, context *ctx) queue_read_sentence(c->bf, conll_file, ctx->mcd_struct); while(!config_is_terminal(c)){ /* config_print(stdout,c); */ - - if(ctx->fann_filename) - config2feat_vec_fann(ctx->features_model, c, fv, ctx->mode); - else /*if(ctx->cff_filename)*/ - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - + config2feat_vec_fann(ctx->features_model, c, fv, ctx->mode); mvt_code = oracle(c, ref); nb_trans++; - + + feat_vec_print_dnn(output_file, fv, ctx->features_model, ctx->mcd_struct); + print_mvt_fann(output_file, ctx->mvt_nb, mvt_code); + fprintf(output_file, "\n\n"); + mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); - - /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ - - if(ctx->cff_filename){ - fprintf(output_file, "%d", mvt_code); - feat_vec_print(output_file, fv); - } - else if(ctx->fann_filename){ - feat_vec_print_dnn(output_file, fv, ctx->features_model, ctx->mcd_struct); - print_mvt_fann(output_file, ctx->mvt_nb, mvt_code); - fprintf(output_file, "\n\n"); - } if(mvt_type == MVT_LEFT){ - /* printf("LEFT\n"); */ movement_left_arc(c, mvt_label, 0); continue; } if(mvt_type == MVT_RIGHT){ - /* printf("RIGHT\n"); */ movement_right_arc(c, mvt_label, 0); continue; } if(mvt_type == MVT_SHIFT){ - /* printf("SHIFT\n"); */ movement_shift(c, 0, 0); continue; } @@ -141,15 +125,15 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); transform_treebank_check_options(ctx); - ctx->features_model = feat_model_read(ctx->features_model_filename); + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); if(ctx->mode == TRAIN_MODE){ - mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); } else if(ctx->mode == TEST_MODE){ ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); } ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); @@ -176,16 +160,10 @@ int main(int argc, char *argv[]) rewind(output_file); fprintf(output_file, "%d", nb_trans); - fclose(output_file); - if(ctx->mode == TRAIN_MODE){ - /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ + if(ctx->mode == TRAIN_MODE) dico_vec_print(ctx->vocabs_filename, ctx->vocabs); - - } - - context_free(ctx); return 0; diff --git a/maca_trans_parser/src/simple_decoder.c b/maca_trans_parser/src/simple_decoder.c index fe39b22..c585a7f 100644 --- a/maca_trans_parser/src/simple_decoder.c +++ b/maca_trans_parser/src/simple_decoder.c @@ -11,34 +11,19 @@ #include"feature_table.h" #include"dico.h" -void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_labels, feature_table *ft, feat_model *fm, int verbose, int root_label); -void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_labels, feature_table *ft, feat_model *fm, int verbose, int root_label); - - -void simple_decoder(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *dico_labels, feature_table *ft, feat_model *fm, int verbose, int root_label, int stream_mode) -{ - if(stream_mode) - simple_decoder_stream(f, mcd_struct, d_perceptron_features, dico_labels, ft, fm, verbose, root_label); - else - simple_decoder_buffer(f, mcd_struct, d_perceptron_features, dico_labels, ft, fm, verbose, root_label); -} - -void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_labels, feature_table *ft, feat_model *fm, int verbose, int root_label) +void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_label) { - config *c; int mvt_code; int mvt_type; int mvt_label; - feat_vec *fv = feat_vec_new(feature_types_nb); float max; - - c = config_initial(f, mcd_struct, 1000, 0); - + feat_vec *fv = feat_vec_new(feature_types_nb); + config *c = config_initial(f, ctx->mcd_struct, 1000, 0); + /* read a sentence and put it in the buffer */ - while(queue_read_sentence(c->bf, f, mcd_struct)){ - + while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){ while(!config_is_terminal(c)){ - config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE); + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); mvt_code = feature_table_argmax(fv, ft, &max); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); @@ -57,28 +42,32 @@ void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico * /* config_print(stdout, c); */ config_connect_subtrees(c, root_label); - depset_print2(stdout, c->ds, dico_labels); + depset_print2(stdout, c->ds, ctx->dico_labels); /* config_free(c); */ - c = config_initial(f, mcd_struct, 1000, 0); + c = config_initial(f, ctx->mcd_struct, 1000, 0); } + + feat_vec_free(fv); } -void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_labels, feature_table *ft, feat_model *fm, int verbose, int root_label) +void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_label) { - config *c; int mvt_code; int mvt_type; int mvt_label; - feat_vec *fv = feat_vec_new(feature_types_nb); float max; + feat_vec *fv = feat_vec_new(feature_types_nb); + config *c = config_initial(f, ctx->mcd_struct, 10, 5); + + /* when in stream mode, force to renumber the tokens (ugly !) */ + ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; - c = config_initial(f, mcd_struct, 10, 5); while(!config_is_terminal(c)){ config_print(stdout, c); - config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE); - /* feat_vec_print_string(fv, dico_features); */ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + /* feat_vec_print_string(fv, ctx->d_perceptron_features); */ mvt_code = feature_table_argmax(fv, ft, &max); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); @@ -98,13 +87,13 @@ void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico * /* config_print(stdout, c); */ config_connect_subtrees(c, root_label); - /* depset_print_new_index(stdout, c->ds, dico_labels);*/ - depset_print2(stdout, c->ds, dico_labels); + /* depset_print_new_index(stdout, c->ds, ctx->dico_labels);*/ + depset_print2(stdout, c->ds, ctx->dico_labels); /* pop the dummy word */ stack_pop(c->st); /* remplace it with a fresh one */ - stack_push(c->st, word_create_dummy(mcd_struct)); + stack_push(c->st, word_create_dummy(ctx->mcd_struct)); /* empty depset */ depset_free(c->ds); @@ -125,11 +114,33 @@ void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico * } /* config_print(stdout, c); */ - + config_connect_subtrees(c, root_label); - depset_print_new_index(stdout, c->ds, dico_labels); - + depset_print_new_index(stdout, c->ds, ctx->dico_labels); + /* config_free(c); */ + feat_vec_free(fv); +} + + +void simple_decoder(context *ctx) +{ + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int root_label; + + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); + if(root_label == -1) root_label = 0; + + if(ctx->stream_mode) + simple_decoder_stream(ctx, f, ft, root_label); + else + simple_decoder_buffer(ctx, f, ft, root_label); + + feature_table_free(ft); + if(ctx->input_filename) + fclose(f); } + diff --git a/maca_trans_parser/src/simple_decoder.h b/maca_trans_parser/src/simple_decoder.h index 81a4846..5617845 100644 --- a/maca_trans_parser/src/simple_decoder.h +++ b/maca_trans_parser/src/simple_decoder.h @@ -1,6 +1,7 @@ #ifndef __SIMPLE_DECODER__ #define __SIMPLE_DECODER__ +#include"context.h" -void simple_decoder(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *d_labels, feature_table *ft, feat_model *fm, int verbose, int root_label, int stream_mode); +void simple_decoder(context *ctx); #endif diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c index d312fdd..6066fc3 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -24,13 +24,13 @@ void add_signature_to_words_in_queue(queue *bf, form2pos *f2p) void simple_decoder_buffer(context *ctx) { dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); - feature_table *ft = feature_table_load(ctx->perc_model_filename); + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); int postag; feat_vec *fv = feat_vec_new(feature_types_nb); float max; int i; word *w = NULL; - FILE *f = (ctx->conll_filename)? myfopen(ctx->conll_filename, "r") : stdin; + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; config *c = config_initial(f, ctx->mcd_struct, 1000, 0); /* read a sentence and put it in the buffer */ @@ -56,7 +56,7 @@ void simple_decoder_buffer(context *ctx) /* config_free(c); */ c = config_initial(f, ctx->mcd_struct, 1000, 0); } - if(ctx->conll_filename) + if(ctx->input_filename) fclose(f); } @@ -82,8 +82,6 @@ void simple_decoder_stream(context *ctx) void simple_decoder_tagger(context *ctx) { - ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); - if(ctx->stream_mode) simple_decoder_stream(ctx); else diff --git a/maca_trans_parser/src/train_perceptron.c b/maca_trans_parser/src/train_perceptron.c index 33da5eb..b4433a4 100644 --- a/maca_trans_parser/src/train_perceptron.c +++ b/maca_trans_parser/src/train_perceptron.c @@ -40,7 +40,7 @@ void train_perceptron_help_message(context *ctx) void train_perceptron_check_options(context *ctx) { - if(!ctx->conll_filename + if(!ctx->input_filename || ctx->help /* || !ctx->mcd_filename */ || !ctx->features_model_filename @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); train_perceptron_check_options(ctx); - mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); @@ -109,8 +109,8 @@ feature_table *train_perceptron(context *ctx) for(epoch = 0; epoch < ctx->iteration_nb; epoch++){ fprintf(stderr, "[%d]", epoch + 1); - conll_file = myfopen(ctx->conll_filename, "r"); - conll_file_ref = myfopen(ctx->conll_filename, "r"); + conll_file = myfopen(ctx->input_filename, "r"); + conll_file_ref = myfopen(ctx->input_filename, "r"); config_oracle = config_initial(conll_file, ctx->mcd_struct, 1000, 0); /* config_pred = config_initial(conll_file, ctx->mcd_struct, 1000, 0); */ @@ -252,9 +252,9 @@ feature_table *train_perceptron_early_update(context *ctx) for(epoch = 0; epoch < ctx->iteration_nb; epoch++){ fprintf(stderr, "[%d]", epoch + 1); - conll_file = myfopen(ctx->conll_filename, "r"); - conll_file2 = myfopen(ctx->conll_filename, "r"); - conll_file_ref = myfopen(ctx->conll_filename, "r"); + conll_file = myfopen(ctx->input_filename, "r"); + conll_file2 = myfopen(ctx->input_filename, "r"); + conll_file_ref = myfopen(ctx->input_filename, "r"); config_oracle = config_initial(conll_file, ctx->mcd_struct, 1000, 0); config_pred = config_initial(conll_file2, ctx->mcd_struct, 1000, 0); -- GitLab