diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index f86fcc4d6214ee49023d8b34146282ec0f4ba72d..1186de181b049137a32d620412d79ef2ddf92e90 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -20,16 +20,29 @@ #define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM] #define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v) +/* mcd (multi column description) files describe the format of corpus files */ +/* every line of an mcd file describes the content of one column of the corpus file */ +/* every line contains four fields separated by a space character */ +/* first field is the index of the column described (first column corresponds to index zero) */ +/* second field is the name of the column. The name must be taken from the following list: */ +/* INDEX, FORM, LEMMA, CPOS, POS, FEAT, LABEL, STAG, INT, GOV, A ... Z */ +/* third field corresponds to the internal representation of the tokens found in the column described. Four values are possible: */ +/* VOCAB if the internal representation is an integer code corresponding to the token */ +/* INT if the token is already an integer and its corresponding internal value is the same integer */ +/* EMB if the internal representation of the token is a real valued vector. */ +/* _ if no internal representation is associated with the field */ +/* fourth field is the name of a file in which the encoding is represented; this file can either be a dico (see dico.h) format file or an embedding file (see word_emb.h) */ + typedef struct { - int nb_col; - int type2col[FEAT_TYPE_NB]; + int nb_col; /* number of columns in the mcd file */ + int type2col[FEAT_TYPE_NB]; /* column in which each feature type (the form (FEAT_TYPE_FORM), the lemma ...) is 
represented */ /* int *col2type; */ - int *type; - char **type_str; - int *representation; - char **filename; - dico **dico_array; - word_emb **word_emb_array; + int *type; /* array containing the type of every column */ + char **type_str; /* string version of the type array */ + int *representation; /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */ + char **filename; /* array containing, for each column, the file in which its different values are represented */ + dico **dico_array; /* array containing the dico corresponding to each column (NULL if no file) */ + word_emb **word_emb_array; /* array containing the word embedding structure corresponding to each column (NULL if no file) */ } mcd; mcd *mcd_build_conll07(void); diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 39f22b699e3b9eb36971a6efcc1795929be038a5..a6385d011f25ad46156e20ce415876c89cedd3ba 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -17,12 +17,12 @@ mcd *mcd_new(int nb_col) for(i=0; i < FEAT_TYPE_NB; i++) m->type2col[i] = -1; - m->representation = (int *)memalloc(nb_col * sizeof(int)); - m->type = (int *)memalloc(nb_col * sizeof(int)); - m->type_str = (char **)memalloc(nb_col * sizeof(char *)); - m->filename = (char **)memalloc(nb_col * sizeof(char *)); - m->dico_array = (dico **)memalloc(nb_col * sizeof(dico *)); - m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *)); + m->representation = (int *) memalloc(nb_col * sizeof(int)); + m->type = (int *) memalloc(nb_col * sizeof(int)); + m->type_str = (char **) memalloc(nb_col * sizeof(char *)); + m->filename = (char **) memalloc(nb_col * sizeof(char *)); + m->dico_array = (dico **) memalloc(nb_col * sizeof(dico *)); + m->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *)); for(i=0; i < nb_col; i++){ m->representation[i] = MCD_REPRESENTATION_NULL; @@ -52,6 +52,10 @@ void mcd_free(mcd *m) free(m); } +/* this function is used when reading a 
corpus file whose structure is described in mcd m */ +/* it returns the code associated with string str found in column col */ +/* the code depends on the way the column is represented (vocabulary, embedding or integer) */ + int mcd_get_code(mcd *m, char *str, int col){ if(m->representation[col] == MCD_REPRESENTATION_VOCAB) return dico_string2int(m->dico_array[col], str); @@ -62,6 +66,8 @@ int mcd_get_code(mcd *m, char *str, int col){ return MCD_INVALID_VALUE; } +/* look for the number of columns in an mcd file */ + int mcd_max_column_index_in_file(char *mcd_filename) { int max_col = -1; @@ -183,6 +189,7 @@ mcd *mcd_read(char *mcd_filename, int verbose) return m; } +/* builds an mcd corresponding to the conll07 format */ mcd *mcd_build_conll07(void) { @@ -238,6 +245,8 @@ mcd *mcd_build_conll07(void) return m; } +/* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */ + mcd *mcd_build_ifpls(void) { mcd *m = mcd_new(6); @@ -350,6 +359,7 @@ mcd *mcd_read_old(char *mcd_filename, char *corpus_filename, dico_vec *vocabs) return m; } +/* returns a dico_vec containing the different dictionaries found in an mcd structure */ dico_vec *mcd_build_dico_vec(mcd *mcd_struct) { diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c index 61b957e4ef5a9ad76105295d54bfd9067bb17e4a..f83fae202924839b01dfd29b86f787a963cc81b9 100644 --- a/maca_lemmatizer/src/context.c +++ b/maca_lemmatizer/src/context.c @@ -33,8 +33,8 @@ context *context_new(void) ctx->mcd_struct = NULL; ctx->language = strdup("fr"); ctx->maca_data_path = NULL; - ctx->form_column = 1; - ctx->pos_column = 2; + ctx->form_column = -1; + ctx->pos_column = -1; return ctx; } @@ -64,10 +64,10 @@ void context_fplm_help_message(context *ctx){ fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n"); } void context_mcd_help_message(context *ctx){ - fprintf(stderr, "\t-m --mcd <file> : multi column description file name\n"); + fprintf(stderr, "\t-C --mcd <file> : 
multi column description file name\n"); } void context_language_help_message(context *ctx){ - fprintf(stderr, "\t-C --language : identifier of the language to use\n"); + fprintf(stderr, "\t-L --language : identifier of the language to use\n"); } void context_maca_data_path_help_message(context *ctx){ fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n"); @@ -87,7 +87,7 @@ context *context_read_options(int argc, char *argv[]) {"verbose", no_argument, 0, 'v'}, {"debug", no_argument, 0, 'd'}, {"conll", required_argument, 0, 'i'}, - {"mcd", required_argument, 0, 'm'}, + {"mcd", required_argument, 0, 'C'}, {"language", required_argument, 0, 'L'}, {"fplm", required_argument, 0, 'f'}, {"form_column", required_argument, 0, 'F'}, @@ -97,7 +97,7 @@ context *context_read_options(int argc, char *argv[]) optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "hvdi:f:m:L:M:F:D:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hvdi:f:C:L:M:F:D:P:", long_options, &option_index)) != -1){ switch (c) { case 'd': @@ -121,7 +121,7 @@ context *context_read_options(int argc, char *argv[]) case 'i': ctx->conll_filename = strdup(optarg); break; - case 'm': + case 'C': ctx->mcd_filename = strdup(optarg); break; case 'L': diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c index 702b691f17f6efece97e3b740b6426c70a6a1f8d..7e22f08b15877319984f95907e35026cb471736f 100644 --- a/maca_lemmatizer/src/maca_lemmatizer.c +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -109,12 +109,12 @@ int main(int argc, char *argv[]) else pos_column = ctx->mcd_struct->type2col[FEAT_TYPE_POS]; + if(ctx->form_column != -1) form_column = ctx->form_column; else form_column = ctx->mcd_struct->type2col[FEAT_TYPE_FORM]; - - + if(ctx->conll_filename == NULL) f = stdin; else @@ -141,8 +141,9 @@ int main(int argc, char *argv[]) if(column_nb == form_column) form = strdup(token); /* if((column_nb < ctx->mcd_struct->nb_col) && 
(column_nb == pos_column)) */ - if(column_nb == pos_column) + if(column_nb == pos_column){ pos = strdup(token); + } column_nb++; } while((token = strtok(NULL , "\t"))); diff --git a/maca_trans_parser/src/cf_file.c b/maca_trans_parser/src/cf_file.c index 06a1baee6e0db480e137964cfc1e3f017cf2ffe1..ef43d01bbc40aa5d860a76b5ec0e73c93788731f 100644 --- a/maca_trans_parser/src/cf_file.c +++ b/maca_trans_parser/src/cf_file.c @@ -22,6 +22,7 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int } } + *max_feat = *max_feat + 1; *max_class = *max_class + 1; fclose(f); diff --git a/maca_trans_parser/src/config.c b/maca_trans_parser/src/config.c index 2385b061431bcdf79e60d78b4144d8a9f4d2afd7..84fea5ab35953dfc67a67c10b91755c6c42513a4 100644 --- a/maca_trans_parser/src/config.c +++ b/maca_trans_parser/src/config.c @@ -23,17 +23,18 @@ config *config_new(FILE *f, mcd *mcd_struct) return c; } -void config_add_next_word_to_buffer(config *c) +word *config_add_next_word_to_buffer(config *c) { word *w = NULL; w = word_read(c->f, c->mcd_struct); - if(w == NULL) return; + if(w == NULL) return NULL; if(word_get_index(w) == -1){ w->feat_array[FEAT_TYPE_INDEX] = c->current_index++; - printf("current index = %d\n", c->current_index); + /* printf("current index = %d\n", c->current_index); */ } queue_add(c->bf, w); + return w; } void config_free(config *c) @@ -64,6 +65,17 @@ config *config_initial(FILE *f, mcd *mcd_struct, int lookahead) return c; } +config *config_initial_no_dummy_word(FILE *f, mcd *mcd_struct, int lookahead) +{ + int i; + config *c = config_new(f, mcd_struct); + + for(i=0; i < lookahead; i++) + config_add_next_word_to_buffer(c); + + return c; +} + config *config_copy(config *o) { int i; diff --git a/maca_trans_parser/src/config.h b/maca_trans_parser/src/config.h index f14814f52070df732af0bcdaa03dcd61b6ff57c7..b08204ce69aa42aac6b6eccb32e9cf4d786835d6 100644 --- a/maca_trans_parser/src/config.h +++ b/maca_trans_parser/src/config.h @@ -34,12 +34,13 @@ 
int config_equal(config *c1, config *c2); int config_equal2(config *c1, config *c2); config *config_new(FILE *f, mcd *mcd_struct); config *config_initial(FILE *f, mcd *mcd_struct, int lookahead); +config *config_initial_no_dummy_word(FILE *f, mcd *mcd_struct, int lookahead); config *config_copy(config *o); void config_print(FILE *buffer, config *c); int config_is_terminal(config *c); void config_free(config *c); void config_add_mvt(config *c, int mvt); -void config_add_next_word_to_buffer(config *c); +word *config_add_next_word_to_buffer(config *c); void config_connect_subtrees(config *c, int root_label); diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 125796d812e4996708e719a5ef5808dd46bf5e54..54465328a86010387629033e527a7b003f219872 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -26,9 +26,10 @@ void context_free(context *ctx) if(ctx->d_perceptron_features) dico_free(ctx->d_perceptron_features); + /* if(ctx->mcd_struct) mcd_free(ctx->mcd_struct); - + */ if(ctx->features_model) feat_model_free(ctx->features_model); @@ -88,11 +89,12 @@ void context_general_help_message(context *ctx) { fprintf(stderr, "usage: %s [options]\n", ctx->program_name); fprintf(stderr, "Options:\n"); - fprintf(stderr, "\t-h --help : print this message\n"); - fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); - fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); + fprintf(stderr, "\t-h --help : print this message\n"); + fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); + fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); fprintf(stderr, "\t-D --maca_data_path <str> : path to the maca_data directory\n"); - fprintf(stderr, "\t-L --language <str> : identifier of the language to use (default is fr)\n"); + fprintf(stderr, "\t-L --language <str> : identifier of the language to use (default is fr)\n"); + 
fprintf(stderr, "\t-S --stream : stream mode\n"); } void context_model_help_message(context *ctx){ diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 1d1d1bd80a3d31cc658943fccd146b4b038a02ee..b349449ada8182a8754a33cffd68d260db87d231 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -91,5 +91,6 @@ void context_f2p_filename_help_message(context *ctx); void context_conll_help_message(context *ctx); void context_ifpls_help_message(context *ctx); void context_input_help_message(context *ctx); +void context_root_label_help_message(context *ctx); #endif diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c index 225aab9050d8885132340d5f85172e24fe12e7cf..6e749d156e73c8714d11145413bcc3c93a7dc654 100644 --- a/maca_trans_parser/src/decode.c +++ b/maca_trans_parser/src/decode.c @@ -25,6 +25,7 @@ void decode_help_message(context *ctx) context_model_help_message(ctx); context_vocabs_help_message(ctx); context_features_model_help_message(ctx); + context_root_label_help_message(ctx); } void decode_check_options(context *ctx){ diff --git a/maca_trans_parser/src/depset.c b/maca_trans_parser/src/depset.c index a71bba8d19aed0727fa982474eef064634ff004a..d949351638f937473b430c9fca0eb343ba5d979c 100644 --- a/maca_trans_parser/src/depset.c +++ b/maca_trans_parser/src/depset.c @@ -81,7 +81,20 @@ void depset_print2(FILE *f, depset *d, dico *dico_labels) fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); } } - fprintf(f, "\n"); + /* fprintf(f, "\n"); */ +} + +void depset_print3(FILE *f, depset *d, dico *dico_labels) +{ + int i; + + for(i=1; i < d->length; i++){ + if((d->array[i].gov) && (d->array[i].dep)){ + /* fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label ));*/ + fprintf(f, "%d\t%s\t%d\t%s\n", 
word_get_index(d->array[i].dep), d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); + } + } + /* fprintf(f, "\n"); */ } char *skip_index(char *buffer) @@ -100,7 +113,8 @@ void depset_print_new_index(FILE *f, depset *d, dico *dico_labels) for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - fprintf(f, "%d", word_get_index(d->array[i].dep)); + /* fprintf(f, "%d\t", word_get_index(d->array[i].dep)); */ + fprintf(f, "%d\t", word_get_index(d->array[i].dep)); fprintf(f, "%s\t%d\t%s\n", skip_index(d->array[i].dep->input), word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); } } diff --git a/maca_trans_parser/src/depset.h b/maca_trans_parser/src/depset.h index f7ba0c56f95df65cad9028251f9f85165a716e47..f8a5ccb78285bb43090a9513d651561d560f5791 100644 --- a/maca_trans_parser/src/depset.h +++ b/maca_trans_parser/src/depset.h @@ -24,6 +24,7 @@ void depset_init(depset *d); void depset_add(depset *d, word *gov, int label, word *dep); void depset_print(FILE *f, depset *d); void depset_print2(FILE *f, depset *d, dico *dico_labels); +void depset_print3(FILE *f, depset *d, dico *dico_labels); void depset_print_new_index(FILE *f, depset *d, dico *dico_labels); diff --git a/maca_trans_parser/src/feat_model.c b/maca_trans_parser/src/feat_model.c index 267a662a5d4ab8261c682a800b2defd6781884d3..6b6ad227d61582e2e5aade9e14c5ab3a4ffd567e 100644 --- a/maca_trans_parser/src/feat_model.c +++ b/maca_trans_parser/src/feat_model.c @@ -113,9 +113,11 @@ int feat_model_get_feat_value_cff(feat_model *fm, config *c, dico *dico_features catenate_int(fm->string, feat_val); } - if(mode == LOOKUP_MODE) + if(mode == LOOKUP_MODE){ + if(fm->string) + /* printf("fmstring = %s\n", fm->string); */ return dico_string2int(dico_features, fm->string); - + } return dico_add(dico_features, fm->string); } diff --git a/maca_trans_parser/src/feature_table.c b/maca_trans_parser/src/feature_table.c index 
7450eb1ca61b444acffbb30304b8c5818daa525a..3db2da4bb1059e9e4bb32c62bf1f0e3f2cfd5bf7 100644 --- a/maca_trans_parser/src/feature_table.c +++ b/maca_trans_parser/src/feature_table.c @@ -150,7 +150,8 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max) for(feat=0; feat < fv->nb; feat++){ for(cla=0; cla < classes_nb; cla++){ - if(fv->t[feat] != -1){ + if((fv->t[feat] != -1) && (fv->t[feat] < ft->features_nb)){ + /* if(fv->t[feat] != -1){ */ /* printf("feat score = %f\n", ft->table[fv->t[feat]][cla]); */ classes_score[cla] += ft->table[fv->t[feat]][cla]; } diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c index 3a4937917cc3b8970c520be3ca8e3260222ecf2c..3ca37e8cc3b2ffc1ae25184e82d8c9653aefa15a 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff.c @@ -36,7 +36,7 @@ void maca_trans_parser_conll2cff_check_options(context *ctx) if(!ctx->input_filename || ctx->help /* || !ctx->mcd_filename */ - || !(ctx->cff_filename || ctx->fann_filename) + /* || !(ctx->cff_filename || ctx->fann_filename) */ ){ maca_trans_parser_conll2cff_help_message(ctx); exit(1); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c index b7d5fae82ad860ffeb974d8d1e5f85e117642d95..9aa8c060469b001badbbbf03bd21106f287f377a 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c @@ -20,8 +20,10 @@ void add_signature_to_words_in_queue(queue *bf, form2pos *f2p) for(i=0; i < queue_nbelem(bf); i++){ w = queue_elt_n(bf, i); - /* printf("add signature %d to word %s\n", form2pos_get_signature(f2p, w->form), w->form); */ - w->signature = form2pos_get_signature(f2p, w->form); + if(!w->signature){ + /* printf("add signature %d to word %s\n", form2pos_get_signature(f2p, w->form), w->form); */ + w->signature = 
form2pos_get_signature(f2p, w->form); + } } } @@ -61,27 +63,20 @@ void generate_training_file_stream(FILE *output_file, context *ctx) { config *c; feat_vec *fv = feat_vec_new(feature_types_nb); - sentence *ref = NULL; - int sentence_nb = 0; FILE *conll_file = myfopen(ctx->input_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); int postag; - c = config_initial(conll_file, ctx->mcd_struct, 5); - - while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ - /* sentence_print(stdout, ref, mcd_get_dico_label(ctx->mcd_struct)); */ - while(1){ - /* config_print(stdout,c); */ - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - postag = oracle_tagger(c, ref); + c = config_initial_no_dummy_word(conll_file, ctx->mcd_struct, 5); - fprintf(output_file, "%d", postag); - feat_vec_print(output_file, fv); - - if(postag != -1) - movement_tagger(c, postag, 0, 1); - } + while(!config_is_terminal(c)){ + /* config_print(stdout,c); */ + if(ctx->f2p) + add_signature_to_words_in_queue(c->bf, ctx->f2p); + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + postag = oracle_tagger(c, NULL); + fprintf(output_file, "%d", postag); + feat_vec_print(output_file, fv); + movement_tagger(c, postag, 0, 1); } } diff --git a/maca_trans_parser/src/movement_tagger.c b/maca_trans_parser/src/movement_tagger.c index ff5e3057625de5bc0f16a4fdd55a1d2ba5243847..d1f46b682675d693e0cfe3c26321ba10e90ce10f 100644 --- a/maca_trans_parser/src/movement_tagger.c +++ b/maca_trans_parser/src/movement_tagger.c @@ -7,15 +7,19 @@ int movement_tagger(config *c, int postag, float score, int stream) { word *b0 = NULL; + int k = 5; + if(queue_is_empty(c->bf)) return 0; - b0 = queue_elt_n(c->bf, 0); + b0 = queue_remove(c->bf); word_set_pos(b0, postag); - stack_push(c->st, queue_remove(c->bf)); + stack_push(c->st, b0); - /* in stream mode, read a new word and add it to the buffer */ - 
if(stream) - config_add_next_word_to_buffer(c); + /* in stream mode, keep only (k = 5) elts in the stack, then read a new word and add it to the buffer */ + if(stream){ + stack_trim_to_size(c->st, k); + config_add_next_word_to_buffer(c); + } return 1; } diff --git a/maca_trans_parser/src/oracle_tagger.c b/maca_trans_parser/src/oracle_tagger.c index c08cc234a0463f215ac485be9284eaddf8db7d1e..10d53d8ffa4def5f6c27b9b21d6c4f45df7b3a92 100644 --- a/maca_trans_parser/src/oracle_tagger.c +++ b/maca_trans_parser/src/oracle_tagger.c @@ -3,12 +3,15 @@ int oracle_tagger(config *c, sentence *ref) { word *b0; /* next word in the bufer */ - int b0_index; - + /* int b0_index; */ + int b0_pos; if(!queue_is_empty(c->bf)){ b0 = queue_elt_n(c->bf, 0); - b0_index = word_get_index(b0); - return word_get_pos(ref->words[b0_index]); + b0_pos = word_get_pos(b0); + /* printf("b0_pos = %d\n", b0_pos); */ + /* b0_index = word_get_index(b0); */ + /* return word_get_pos(ref->words[b0_index]); */ + return b0_pos; } return -1; } diff --git a/maca_trans_parser/src/perceptron.c b/maca_trans_parser/src/perceptron.c index 8b1560957681f733adc6729737d9d6b95487d5d2..3425a3c396adf67c8d18a88f8066d911146edd2b 100644 --- a/maca_trans_parser/src/perceptron.c +++ b/maca_trans_parser/src/perceptron.c @@ -1,9 +1,8 @@ #include<stdio.h> #include<stdlib.h> #include<string.h> -#include"feat_fct.h" #include"feature_table.h" -#include"config2feat_vec.h" +#include"util.h" void perceptron_avg(char *filename, feature_table *ft, int n_iter) { @@ -18,7 +17,8 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter) int epoch; int i,j; float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float)); - feat_vec *fv = feat_vec_new(feature_types_nb); + /* feat_vec *fv = feat_vec_new(feature_types_nb); */ + feat_vec *fv = feat_vec_new(1); char *token; feature_table *ft_sum = feature_table_new(ft->features_nb, ft->classes_nb); int counter = 1; @@ -100,7 +100,8 @@ void perceptron(char *filename, feature_table *ft, 
int n_iter) int epoch; int i; float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float)); - feat_vec *fv = feat_vec_new(feature_types_nb); + /* feat_vec *fv = feat_vec_new(feature_types_nb); */ + feat_vec *fv = feat_vec_new(1); char *token; for(epoch = 0; epoch < n_iter; epoch++){ diff --git a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c index 4e142d22f97005627e74d26d5cafb50fe7b81624..53985be98cd6c7f7b8b825ffc4d31c234ecd78d1 100644 --- a/maca_trans_parser/src/queue.c +++ b/maca_trans_parser/src/queue.c @@ -28,7 +28,8 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct) } queue_add(bf, w); } - return bf->nbelem - 1; /* because of the dummy word */ + /* return bf->nbelem - 1; */ /* because of the dummy word */ + return bf->nbelem ; } diff --git a/maca_trans_parser/src/sentence.c b/maca_trans_parser/src/sentence.c index 63cdb8b8a76caebd127b24530d2321ecf46671ab..750aaac7c71468e6105f4a33d6ef332ac5b719fe 100644 --- a/maca_trans_parser/src/sentence.c +++ b/maca_trans_parser/src/sentence.c @@ -19,7 +19,7 @@ sentence *sentence_new(mcd *m, FILE *f) sentence *sentence_init(mcd *m, FILE *f) { sentence *s = sentence_new(m, f); - sentence_add_word(s, word_create_dummy(m)); + sentence_add_word(s, word_create_dummy(m)); return s; } @@ -81,3 +81,23 @@ sentence *sentence_read(FILE *f, mcd *mcd_struct) } return s; } +/* reads the next sentence from f (one word per line, up to a line starting with a newline or a space) without inserting a dummy word; returns NULL if no word was read */ +sentence *sentence_read_no_dummy_word(FILE *f, mcd *mcd_struct) +{ + sentence *s = sentence_new(mcd_struct, f); + char buffer[1000]; + word *w = NULL; + + while(fgets(buffer, 1000, f)){ + if(feof(f)) break; /* NOTE(review): this also drops a final line that has no trailing newline */ + if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */ + w = word_parse_buffer(buffer, mcd_struct); + sentence_add_word(s, w); + } + + if(s->length == 0){ /* no dummy word is added here, so an empty read leaves length 0, not 1 (assumes sentence_new() starts at length 0 -- TODO confirm) */ + sentence_free(s); + return NULL; + } + return s; +} diff --git a/maca_trans_parser/src/sentence.h b/maca_trans_parser/src/sentence.h index 6a80509dab72ff1627001a81158936d2cc180841..cdd1019d3dc30b57f51b795d5d17d6e1072a597e 100644 --- 
a/maca_trans_parser/src/sentence.h +++ b/maca_trans_parser/src/sentence.h @@ -18,6 +18,7 @@ sentence *sentence_init(mcd *m, FILE *f); void sentence_print(FILE *f, sentence *s, dico *dico_labels); sentence *sentence_read(FILE *f, mcd *mcd_struct); +sentence *sentence_read_no_dummy_word(FILE *f, mcd *mcd_struct); void sentence_add_word(sentence *s, word *w); void sentence_free(sentence *s); diff --git a/maca_trans_parser/src/simple_decoder.c b/maca_trans_parser/src/simple_decoder.c index 8d43e9901c964bb06898692dc02948b15d314381..b43dcbeacfc2a6e587b8800961758191e98f5cfe 100644 --- a/maca_trans_parser/src/simple_decoder.c +++ b/maca_trans_parser/src/simple_decoder.c @@ -21,7 +21,7 @@ void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_la config *c = config_initial(f, ctx->mcd_struct, 0); /* read a sentence and put it in the buffer */ - while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){ + while(queue_read_sentence(c->bf, f, ctx->mcd_struct) > 1){ while(!config_is_terminal(c)){ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); mvt_code = feature_table_argmax(fv, ft, &max); @@ -43,6 +43,8 @@ void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_la config_connect_subtrees(c, root_label); depset_print2(stdout, c->ds, ctx->dico_labels); + fprintf(stdout, "\n"); + /* config_free(c); */ c = config_initial(f, ctx->mcd_struct, 0); @@ -59,13 +61,14 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la int mvt_label; float max; feat_vec *fv = feat_vec_new(feature_types_nb); - config *c = config_initial(f, ctx->mcd_struct, 5); + config *c = NULL; /* when in stream mode, force to renumber the tokens (ugly !) 
*/ - ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; + /* ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; */ + c = config_initial(f, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ - config_print(stdout, c); + /* config_print(stdout, c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); /* feat_vec_print_string(fv, ctx->d_perceptron_features); */ mvt_code = feature_table_argmax(fv, ft, &max); @@ -73,9 +76,10 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la mvt_label = movement_label(mvt_code); /* printf("code predicted = %d\n", mvt_code); */ - - if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */ + /* sentence is complete */ + if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ + /* if(mvt_label == root_label){ */ /* config_print(stdout, c); */ /* create the root arc */ @@ -87,9 +91,13 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la /* config_print(stdout, c); */ config_connect_subtrees(c, root_label); - /* depset_print_new_index(stdout, c->ds, ctx->dico_labels);*/ - depset_print2(stdout, c->ds, ctx->dico_labels); + /* depset_print_new_index(stdout, c->ds, ctx->dico_labels); */ + if((ctx->mcd_struct->type2col[FEAT_TYPE_INDEX] == -1) || (ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] == -1)) /* type2col[] is -1 when the mcd has no INDEX column: test it first so that type[] is never indexed with -1; depset_print3 prints the index itself */ + depset_print3(stdout, c->ds, ctx->dico_labels); + else + depset_print2(stdout, c->ds, ctx->dico_labels); + /* pop the dummy word */ stack_pop(c->st); /* remplace it with a fresh one */ @@ -98,7 +106,7 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la /* empty depset */ depset_free(c->ds); c->ds = depset_new(); - c->current_index = queue_renumber_words(c->bf); + /* c->current_index = queue_renumber_words(c->bf); */ continue; } diff --git a/maca_trans_parser/src/simple_decoder_tagger.c 
b/maca_trans_parser/src/simple_decoder_tagger.c index 9f58cddb9f33b7d9db30760487ad2fee218b3732..882a2aa5bc4c79a5194a07090f1e6a8865456c5b 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -31,11 +31,12 @@ void simple_decoder_buffer(context *ctx) int i; word *w = NULL; FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; - config *c = config_initial(f, ctx->mcd_struct, 0); + config *c = config_initial_no_dummy_word(f, ctx->mcd_struct, 0); /* read a sentence and put it in the buffer */ while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){ - queue_remove(c->bf); /* get rid of dummy token */ + /* queue_remove(c->bf); */ + /* get rid of dummy token */ if(ctx->f2p) add_signature_to_words_in_queue(c->bf, ctx->f2p); while(!config_is_terminal(c)){ @@ -54,7 +55,7 @@ void simple_decoder_buffer(context *ctx) printf("\n"); /* config_free(c); */ - c = config_initial(f, ctx->mcd_struct, 0); + c = config_initial_no_dummy_word(f, ctx->mcd_struct, 0); } if(ctx->input_filename) fclose(f); @@ -65,16 +66,33 @@ void simple_decoder_stream(context *ctx) { config *c; feat_vec *fv = feat_vec_new(feature_types_nb); - FILE *f = NULL; + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int postag; + float max; + word *w; + dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); - /* when in stream mode, force to renumber the tokens (ugly !) 
*/ - ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; - - c = config_initial(f, ctx->mcd_struct, 5); + c = config_initial_no_dummy_word(f, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ - config_print(stdout, c); + if(ctx->f2p) + add_signature_to_words_in_queue(c->bf, ctx->f2p); + /* config_print(stdout, c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + + /* feat_vec_print(stdout, fv); */ + postag = feature_table_argmax(fv, ft, &max); + /* printf("postag = %d\n", postag); */ + + w = queue_elt_n(c->bf, 0); + printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag)); + + if(postag != -1) + movement_tagger(c, postag, max, 1); + + } + /* config_print(stdout, c); */ /* config_free(c); */ } diff --git a/maca_trans_parser/src/stack.c b/maca_trans_parser/src/stack.c index 2672732f260a5178cd6084ca09ddd3246803f912..cbd419e9c496e6e6cc6c89b3616d0f21ea6a161d 100644 --- a/maca_trans_parser/src/stack.c +++ b/maca_trans_parser/src/stack.c @@ -31,6 +31,16 @@ stack *stack_new(void) s->top = 0; return s; } +/* +stack *stack_static_new(int size) +{ + stack *s = (stack *)memalloc(sizeof(stack)); + s->size = size; + s->array = (word **)memalloc(size * sizeof(word*))); + s->top = 0; + return s; +} +*/ stack *stack_copy(stack *s) { @@ -87,3 +97,18 @@ void stack_print(FILE *buffer, stack *s) fprintf(buffer, "]"); } } + +/* keep only the k upper elements of the stack: they are moved down to positions 0 .. k-1 and the elements below them are dropped (not freed); nothing is done if the stack holds k elements or fewer */ + +void stack_trim_to_size(stack *s, int k) +{ + int i, delta; + + if(stack_nbelem(s) > k){ + delta = stack_nbelem(s) - k; + for(i = 0; i < k; i++){ /* copy exactly the k surviving elements: looping up to s->top would read delta slots past the last element */ + s->array[i] = s->array[i + delta]; + } + s->top = k; + } +} diff --git a/maca_trans_parser/src/stack.h b/maca_trans_parser/src/stack.h index 2ccd6e20f5b875123411432ef2b4102e38edcfdb..de01374ad877b47e896a9917bde87ab00b71febd 100644 --- a/maca_trans_parser/src/stack.h +++ b/maca_trans_parser/src/stack.h @@ -23,4 +23,5 @@ void stack_print(FILE *buffer, stack *s); void stack_free(stack *s); int 
stack_is_empty(stack *s); /* int stack_height(stack *s); */ +void stack_trim_to_size(stack *s, int k); #endif diff --git a/maca_trans_parser/src/word.c b/maca_trans_parser/src/word.c index 277b15edf571c96c52303e3c09a86741eea5ae28..9d46d258e0c96d1501f5d0a3cbc1623f64d3c9ed 100644 --- a/maca_trans_parser/src/word.c +++ b/maca_trans_parser/src/word.c @@ -38,6 +38,9 @@ word *word_read(FILE *f, mcd *mcd_struct) return NULL; } +/* parse string buffer to extract the different word features */ +/* codes of the word features are stored in feat_array */ + word *word_parse_buffer(char *buffer, mcd *mcd_struct) { char *token; diff --git a/maca_trans_parser/src/word.h b/maca_trans_parser/src/word.h index 94141984f25f9db625ff30e16d4efb6d71a50add..2ecd333d6ca52762184aceb58d7079d0c67e4b9f 100644 --- a/maca_trans_parser/src/word.h +++ b/maca_trans_parser/src/word.h @@ -73,15 +73,16 @@ #define word_set_V(w, val) (w)->feat_array[FEAT_TYPE_V] = (val) #define word_set_W(w, val) (w)->feat_array[FEAT_TYPE_W] = (val) #define word_set_X(w, val) (w)->feat_array[FEAT_TYPE_X] = (val) + #define word_set_Y(w, val) (w)->feat_array[FEAT_TYPE_Y] = (val) #define word_set_Z(w, val) (w)->feat_array[FEAT_TYPE_Z] = (val) #define word_set_signature(w, val) (w)->signature = (val) typedef struct _word { - int feat_array[FEAT_TYPE_NB]; - char *input; - int U1; /* does the form begin with an uppercase character */ - int signature; /* pos tags that this form can have (represented as a boolean string) */ + int feat_array[FEAT_TYPE_NB]; /* array containing the codes corresponding to the different word features */ + char *input; /* the string corresponding to the actual line in the corpus file */ + int U1; /* does the form begin with an uppercase character */ + int signature; /* pos tags that this form can have (represented as a boolean string) */ int label; char *form; } word;