From e20a7679e8bd1cfcf05a2612e45aa413f4123137 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Mon, 26 Sep 2016 09:27:56 -0400
Subject: [PATCH] fixing stream mode of trans_tagger and trans_parser

---
 maca_common/include/mcd.h                     | 29 ++++++++++-----
 maca_common/src/mcd.c                         | 22 ++++++++----
 maca_lemmatizer/src/context.c                 | 14 ++++----
 maca_lemmatizer/src/maca_lemmatizer.c         |  7 ++--
 maca_trans_parser/src/cf_file.c               |  1 +
 maca_trans_parser/src/config.c                | 18 ++++++++--
 maca_trans_parser/src/config.h                |  3 +-
 maca_trans_parser/src/context.c               | 12 ++++---
 maca_trans_parser/src/context.h               |  1 +
 maca_trans_parser/src/decode.c                |  1 +
 maca_trans_parser/src/depset.c                | 18 ++++++++--
 maca_trans_parser/src/depset.h                |  1 +
 maca_trans_parser/src/feat_model.c            |  6 ++--
 maca_trans_parser/src/feature_table.c         |  3 +-
 .../src/maca_trans_parser_conll2cff.c         |  2 +-
 .../src/maca_trans_parser_conll2cff_tagger.c  | 33 ++++++++---------
 maca_trans_parser/src/movement_tagger.c       | 14 +++++---
 maca_trans_parser/src/oracle_tagger.c         | 11 +++---
 maca_trans_parser/src/perceptron.c            |  9 ++---
 maca_trans_parser/src/queue.c                 |  3 +-
 maca_trans_parser/src/sentence.c              | 22 +++++++++++-
 maca_trans_parser/src/sentence.h              |  1 +
 maca_trans_parser/src/simple_decoder.c        | 26 +++++++++-----
 maca_trans_parser/src/simple_decoder_tagger.c | 36 ++++++++++++++-----
 maca_trans_parser/src/stack.c                 | 25 +++++++++++++
 maca_trans_parser/src/stack.h                 |  1 +
 maca_trans_parser/src/word.c                  |  3 ++
 maca_trans_parser/src/word.h                  |  9 ++---
 28 files changed, 236 insertions(+), 95 deletions(-)

diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h
index f86fcc4..1186de1 100644
--- a/maca_common/include/mcd.h
+++ b/maca_common/include/mcd.h
@@ -20,16 +20,29 @@
 #define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM]
 #define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v)
 
+/* mcd (multi column description) files describe the format of corpus files */
+/* every line of an mcd file describes the content of a column of the corpus file */
+/* every line contains four fields separated by a space character */
+/* first field is the index of the column described (first column corresponds to index zero) */
+/* second field is the name of the column. It must be taken from the following list: */
+/* INDEX, FORM, LEMMA, CPOS, POS, FEAT, LABEL, STAG, INT, GOV, A ... Z */
+/* third field corresponds to the internal representation of the tokens found in the column described. Four values are possible: */
+/* VOCAB if the internal representation is an integer code corresponding to the token */
+/* INT if the token is already an integer and its corresponding internal value is the same integer */
+/* EMB if the internal representation of the token is a real valued vector. */
+/* _   if no internal representation is associated to the field */
+/* fourth field is the name of a file in which the encoding is represented, this file can either be a dico (see dico.h) format file or an embedding file (see word_emb.h)*/
+
 typedef struct {
-  int nb_col;
-  int type2col[FEAT_TYPE_NB];
+  int nb_col;                 /* number of columns in the mcd file */
+  int type2col[FEAT_TYPE_NB]; /* column in which each feature type (FEAT_TYPE_FORM, FEAT_TYPE_POS, ...) is found; initialised to -1 by mcd_new */
   /* int *col2type; */
-  int *type;
-  char **type_str;
-  int *representation;
-  char **filename;
-  dico **dico_array;
-  word_emb **word_emb_array;
+  int *type;                  /* array containing the type of every column */
+  char **type_str;            /* a string version of array type */
+  int *representation;        /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */
+  char **filename;            /* array containing, for every column, the name of the file in which its values are represented */
+  dico **dico_array;          /* array containing the dico corresponding to each column (NULL if no file) */
+  word_emb **word_emb_array;  /* array containing the word embedding structure corresponding to each column (NULL if no file) */
 } mcd;
 
 mcd *mcd_build_conll07(void);
diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c
index 39f22b6..a6385d0 100644
--- a/maca_common/src/mcd.c
+++ b/maca_common/src/mcd.c
@@ -17,12 +17,12 @@ mcd *mcd_new(int nb_col)
   for(i=0; i < FEAT_TYPE_NB; i++)
     m->type2col[i] = -1;
 
-  m->representation = (int *)memalloc(nb_col * sizeof(int));
-  m->type =           (int *)memalloc(nb_col * sizeof(int));
-  m->type_str =           (char **)memalloc(nb_col * sizeof(char *));
-  m->filename =       (char **)memalloc(nb_col * sizeof(char *));
-  m->dico_array =     (dico **)memalloc(nb_col * sizeof(dico *));
-  m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *));
+  m->representation = (int *)       memalloc(nb_col * sizeof(int));
+  m->type =           (int *)       memalloc(nb_col * sizeof(int));
+  m->type_str =       (char **)     memalloc(nb_col * sizeof(char *));
+  m->filename =       (char **)     memalloc(nb_col * sizeof(char *));
+  m->dico_array =     (dico **)     memalloc(nb_col * sizeof(dico *));
+  m->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *));
   
   for(i=0; i < nb_col; i++){
     m->representation[i] = MCD_REPRESENTATION_NULL;
@@ -52,6 +52,10 @@ void mcd_free(mcd *m)
   free(m);
 }
 
+/* this function is used when reading a corpus file whose structure is described in mcd m */
+/* it returns the code associated to string str found in column col */
+/* the code depends on the way the column is represented (vocabulary, embedding or integer) */ 
+
 int mcd_get_code(mcd *m, char *str, int col){
   if(m->representation[col] == MCD_REPRESENTATION_VOCAB)
     return dico_string2int(m->dico_array[col], str);
@@ -62,6 +66,8 @@ int mcd_get_code(mcd *m, char *str, int col){
   return MCD_INVALID_VALUE;
 }
 
+/* look for the number of columns in an mcd file */
+
 int mcd_max_column_index_in_file(char *mcd_filename)
 {
   int max_col = -1;
@@ -183,6 +189,7 @@ mcd *mcd_read(char *mcd_filename, int verbose)
   return m;
 }
 
+/* builds an mcd corresponding to the conll07 format */
 
 mcd *mcd_build_conll07(void)
 {
@@ -238,6 +245,8 @@ mcd *mcd_build_conll07(void)
   return m;
 }
 
+/* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */
+
 mcd *mcd_build_ifpls(void)
 {
   mcd *m = mcd_new(6);
@@ -350,6 +359,7 @@ mcd *mcd_read_old(char *mcd_filename, char *corpus_filename, dico_vec *vocabs)
   return m;
 }
 
+/* returns a dico_vec containing the different dictionaries found in an mcd structure */
 
 dico_vec *mcd_build_dico_vec(mcd *mcd_struct)
 {
diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c
index 61b957e..f83fae2 100644
--- a/maca_lemmatizer/src/context.c
+++ b/maca_lemmatizer/src/context.c
@@ -33,8 +33,8 @@ context *context_new(void)
   ctx->mcd_struct = NULL;
   ctx->language = strdup("fr");
   ctx->maca_data_path = NULL;
-  ctx->form_column = 1;
-  ctx->pos_column = 2;
+  ctx->form_column = -1;
+  ctx->pos_column = -1;
   return ctx;
 }
 
@@ -64,10 +64,10 @@ void context_fplm_help_message(context *ctx){
   fprintf(stderr, "\t-f --fplm   <file>  : fplm (form pos lemma morpho) file\n");
 }
 void context_mcd_help_message(context *ctx){
-  fprintf(stderr, "\t-m --mcd   <file> : multi column description file name\n");
+  fprintf(stderr, "\t-C --mcd   <file> : multi column description file name\n");
 }
 void context_language_help_message(context *ctx){
-  fprintf(stderr, "\t-C --language  : identifier of the language to use\n");
+  fprintf(stderr, "\t-L --language  : identifier of the language to use\n");
 }
 void context_maca_data_path_help_message(context *ctx){
   fprintf(stderr, "\t-M --maca_data_path  : path to maca_data directory\n");
@@ -87,7 +87,7 @@ context *context_read_options(int argc, char *argv[])
       {"verbose",             no_argument,       0, 'v'},
       {"debug",               no_argument,       0, 'd'},
       {"conll",               required_argument, 0, 'i'},
-      {"mcd",                 required_argument, 0, 'm'}, 
+      {"mcd",                 required_argument, 0, 'C'}, 
       {"language",            required_argument, 0, 'L'},
       {"fplm",                required_argument, 0, 'f'},
       {"form_column",         required_argument, 0, 'F'},
@@ -97,7 +97,7 @@ context *context_read_options(int argc, char *argv[])
   optind = 0;
   opterr = 0;
   
-  while ((c = getopt_long (argc, argv, "hvdi:f:m:L:M:F:D:", long_options, &option_index)) != -1){ 
+  while ((c = getopt_long (argc, argv, "hvdi:f:C:L:M:F:D:P:", long_options, &option_index)) != -1){ 
     switch (c)
       {
       case 'd':
@@ -121,7 +121,7 @@ context *context_read_options(int argc, char *argv[])
       case 'i':
 	ctx->conll_filename = strdup(optarg);
 	break;
-      case 'm':
+      case 'C':
 	ctx->mcd_filename = strdup(optarg);
 	break;
       case 'L':
diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c
index 702b691..7e22f08 100644
--- a/maca_lemmatizer/src/maca_lemmatizer.c
+++ b/maca_lemmatizer/src/maca_lemmatizer.c
@@ -109,12 +109,12 @@ int main(int argc, char *argv[])
   else
     pos_column = ctx->mcd_struct->type2col[FEAT_TYPE_POS];
 
+
   if(ctx->form_column != -1)
     form_column = ctx->form_column;
   else
     form_column = ctx->mcd_struct->type2col[FEAT_TYPE_FORM];
-
-
+  
   if(ctx->conll_filename == NULL)
     f = stdin;
   else
@@ -141,8 +141,9 @@ int main(int argc, char *argv[])
       if(column_nb == form_column)
 	form = strdup(token);
       /* if((column_nb < ctx->mcd_struct->nb_col) && (column_nb == pos_column)) */
-      if(column_nb == pos_column)
+      if(column_nb == pos_column){
 	pos = strdup(token);
+      }
       column_nb++;
     } while((token = strtok(NULL , "\t")));
     
diff --git a/maca_trans_parser/src/cf_file.c b/maca_trans_parser/src/cf_file.c
index 06a1bae..ef43d01 100644
--- a/maca_trans_parser/src/cf_file.c
+++ b/maca_trans_parser/src/cf_file.c
@@ -22,6 +22,7 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int
 
     }
   }
+
   *max_feat = *max_feat + 1;
   *max_class = *max_class + 1;
   fclose(f);
diff --git a/maca_trans_parser/src/config.c b/maca_trans_parser/src/config.c
index 2385b06..84fea5a 100644
--- a/maca_trans_parser/src/config.c
+++ b/maca_trans_parser/src/config.c
@@ -23,17 +23,18 @@ config *config_new(FILE *f, mcd *mcd_struct)
   return c;
 }
 
-void config_add_next_word_to_buffer(config *c)
+word *config_add_next_word_to_buffer(config *c)
 {
   word *w = NULL;
 
   w = word_read(c->f, c->mcd_struct);
-  if(w == NULL) return;
+  if(w == NULL) return NULL;
   if(word_get_index(w) == -1){
     w->feat_array[FEAT_TYPE_INDEX] = c->current_index++; 
-    printf("current index = %d\n", c->current_index);
+    /* printf("current index = %d\n", c->current_index); */
   }
   queue_add(c->bf, w);
+  return w;
 }
 
 void config_free(config *c)
@@ -64,6 +65,17 @@ config *config_initial(FILE *f, mcd *mcd_struct, int lookahead)
   return c;
 }
 
+config *config_initial_no_dummy_word(FILE *f, mcd *mcd_struct, int lookahead)
+{
+  /* new configuration on f; tries to read lookahead words into its buffer (this function itself adds no dummy word) */
+  config *cfg = config_new(f, mcd_struct);
+  int remaining = lookahead;
+
+  while(remaining-- > 0)
+    config_add_next_word_to_buffer(cfg);
+  return cfg;
+}
+
 config *config_copy(config *o) 
 {
   int i;
diff --git a/maca_trans_parser/src/config.h b/maca_trans_parser/src/config.h
index f14814f..b08204c 100644
--- a/maca_trans_parser/src/config.h
+++ b/maca_trans_parser/src/config.h
@@ -34,12 +34,13 @@ int config_equal(config *c1, config *c2);
 int config_equal2(config *c1, config *c2);
 config *config_new(FILE *f, mcd *mcd_struct);
 config *config_initial(FILE *f, mcd *mcd_struct, int lookahead);
+config *config_initial_no_dummy_word(FILE *f, mcd *mcd_struct, int lookahead);
 config *config_copy(config *o);
 void config_print(FILE *buffer, config *c);
 int config_is_terminal(config *c);
 void config_free(config *c);
 void config_add_mvt(config *c, int mvt);
-void config_add_next_word_to_buffer(config *c);
+word *config_add_next_word_to_buffer(config *c);
 void config_connect_subtrees(config *c, int root_label);
 
 
diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c
index 125796d..5446532 100644
--- a/maca_trans_parser/src/context.c
+++ b/maca_trans_parser/src/context.c
@@ -26,9 +26,10 @@ void context_free(context *ctx)
   if(ctx->d_perceptron_features)
     dico_free(ctx->d_perceptron_features);
 
+  /*
   if(ctx->mcd_struct)
     mcd_free(ctx->mcd_struct);
-
+  */
   if(ctx->features_model)
     feat_model_free(ctx->features_model);
 
@@ -88,11 +89,12 @@ void context_general_help_message(context *ctx)
 {
     fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
     fprintf(stderr, "Options:\n");
-    fprintf(stderr, "\t-h --help              : print this message\n");
-    fprintf(stderr, "\t-v --verbose           : activate verbose mode\n");
-    fprintf(stderr, "\t-r --hratio    <float> : set the occupation ratio of hash tables (default is 0.5)\n");
+    fprintf(stderr, "\t-h --help                 : print this message\n");
+    fprintf(stderr, "\t-v --verbose              : activate verbose mode\n");
+    fprintf(stderr, "\t-r --hratio    <float>    : set the occupation ratio of hash tables (default is 0.5)\n");
     fprintf(stderr, "\t-D --maca_data_path <str> : path to the maca_data directory\n");
-    fprintf(stderr, "\t-L --language    <str> : identifier of the language to use (default is fr)\n");
+    fprintf(stderr, "\t-L --language    <str>    : identifier of the language to use (default is fr)\n");
+    fprintf(stderr, "\t-S --stream               : stream mode\n");
 }
 
 void context_model_help_message(context *ctx){
diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h
index 1d1d1bd..b349449 100644
--- a/maca_trans_parser/src/context.h
+++ b/maca_trans_parser/src/context.h
@@ -91,5 +91,6 @@ void context_f2p_filename_help_message(context *ctx);
 void context_conll_help_message(context *ctx);
 void context_ifpls_help_message(context *ctx);
 void context_input_help_message(context *ctx);
+void context_root_label_help_message(context *ctx);
 
 #endif
diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c
index 225aab9..6e749d1 100644
--- a/maca_trans_parser/src/decode.c
+++ b/maca_trans_parser/src/decode.c
@@ -25,6 +25,7 @@ void decode_help_message(context *ctx)
   context_model_help_message(ctx);
   context_vocabs_help_message(ctx);
   context_features_model_help_message(ctx);
+  context_root_label_help_message(ctx);
 }
 
 void decode_check_options(context *ctx){
diff --git a/maca_trans_parser/src/depset.c b/maca_trans_parser/src/depset.c
index a71bba8..d949351 100644
--- a/maca_trans_parser/src/depset.c
+++ b/maca_trans_parser/src/depset.c
@@ -81,7 +81,20 @@ void depset_print2(FILE *f, depset *d, dico *dico_labels)
       fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label));
     }
   }  
-  fprintf(f, "\n");
+  /* fprintf(f, "\n"); */
+}
+
+void depset_print3(FILE *f, depset *d, dico *dico_labels)
+{
+  int k;
+  word *gov, *dep;
+  /* one tab separated line per complete arc (entry 0 is skipped): dependent index, dependent input, governor index, label */
+  for(k = 1; k < d->length; k++){
+    gov = d->array[k].gov;
+    dep = d->array[k].dep;
+    if(!gov || !dep) continue;
+    fprintf(f, "%d\t%s\t%d\t%s\n", word_get_index(dep), dep->input, word_get_index(gov), dico_int2string(dico_labels, d->array[k].label));
+  }
+}
 
 char *skip_index(char *buffer)
@@ -100,7 +113,8 @@ void depset_print_new_index(FILE *f, depset *d, dico *dico_labels)
 
   for(i=1; i < d->length; i++){
     if((d->array[i].gov) && (d->array[i].dep)){
-       fprintf(f, "%d", word_get_index(d->array[i].dep));
+       /* fprintf(f, "%d\t", word_get_index(d->array[i].dep)); */
+       fprintf(f, "%d\t", word_get_index(d->array[i].dep));
        fprintf(f, "%s\t%d\t%s\n", skip_index(d->array[i].dep->input), word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label));
     }
   }  
diff --git a/maca_trans_parser/src/depset.h b/maca_trans_parser/src/depset.h
index f7ba0c5..f8a5ccb 100644
--- a/maca_trans_parser/src/depset.h
+++ b/maca_trans_parser/src/depset.h
@@ -24,6 +24,7 @@ void depset_init(depset *d);
 void depset_add(depset *d, word *gov, int label, word *dep);
 void depset_print(FILE *f, depset *d);
 void depset_print2(FILE *f, depset *d, dico *dico_labels);
+void depset_print3(FILE *f, depset *d, dico *dico_labels);
 void depset_print_new_index(FILE *f, depset *d, dico *dico_labels);
 
 
diff --git a/maca_trans_parser/src/feat_model.c b/maca_trans_parser/src/feat_model.c
index 267a662..6b6ad22 100644
--- a/maca_trans_parser/src/feat_model.c
+++ b/maca_trans_parser/src/feat_model.c
@@ -113,9 +113,11 @@ int feat_model_get_feat_value_cff(feat_model *fm, config *c, dico *dico_features
     catenate_int(fm->string, feat_val);
   }
  
-  if(mode == LOOKUP_MODE) 
+  if(mode == LOOKUP_MODE){
+    if(fm->string)
+    /* printf("fmstring = %s\n", fm->string); */
     return dico_string2int(dico_features, fm->string);
-    
+  } 
   return dico_add(dico_features, fm->string);
 }
 
diff --git a/maca_trans_parser/src/feature_table.c b/maca_trans_parser/src/feature_table.c
index 7450eb1..3db2da4 100644
--- a/maca_trans_parser/src/feature_table.c
+++ b/maca_trans_parser/src/feature_table.c
@@ -150,7 +150,8 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max)
   
   for(feat=0; feat < fv->nb; feat++){
     for(cla=0; cla < classes_nb; cla++){
-      if(fv->t[feat] != -1){
+      if((fv->t[feat] != -1) && (fv->t[feat] < ft->features_nb)){
+      /* if(fv->t[feat] != -1){ */
 	/* printf("feat score = %f\n", ft->table[fv->t[feat]][cla]); */
 	classes_score[cla] += ft->table[fv->t[feat]][cla];
       }
diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c
index 3a49379..3ca37e8 100644
--- a/maca_trans_parser/src/maca_trans_parser_conll2cff.c
+++ b/maca_trans_parser/src/maca_trans_parser_conll2cff.c
@@ -36,7 +36,7 @@ void maca_trans_parser_conll2cff_check_options(context *ctx)
   if(!ctx->input_filename
      || ctx->help
      /* || !ctx->mcd_filename */
-     || !(ctx->cff_filename || ctx->fann_filename)
+     /* || !(ctx->cff_filename || ctx->fann_filename) */
      ){
     maca_trans_parser_conll2cff_help_message(ctx);
     exit(1);
diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
index b7d5fae..9aa8c06 100644
--- a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
+++ b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
@@ -20,8 +20,10 @@ void add_signature_to_words_in_queue(queue *bf, form2pos *f2p)
 
   for(i=0; i < queue_nbelem(bf); i++){
     w = queue_elt_n(bf, i);
-    /* printf("add signature %d to word %s\n", form2pos_get_signature(f2p, w->form), w->form); */
-    w->signature = form2pos_get_signature(f2p, w->form);
+    if(!w->signature){
+      /* printf("add signature %d to word %s\n", form2pos_get_signature(f2p, w->form), w->form); */
+      w->signature = form2pos_get_signature(f2p, w->form);
+    }
   }
 }
 
@@ -61,27 +63,20 @@ void generate_training_file_stream(FILE *output_file, context *ctx)
 {  
   config *c;
   feat_vec *fv = feat_vec_new(feature_types_nb);
-  sentence *ref = NULL;
-  int sentence_nb = 0;
   FILE *conll_file = myfopen(ctx->input_filename, "r");
-  FILE *conll_file_ref = myfopen(ctx->input_filename, "r");
   int postag;
 
-  c = config_initial(conll_file, ctx->mcd_struct, 5);
-  
-  while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ 
-    /* sentence_print(stdout, ref, mcd_get_dico_label(ctx->mcd_struct)); */
-    while(1){
-       /* config_print(stdout,c);       */
-      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
-      postag = oracle_tagger(c, ref);
+  c = config_initial_no_dummy_word(conll_file, ctx->mcd_struct, 5);
 
-      fprintf(output_file, "%d", postag);
-      feat_vec_print(output_file, fv);
-      
-      if(postag != -1)
-	movement_tagger(c, postag, 0, 1);
-    }
+  while(!config_is_terminal(c)){
+    /* config_print(stdout,c);         */
+    if(ctx->f2p)
+      add_signature_to_words_in_queue(c->bf, ctx->f2p);
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
+    postag = oracle_tagger(c, NULL);
+    fprintf(output_file, "%d", postag);
+    feat_vec_print(output_file, fv);
+    movement_tagger(c, postag, 0, 1);
   }
 }
 
diff --git a/maca_trans_parser/src/movement_tagger.c b/maca_trans_parser/src/movement_tagger.c
index ff5e305..d1f46b6 100644
--- a/maca_trans_parser/src/movement_tagger.c
+++ b/maca_trans_parser/src/movement_tagger.c
@@ -7,15 +7,19 @@
 int movement_tagger(config *c, int postag, float score, int stream)
 {
   word *b0 = NULL;
+  int k = 5;
+  
   if(queue_is_empty(c->bf)) return 0;
 
-  b0 = queue_elt_n(c->bf, 0);
+  b0 = queue_remove(c->bf);
   word_set_pos(b0, postag); 
-  stack_push(c->st, queue_remove(c->bf));
+  stack_push(c->st, b0);
 
-  /* in stream mode, read a new word and add it to the buffer */
-  if(stream)
-     config_add_next_word_to_buffer(c); 
+  /* in stream mode, read a new word, add it to the buffer and keep only (k = 5) elts in the stack */
+  if(stream){
+    stack_trim_to_size(c->st, k); 
+    config_add_next_word_to_buffer(c);
+  }
 
   return 1;
 }
diff --git a/maca_trans_parser/src/oracle_tagger.c b/maca_trans_parser/src/oracle_tagger.c
index c08cc23..10d53d8 100644
--- a/maca_trans_parser/src/oracle_tagger.c
+++ b/maca_trans_parser/src/oracle_tagger.c
@@ -3,12 +3,15 @@
 int oracle_tagger(config *c, sentence *ref)
 {
   word *b0; /* next word in the bufer */
-  int b0_index;
-
+  /* int b0_index; */
+  int b0_pos;
   if(!queue_is_empty(c->bf)){
     b0 = queue_elt_n(c->bf, 0);
-    b0_index = word_get_index(b0);
-    return word_get_pos(ref->words[b0_index]);
+    b0_pos = word_get_pos(b0);
+    /* printf("b0_pos = %d\n", b0_pos); */
+    /* b0_index = word_get_index(b0); */
+    /* return word_get_pos(ref->words[b0_index]); */
+    return b0_pos;
   }
   return -1;
 }
diff --git a/maca_trans_parser/src/perceptron.c b/maca_trans_parser/src/perceptron.c
index 8b15609..3425a3c 100644
--- a/maca_trans_parser/src/perceptron.c
+++ b/maca_trans_parser/src/perceptron.c
@@ -1,9 +1,8 @@
 #include<stdio.h>
 #include<stdlib.h>
 #include<string.h>
-#include"feat_fct.h"
 #include"feature_table.h"
-#include"config2feat_vec.h"
+#include"util.h"
 
 void perceptron_avg(char *filename, feature_table *ft, int n_iter)
 {
@@ -18,7 +17,8 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter)
   int epoch;
   int i,j;
   float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));
-  feat_vec *fv = feat_vec_new(feature_types_nb);
+  /* feat_vec *fv = feat_vec_new(feature_types_nb); */
+  feat_vec *fv = feat_vec_new(1);
   char *token;
   feature_table *ft_sum = feature_table_new(ft->features_nb, ft->classes_nb);
   int counter = 1;
@@ -100,7 +100,8 @@ void perceptron(char *filename, feature_table *ft, int n_iter)
   int epoch;
   int i;
   float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));
-  feat_vec *fv = feat_vec_new(feature_types_nb);
+  /* feat_vec *fv = feat_vec_new(feature_types_nb); */
+  feat_vec *fv = feat_vec_new(1);
   char *token;
 
   for(epoch = 0; epoch < n_iter; epoch++){
diff --git a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c
index 4e142d2..53985be 100644
--- a/maca_trans_parser/src/queue.c
+++ b/maca_trans_parser/src/queue.c
@@ -28,7 +28,8 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct)
     }
     queue_add(bf, w);
   }
-  return bf->nbelem - 1; /* because of the dummy word */
+  /* return bf->nbelem - 1; */ /* because of the dummy word */
+  return bf->nbelem ; 
 }
 
 
diff --git a/maca_trans_parser/src/sentence.c b/maca_trans_parser/src/sentence.c
index 63cdb8b..750aaac 100644
--- a/maca_trans_parser/src/sentence.c
+++ b/maca_trans_parser/src/sentence.c
@@ -19,7 +19,7 @@ sentence *sentence_new(mcd *m, FILE *f)
 sentence *sentence_init(mcd *m, FILE *f)
 {
   sentence *s = sentence_new(m, f);
-  sentence_add_word(s, word_create_dummy(m));
+  sentence_add_word(s, word_create_dummy(m)); 
   return s;
 }
 
@@ -81,3 +81,23 @@ sentence *sentence_read(FILE *f, mcd *mcd_struct)
   }
   return s;
 }
+
+sentence *sentence_read_no_dummy_word(FILE *f, mcd *mcd_struct)
+{ /* reads words from f until a line starting with a newline or a space, or end of input; returns NULL if no word was read */
+  sentence *s = sentence_new(mcd_struct, f);
+  char buffer[1000];
+  word *w = NULL;
+  
+  while(fgets(buffer, sizeof(buffer), f)){
+    if(feof(f)) break;
+    if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */
+    w = word_parse_buffer(buffer, mcd_struct);
+    sentence_add_word(s, w);
+  }
+  
+  if(s->length == 0){ /* no dummy word is added here, so an empty sentence has length 0, not 1 (assumes sentence_new starts at 0 - confirm) */
+    sentence_free(s);
+    return NULL;
+  }
+  return s;
+}
diff --git a/maca_trans_parser/src/sentence.h b/maca_trans_parser/src/sentence.h
index 6a80509..cdd1019 100644
--- a/maca_trans_parser/src/sentence.h
+++ b/maca_trans_parser/src/sentence.h
@@ -18,6 +18,7 @@ sentence *sentence_init(mcd *m, FILE *f);
 
 void sentence_print(FILE *f, sentence *s, dico *dico_labels);
 sentence *sentence_read(FILE *f, mcd *mcd_struct);
+sentence *sentence_read_no_dummy_word(FILE *f, mcd *mcd_struct);
 void sentence_add_word(sentence *s, word *w);
 void sentence_free(sentence *s);
 
diff --git a/maca_trans_parser/src/simple_decoder.c b/maca_trans_parser/src/simple_decoder.c
index 8d43e99..b43dcbe 100644
--- a/maca_trans_parser/src/simple_decoder.c
+++ b/maca_trans_parser/src/simple_decoder.c
@@ -21,7 +21,7 @@ void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_la
   config *c = config_initial(f, ctx->mcd_struct, 0);
   
   /* read a sentence and put it in the buffer */
-  while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){
+  while(queue_read_sentence(c->bf, f, ctx->mcd_struct) > 1){
     while(!config_is_terminal(c)){
       config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
       mvt_code = feature_table_argmax(fv, ft, &max);
@@ -43,6 +43,8 @@ void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_la
     
     config_connect_subtrees(c, root_label); 
     depset_print2(stdout, c->ds, ctx->dico_labels);
+    fprintf(stdout, "\n");
+
     
     /* config_free(c);  */
     c = config_initial(f, ctx->mcd_struct, 0);
@@ -59,13 +61,14 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la
   int mvt_label;
   float max;
   feat_vec *fv = feat_vec_new(feature_types_nb);
-  config *c = config_initial(f, ctx->mcd_struct, 5);
+  config *c = NULL;
 
   /* when in stream mode, force to renumber the tokens (ugly !) */
-  ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1;
+  /* ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; */
 
+  c = config_initial(f, ctx->mcd_struct, 5);
   while(!config_is_terminal(c)){
-    config_print(stdout, c);
+    /* config_print(stdout, c); */
     config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
     /* feat_vec_print_string(fv, ctx->d_perceptron_features); */
     mvt_code = feature_table_argmax(fv, ft, &max);
@@ -73,9 +76,10 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la
     mvt_label = movement_label(mvt_code);
 
     /* printf("code predicted = %d\n", mvt_code); */
-    
-    if((stack_height(c->st)==1)  && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){       /* sentence is complete */
 
+    /* sentence is complete */
+    if((stack_height(c->st)==1)  && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){      
+    /* if(mvt_label == root_label){        */
       /* config_print(stdout, c);  */
       
       /* create the root arc */
@@ -87,9 +91,13 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la
       /* config_print(stdout, c);   */
 
       config_connect_subtrees(c, root_label); 
-      /*      depset_print_new_index(stdout, c->ds, ctx->dico_labels);*/
-      depset_print2(stdout, c->ds, ctx->dico_labels);
+      /* depset_print_new_index(stdout, c->ds, ctx->dico_labels); */
 
+      if(ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] == -1)
+	depset_print3(stdout, c->ds, ctx->dico_labels);
+      else
+	depset_print2(stdout, c->ds, ctx->dico_labels);
+      
       /* pop the dummy word */
       stack_pop(c->st);
       /* remplace it with a fresh one */
@@ -98,7 +106,7 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la
       /* empty depset */
       depset_free(c->ds);
       c->ds = depset_new();
-      c->current_index = queue_renumber_words(c->bf);
+      /* c->current_index = queue_renumber_words(c->bf); */
       continue;
     }
 
diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c
index 9f58cdd..882a2aa 100644
--- a/maca_trans_parser/src/simple_decoder_tagger.c
+++ b/maca_trans_parser/src/simple_decoder_tagger.c
@@ -31,11 +31,12 @@ void simple_decoder_buffer(context *ctx)
   int i;
   word *w = NULL;
   FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
-  config *c = config_initial(f, ctx->mcd_struct, 0);
+  config *c = config_initial_no_dummy_word(f, ctx->mcd_struct, 0);
 
   /* read a sentence and put it in the buffer */
   while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){
-    queue_remove(c->bf); /* get rid of dummy token */
+    /* queue_remove(c->bf); */
+    /* get rid of dummy token */
     if(ctx->f2p)
       add_signature_to_words_in_queue(c->bf, ctx->f2p);
     while(!config_is_terminal(c)){
@@ -54,7 +55,7 @@ void simple_decoder_buffer(context *ctx)
     printf("\n");
 
     /* config_free(c);  */
-    c = config_initial(f, ctx->mcd_struct, 0);
+    c = config_initial_no_dummy_word(f, ctx->mcd_struct, 0);
   }
   if(ctx->input_filename)
     fclose(f);
@@ -65,16 +66,33 @@ void simple_decoder_stream(context *ctx)
 {
   config *c;
   feat_vec *fv = feat_vec_new(feature_types_nb);
-  FILE *f = NULL; 
+  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
+  feature_table *ft =  feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  int postag;
+  float max;
+  word *w;
+  dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
 
-  /* when in stream mode, force to renumber the tokens (ugly !) */
-  ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1;
-  
-  c = config_initial(f, ctx->mcd_struct, 5);
+   c = config_initial_no_dummy_word(f, ctx->mcd_struct, 5);
   while(!config_is_terminal(c)){
-    config_print(stdout, c);
+    if(ctx->f2p)
+      add_signature_to_words_in_queue(c->bf, ctx->f2p);
+    /* config_print(stdout, c); */
     config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+    
+    /* feat_vec_print(stdout, fv); */
+    postag = feature_table_argmax(fv, ft, &max);
+    /* printf("postag = %d\n", postag); */
+
+    w = queue_elt_n(c->bf, 0);
+    printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag));
+
+    if(postag != -1)
+      movement_tagger(c, postag, max, 1);
+    
+    
   }
+
   /* config_print(stdout, c);  */
   /* config_free(c); */
 }
diff --git a/maca_trans_parser/src/stack.c b/maca_trans_parser/src/stack.c
index 2672732..cbd419e 100644
--- a/maca_trans_parser/src/stack.c
+++ b/maca_trans_parser/src/stack.c
@@ -31,6 +31,16 @@ stack *stack_new(void)
   s->top = 0;
   return s;
 }
+/*
+stack *stack_static_new(int size)
+{
+  stack *s = (stack *)memalloc(sizeof(stack));
+  s->size = size;
+  s->array = (word **)memalloc(size * sizeof(word*)));
+  s->top = 0;
+  return s;
+}
+*/
 
 stack *stack_copy(stack *s)
 {
@@ -87,3 +97,18 @@ void stack_print(FILE *buffer, stack *s)
     fprintf(buffer, "]");
   }
 }
+
+/* keep only the k topmost elements of stack s, shifted down to the bottom of the array; does nothing if s holds k elements or fewer (assumes k >= 0) */
+
+void stack_trim_to_size(stack *s, int k) 
+{
+  int i, delta;
+  
+  if(stack_nbelem(s) > k){
+    delta = stack_nbelem(s) - k;
+    for(i = 0; i < k; i++){ /* move exactly k slots: bounding by s->top read array[i + delta] past the last valid element */
+      s->array[i] = s->array[i + delta]; 
+    }
+    s->top = k;
+  }
+}
diff --git a/maca_trans_parser/src/stack.h b/maca_trans_parser/src/stack.h
index 2ccd6e2..de01374 100644
--- a/maca_trans_parser/src/stack.h
+++ b/maca_trans_parser/src/stack.h
@@ -23,4 +23,5 @@ void stack_print(FILE *buffer, stack *s);
 void stack_free(stack *s);
 int stack_is_empty(stack *s);
 /* int stack_height(stack *s); */
+void stack_trim_to_size(stack *s, int k);
 #endif
diff --git a/maca_trans_parser/src/word.c b/maca_trans_parser/src/word.c
index 277b15e..9d46d25 100644
--- a/maca_trans_parser/src/word.c
+++ b/maca_trans_parser/src/word.c
@@ -38,6 +38,9 @@ word *word_read(FILE *f, mcd *mcd_struct)
   return NULL;
 }
 
+/* parse string buffer to extract the different word features */
+/* codes of the word features are stored in feat_array */
+
 word *word_parse_buffer(char *buffer, mcd *mcd_struct)
 {
   char *token;   
diff --git a/maca_trans_parser/src/word.h b/maca_trans_parser/src/word.h
index 9414198..2ecd333 100644
--- a/maca_trans_parser/src/word.h
+++ b/maca_trans_parser/src/word.h
@@ -73,15 +73,16 @@
 #define word_set_V(w, val)     (w)->feat_array[FEAT_TYPE_V] = (val)
 #define word_set_W(w, val)     (w)->feat_array[FEAT_TYPE_W] = (val)
 #define word_set_X(w, val)     (w)->feat_array[FEAT_TYPE_X] = (val)
+
 #define word_set_Y(w, val)     (w)->feat_array[FEAT_TYPE_Y] = (val)
 #define word_set_Z(w, val)     (w)->feat_array[FEAT_TYPE_Z] = (val)
 #define word_set_signature(w, val)     (w)->signature = (val)
 
 typedef struct _word {
-  int feat_array[FEAT_TYPE_NB];
-  char *input;
-  int U1;        /* does the form begin with an uppercase character */
-  int signature; /* pos tags that this form can have (represented as a boolean string) */
+  int feat_array[FEAT_TYPE_NB]; /* array containing the codes corresponding to the different word features */
+  char *input;                  /* the string corresponding to the actual line in the corpus file */
+  int U1;                       /* does the form begin with an uppercase character */
+  int signature;                /* pos tags that this form can have (represented as a boolean string) */
   int label;
   char *form;
 } word;
-- 
GitLab