From cc969704be43c65441954ada9d103a01bcff61ec Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Wed, 26 Oct 2016 11:36:32 -0400
Subject: [PATCH] changed conditions in the parser for sentences that do not
 end with punctuation (workaround) added a new lemmatizer that uses the
 maca_trans_xx architecture added an experimental, very limited interpreter
 (maca_trans_interpreter) to be developed if useful changed behaviour of
 tagger and parser : if they have pos or lemma in the input, they are kept
 unchanged code refactoring

---
 CMakeLists.txt                                |   2 +-
 maca_common/include/form2pos.h                |   2 +-
 maca_common/include/mcd.h                     |   1 +
 maca_common/include/util.h                    |   1 +
 maca_common/include/word.h                    |   2 +
 maca_common/src/form2pos.c                    |  23 ++
 maca_common/src/util.c                        |  10 +
 maca_common/src/word.c                        |  38 +++-
 maca_lemmatizer/src/maca_lemmatizer.c         |  83 ++++++--
 maca_tools/src/mcf2conll.c                    |  26 +--
 maca_trans_parser/CMakeLists.txt              |  22 +-
 maca_trans_parser/src/context.c               |   2 +-
 maca_trans_parser/src/context.h               |   5 +-
 .../src/maca_trans_interpreter.c              | 150 +++++++++++++
 maca_trans_parser/src/maca_trans_lemmatizer.c | 198 ++++++++++++++++++
 maca_trans_parser/src/maca_trans_parser.c     |   2 -
 .../src/maca_trans_parser_arc_eager_mcf2cff.c |  52 ++---
 .../src/maca_trans_tagger_mcf2cff.c           |  82 ++++----
 .../maca_trans_tagparser_arc_eager_mcf2cff.c  |  49 ++---
 maca_trans_parser/src/movement_tagger.c       |   4 +-
 maca_trans_parser/src/movement_tagger.h       |   2 +-
 maca_trans_parser/src/movements.c             |   4 +-
 .../src/simple_decoder_parser_arc_eager.c     |  20 +-
 maca_trans_parser/src/simple_decoder_tagger.c | 114 ++++++++--
 .../src/simple_decoder_tagparser_arc_eager.c  |  21 +-
 perceptron/exec/cff_cutoff.c                  |   4 +-
 perceptron/lib/src/feature_table.c            |   1 +
 27 files changed, 715 insertions(+), 205 deletions(-)
 create mode 100644 maca_trans_parser/src/maca_trans_interpreter.c
 create mode 100644 maca_trans_parser/src/maca_trans_lemmatizer.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 517442d..2ebd0e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@ include_directories(perceptron/lib/include)
 add_subdirectory(maca_common)
 add_subdirectory(maca_tools)
 add_subdirectory(perceptron)
-add_subdirectory(maca_lemmatizer)
+#add_subdirectory(maca_lemmatizer)
 add_subdirectory(maca_trans_parser)
 add_subdirectory(maca_crf_tagger)
 
diff --git a/maca_common/include/form2pos.h b/maca_common/include/form2pos.h
index 515618c..56f5cbc 100644
--- a/maca_common/include/form2pos.h
+++ b/maca_common/include/form2pos.h
@@ -19,6 +19,6 @@ void form2pos_free(form2pos *f2p);
 form2pos *form2pos_read(char *filename);
 int form2pos_get_signature(form2pos *f2p, char *form);
 int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos);
-
+int form2pos_word_is_non_ambiguous(form2pos *f2p, char *form, char **pos);
 
 #endif
diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h
index 622d4c3..fe4eecf 100644
--- a/maca_common/include/mcd.h
+++ b/maca_common/include/mcd.h
@@ -92,6 +92,7 @@
 
 #define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v)
 
+
 /* mcd (multi column description) files describe the format of corpus files */
 /* every line of an mcd file describes the content of a column of the corpus file */
 /* every line contains four fields separated by a space character */
diff --git a/maca_common/include/util.h b/maca_common/include/util.h
index 26c0952..7046269 100644
--- a/maca_common/include/util.h
+++ b/maca_common/include/util.h
@@ -6,4 +6,5 @@ void myfree(void *ptr);
 void *memalloc(size_t s);
 FILE *myfopen(const char *path, const char *mode);
 FILE *myfopen_no_exit(const char *path, const char *mode);
+char *to_lower_string(char *s);
 #endif
diff --git a/maca_common/include/word.h b/maca_common/include/word.h
index 3990714..00e7808 100644
--- a/maca_common/include/word.h
+++ b/maca_common/include/word.h
@@ -109,5 +109,7 @@ word *word_read(FILE *f, mcd *mcd_struct);
 word *word_parse_buffer(char *buffer, mcd *mcd_struct);
 int word_is_eos(word *w, mcd *mcd_struct);
 int word_get_gov_index(word *w);
+void word_print_col_n(FILE *f, word *w, int n);
+void word_sprint_col_n(char *s, word *w, int n);
 
 #endif
diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c
index b2efb19..dccc016 100644
--- a/maca_common/src/form2pos.c
+++ b/maca_common/src/form2pos.c
@@ -29,6 +29,29 @@ void form2pos_free(form2pos *f2p)
   free(f2p);
 }
 
+
+/* Tell whether form has exactly one possible part of speech according to f2p.
+   The signature string is read as one digit per pos code ('0' = not possible).
+   Returns 1 and stores the pos name in *pos, else returns 0 with *pos = NULL. */
+int form2pos_word_is_non_ambiguous(form2pos *f2p, char *form, char **pos)
+{
+  int pos_code = -1;   /* index of the last non '0' digit seen */
+  int sum = 0;
+  int signature = form2pos_get_signature(f2p, form);
+  char *signature_str = dico_int2string(f2p->d_signature, signature);
+
+  *pos = NULL;         /* defined on every path, including the early return */
+  if(signature_str == NULL) return 0;
+  for(int i = 0; signature_str[i] != '\0'; i++){
+    sum += signature_str[i] - '0';
+    if(signature_str[i] != '0') pos_code = i;
+  }
+  if(sum != 1) return 0;
+
+  *pos = dico_int2string(f2p->d_pos, pos_code);
+  return 1;
+}
+
 form2pos *form2pos_read(char *filename)
 {
   FILE *f = myfopen_no_exit(filename, "r");
diff --git a/maca_common/src/util.c b/maca_common/src/util.c
index 4ff0352..9b16c75 100644
--- a/maca_common/src/util.c
+++ b/maca_common/src/util.c
@@ -1,5 +1,7 @@
 #include<stdlib.h>
 #include<stdio.h>
+#include<string.h>
+#include<ctype.h>
 
 void myfree(void *ptr)
 {
@@ -34,3 +36,11 @@ FILE *myfopen_no_exit(const char *path, const char *mode)
   }
   return f;
 }
+
+char *to_lower_string(char *s) /* lower-cases s in place, returns s */
+{
+  char *p;
+  for(p = s; *p != '\0'; p++) /* single pass: no strlen() call per iteration */
+    *p = (char)tolower((unsigned char)*p); /* negative char values are UB for tolower */
+  return s;
+}
diff --git a/maca_common/src/word.c b/maca_common/src/word.c
index ebf00c6..b642093 100644
--- a/maca_common/src/word.c
+++ b/maca_common/src/word.c
@@ -59,7 +59,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct)
   w = word_new(buffer);
   token = strtok(buffer, "\t");
   do{
-    if((col < mcd_struct->nb_col) &&  (mcd_struct->wf[col] != -1)){
+    if((col < mcd_struct->nb_col) &&  (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){
       w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col);
     }
     if(mcd_struct->wf[col] == MCD_WF_FORM){
@@ -148,3 +148,39 @@ int word_get_gov_index(word *w)
   index = (word_get_index(w)) + (word_get_gov(w));
   return index; 
 }
+
+/* Write on f the content of column n of the input line of w.
+   Columns are separated by tabs and numbered from 0; nothing is written
+   when w has no input line or when the line has no column n. */
+void word_print_col_n(FILE *f, word *w, int n)
+{
+  const char *p = w->input;
+  int col = 0;
+
+  if(p == NULL) return;
+  for(; *p != '\0'; p++){
+    if(*p == '\t')
+      col++;
+    else if(col == n)
+      fputc(*p, f);
+  }
+}
+
+/* Copy column n (tab separated, first column is 0) of w->input into s.
+   s always ends up NUL terminated: it is set to "" when the word has no input
+   line or no column n. s must be large enough; no bound is checked here. */
+void word_sprint_col_n(char *s, word *w, int n)
+{
+  const char *p = w->input;
+  int col = 0;
+  int j = 0;
+  s[0] = '\0';           /* callers read s even when w->input is NULL */
+  if(p == NULL) return;
+  for(; *p != '\0'; p++){
+    if(*p == '\t')
+      col++;
+    else if(col == n)
+      s[j++] = *p;
+  }
+  s[j] = '\0';
+}
diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c
index 76b640a..5d9cacd 100644
--- a/maca_lemmatizer/src/maca_lemmatizer.c
+++ b/maca_lemmatizer/src/maca_lemmatizer.c
@@ -86,14 +86,46 @@ char *to_lower_string(char *s)
     s[i] = tolower(s[i]);
   return s;
 }
+/*
+void print_word(char *input, mcd *mcd_struct, char *lemma)
+{
+  char *buffer = NULL;
+  char *token = NULL;
+  int col_nb = 0;
+  if(mcd_get_lemma_col(mcd_struct) == -1){
+    printf("%s\t%s\n", input, lemma);
+  }
+  else{
+    buffer = strdup(input);
+    token = strtok(buffer, "\t");
+    col_nb = 0;
+    while(token){
+      if(col_nb != 0) printf("\t");
+      if(col_nb == mcd_get_lemma_col(mcd_struct))
+	printf("%s", lemma);
+      else
+	word_print_col_n(stdout, w->input, col_nb);
+      col_nb++;
+      token = strtok(NULL, "\t");
+    }
+    if(col_nb <= mcd_get_lemma_col(mcd_struct))
+      printf("\t%s", lemma);
+    printf("\n");
+    free(buffer);
+  }
+}
+
+*/
 
 
 int main(int argc, char *argv[])
 {
   hash *form_pos_ht = hash_new(1000000);
   char buffer[10000];
+  char *buffer_copy;
   char *form;
   char *pos;
+
   char *token;
   int column_nb;
   char form_pos[500];
@@ -103,6 +135,7 @@ int main(int argc, char *argv[])
   context *ctx;
   int form_column;
   int pos_column;
+  int lemma_column;
   FILE *f = NULL;
 
   ctx = context_read_options(argc, argv);
@@ -123,24 +156,31 @@ int main(int argc, char *argv[])
     f = stdin;
   else
     f = myfopen(ctx->conll_filename, "r");
- 
+
+  lemma_column = ctx->mcd_struct->wf2col[MCD_WF_LEMMA];
+  
   lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode);
   
   /* look for a valid word */
-  while(fgets(buffer, 10000, f)){
+  buffer_copy = strdup(buffer);
+  while(fgets(buffer_copy, 10000, f)){
     if(feof(f)) return 0; /* no more words to read */
-    if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){
+    if((buffer_copy[0] == '\n') || (buffer_copy[0] == ' ') || (buffer_copy[0] == '\t')){
       printf("\n");
       continue;
     }
     
-    buffer[strlen(buffer)-1] = '\0';
-    printf("%s", buffer);
-    token = strtok(buffer, "\t");
+    buffer_copy[strlen(buffer_copy)-1] = '\0';
+    printf("%s", buffer_copy);
+    token = strtok(buffer_copy, "\t");
     column_nb = 0;
     form = NULL;
     pos = NULL;
+    lemma = NULL;
     do{
+      if(column_nb == lemma_column) /* lemma is present in the input file */
+	if(strcmp(token, "_")) /* and it is not an underscore */
+	  lemma = strdup(token);
       /* if((column_nb < ctx->mcd_struct->nb_col) && (column_nb == form_column)) */
       if(column_nb == form_column)
 	form = strdup(token);
@@ -151,32 +191,37 @@ int main(int argc, char *argv[])
       column_nb++;
     } while((token = strtok(NULL , "\t")));
     
-    strcpy(form_pos, form);
-    strcat(form_pos, "/");
-    strcat(form_pos, pos);
-    index_form_pos = hash_get_val(form_pos_ht, form_pos);
-    if(index_form_pos != HASH_INVALID_VAL){
-      lemma = lemma_array[index_form_pos];
-    }
-    else{
-      to_lower_string(form_pos);
+    if(lemma == NULL){
+      strcpy(form_pos, form);
+      strcat(form_pos, "/");
+      strcat(form_pos, pos);
       index_form_pos = hash_get_val(form_pos_ht, form_pos);
       if(index_form_pos != HASH_INVALID_VAL){
 	lemma = lemma_array[index_form_pos];
       }
-      else
-	if(ctx->verbose){
-	  fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
+      else{
+	to_lower_string(form_pos);
+	index_form_pos = hash_get_val(form_pos_ht, form_pos);
+	if(index_form_pos != HASH_INVALID_VAL){
+	  lemma = lemma_array[index_form_pos];
 	}
+	else
+	  if(ctx->verbose){
+	    fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
+	  }
 	lemma = form;
+      }
     }
     
-    /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma);  */
+    /* print_word(buffer, ctx->mcd_struct, lemma); */
+
+  /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma);  */
     printf("\t%s\n", lemma);
     
     if(pos)free(pos);
     if(form)free(form);
   }
+  free(buffer_copy);
   free(lemma_array);
   hash_free(form_pos_ht);
 
diff --git a/maca_tools/src/mcf2conll.c b/maca_tools/src/mcf2conll.c
index fa5f7cf..49f3e34 100644
--- a/maca_tools/src/mcf2conll.c
+++ b/maca_tools/src/mcf2conll.c
@@ -52,20 +52,6 @@ void mcf2conll_check_options(context *ctx){
   }
 }
 
-void str_print_col_n(FILE *f, char *buffer, int n)
-{
-  int i;
-  int col = 0;
-  int l= strlen(buffer);
-  for(i=0; i < l; i++){
-    if(buffer[i] == '\t') {
-      col++;
-      continue;
-    }
-    if(col == n)
-      fprintf(f, "%c", buffer[i]);
-  }
-}
 
 
 context *context_read_options(int argc, char *argv[])
@@ -151,31 +137,31 @@ int main(int argc, char *argv[])
       printf("%d\t", index);
       
       if(form_col != -1)
-	str_print_col_n(output_file, w->input, form_col);
+	word_print_col_n(output_file, w, form_col);
       else
 	fprintf(output_file, "_");
       fprintf(output_file, "\t");
       
       if(lemma_col != -1)
-	str_print_col_n(output_file, w->input, lemma_col);
+	word_print_col_n(output_file, w, lemma_col);
       else
 	fprintf(output_file, "_");
       fprintf(output_file, "\t");
       
       if(cpos_col != -1)
-	str_print_col_n(output_file, w->input, cpos_col);
+	word_print_col_n(output_file, w, cpos_col);
       else
 	fprintf(output_file, "_");
       fprintf(output_file, "\t");
       
       if(pos_col != -1)
-	str_print_col_n(output_file, w->input, pos_col);
+	word_print_col_n(output_file, w, pos_col);
       else
 	fprintf(output_file, "_");
       fprintf(output_file, "\t");
       
       if(feats_col != -1)
-	str_print_col_n(output_file, w->input, feats_col);
+	word_print_col_n(output_file, w, feats_col);
       else
 	fprintf(output_file, "_");
       fprintf(output_file, "\t");
@@ -190,7 +176,7 @@ int main(int argc, char *argv[])
 	fprintf(output_file, "_\t");
     
       if(label_col != -1)
-	str_print_col_n(output_file, w->input, label_col);
+	word_print_col_n(output_file, w, label_col);
       else
 	fprintf(output_file, "_");
       fprintf(output_file, "\t");
diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt
index 85d67be..2b199d3 100644
--- a/maca_trans_parser/CMakeLists.txt
+++ b/maca_trans_parser/CMakeLists.txt
@@ -53,11 +53,11 @@ install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin)
 #target_link_libraries(maca_trans_parser_mcf2cff maca_common)
 #install (TARGETS maca_trans_parser_mcf2cff DESTINATION bin)
 
-#add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c)
-#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron)
-#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse)
-#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common)
-#install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin)
+add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c)
+target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron)
+target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse)
+target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common)
+install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin)
 
 add_executable(maca_trans_tagparser_arc_eager_mcf2cff ./src/maca_trans_tagparser_arc_eager_mcf2cff.c)
 target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff perceptron)
@@ -119,6 +119,18 @@ target_link_libraries(cff2fann transparse)
 target_link_libraries(cff2fann maca_common)
 install (TARGETS cff2fann DESTINATION bin)
 
+add_executable(maca_trans_interpreter ./src/maca_trans_interpreter.c)
+target_compile_options(maca_trans_interpreter PRIVATE -Wall) # PRIVATE: applied when compiling this target
+target_link_libraries(maca_trans_interpreter transparse)
+target_link_libraries(maca_trans_interpreter maca_common)
+install (TARGETS maca_trans_interpreter DESTINATION bin)
+
+add_executable(maca_lemmatizer ./src/maca_trans_lemmatizer.c)
+target_compile_options(maca_lemmatizer PRIVATE -Wall) # PRIVATE: applied when compiling this target
+target_link_libraries(maca_lemmatizer transparse)
+target_link_libraries(maca_lemmatizer maca_common)
+install (TARGETS maca_lemmatizer DESTINATION bin)
+
 
 #add_executable(test_w2v ./src/test_w2v.c)
 #target_link_libraries(test_w2v transparse)
diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c
index e1beddd..a9b9d43 100644
--- a/maca_trans_parser/src/context.c
+++ b/maca_trans_parser/src/context.c
@@ -272,7 +272,7 @@ context *context_read_options(int argc, char *argv[])
     if(ctx->conll)
      ctx->mcd_struct = mcd_build_conll07(); 
     else
-      ctx->mcd_struct = mcd_build_wplgf();
+      ctx->mcd_struct = mcd_build_wplgfs();
       /* ctx->mcd_struct = mcd_build_ifpls(); */
   
   return ctx;
diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h
index afdcd11..611dd10 100644
--- a/maca_trans_parser/src/context.h
+++ b/maca_trans_parser/src/context.h
@@ -4,7 +4,6 @@
 #define TEST_MODE 1
 #define TRAIN_MODE 2
 
-
 #define DEFAULT_MULTI_COL_DESC_FILENAME "maca_trans_parser.mcd" 
 #define DEFAULT_FEATURES_MODEL_FILENAME "maca_trans_parser.fm" 
 #define DEFAULT_VOCABS_FILENAME "maca_trans_parser.vocab" 
@@ -14,13 +13,14 @@
 #define DEFAULT_FEATURES_MODEL_TAGGER_FILENAME "maca_trans_tagger.fm" 
 #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" 
 #define DEFAULT_MODEL_TAGGER_FILENAME  "maca_trans_tagger.model" 
-#define DEFAULT_F2P_FILENAME "fP" 
 
 #define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd" 
 #define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm" 
 #define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab" 
 #define DEFAULT_MODEL_TAGPARSER_FILENAME  "maca_trans_tagparser.model" 
+
 #define DEFAULT_F2P_FILENAME "fP" 
+#define DEFAULT_FPLM_FILENAME "fplm" 
 
 #include "dico_vec.h"
 #include "feat_model.h"
@@ -38,6 +38,7 @@ typedef struct {
   char *fann_filename;
   char *stag_desc_filename;
   char *f2p_filename;
+  char *fplm_filename;
   int hidden_neurons_nb;
   int iteration_nb;
   int debug_mode;
diff --git a/maca_trans_parser/src/maca_trans_interpreter.c b/maca_trans_parser/src/maca_trans_interpreter.c
new file mode 100644
index 0000000..281d3aa
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_interpreter.c
@@ -0,0 +1,150 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"util.h"
+#include"mcd.h"
+#include"config.h"
+#include"word_buffer.h"
+#include"movements.h"
+
+#define LONGUEUR_LIGNE 1000
+
+#define MODE_TAGGER 0
+#define MODE_PARSER 1
+#define MODE_TAGPARSER 2
+
+/* Print the list of interpreter commands on stdout, one "name<TAB>description" per line. */
+void help_message(void)
+{
+  fputs("help\t print this message\n", stdout);
+  fputs("verbose\t toggle verbose mode\n", stdout);
+  fputs("quit\t quit interpreter\n", stdout);
+  fputs("load_mcd\t load mcd file\n", stdout);
+  fputs("load_mcf\t load mcf file\n", stdout);
+  fputs("open_mcf\t open mcf file\n", stdout);
+  fputs("config_new\t \n", stdout);
+  fputs("config_print\t print configuration\n", stdout);
+  fputs("shift\t  perform a shift movement\n", stdout);
+  fputs("shift_undo\t \n", stdout);
+  fputs("parser\t switch to parser mode\n", stdout);
+  fputs("tagger\t switch to tagger mode\n", stdout);
+  fputs("tagparser\t switch to tagparser mode\n", stdout);
+}
+
+int main(int argc, char *argv[]) /* interactive command loop; argc and argv are not used */
+{
+  char ligne[LONGUEUR_LIGNE]; /* raw input line */
+  char commande[LONGUEUR_LIGNE], argument[LONGUEUR_LIGNE]; /* first and second word of the line */
+  int n;
+  mcd *mcd_struct = NULL;
+  char *mcd_filename = NULL;
+  char *mcf_filename = NULL;
+  FILE *mcf_file = NULL;
+  int verbose = 0;
+  word_buffer *wb = NULL; /* never assigned nor read in this function */
+  config *c = NULL;
+  int mode = MODE_PARSER;
+  /* prompt, read one line, dispatch on its first word; the loop is left only through exit() */
+  while(1){
+    printf("> ");
+    /* when fgets returns NULL the session ends like "quit", with exit status 1 */
+    if(fgets(ligne, LONGUEUR_LIGNE, stdin) == NULL) {
+      printf("au revoir !\n");
+      exit(1);
+    }
+    commande[0] = argument[0] = '\0';
+    n = sscanf(ligne, "%s %s\n", commande, argument);
+     /* printf("ligne = %s n = %d commande = %s argument = %s\n", ligne, n, commande, argument);  */
+    /* n == -1: sscanf could not read any word from the line, prompt again */
+    if(n == -1) continue;
+    if(!strcmp(commande, "quit")){ 
+      printf("au revoir !\n");
+      exit(1);
+    }
+
+    if(!strcmp(commande, "verbose")){ 
+      verbose = (verbose == 0) ? 1 : 0;
+      printf("verbose = %d\n", verbose);
+      continue;
+    }
+
+    if(!strcmp(commande, "help")){
+      help_message();
+      continue;
+    }
+
+    /* set mode */
+    
+    if(!strcmp(commande, "parser")){
+      mode = MODE_PARSER;
+      if(verbose)
+	fprintf(stdout, "mode = parser\n");
+      continue;
+    }
+    
+    if(!strcmp(commande, "tagger")){
+      mode = MODE_TAGGER;
+      if(verbose)
+	fprintf(stdout, "mode = tagger\n");
+      continue;
+    }
+    if(!strcmp(commande, "tagparser")){
+      mode = MODE_TAGPARSER;
+      if(verbose)
+	fprintf(stdout, "mode = tagparser\n");
+      continue;
+    }
+    /* "mode" prints the name of the current mode */
+    if(!strcmp(commande, "mode")){
+      if(mode == MODE_PARSER){fprintf(stdout, "parser\n"); continue;}
+      if(mode == MODE_TAGGER){fprintf(stdout, "tagger\n"); continue;}
+      if(mode == MODE_TAGPARSER){fprintf(stdout, "tagparser\n"); continue;}
+    }
+    /* file commands: the strdup'ed names are kept in locals and not freed in this function */
+    if(!strcmp(commande, "load_mcd")){
+      mcd_filename = strdup(argument);
+      mcd_struct = mcd_read(mcd_filename, verbose);
+      continue;
+    }
+
+    if(!strcmp(commande, "load_mcf")){
+      mcf_filename = strdup(argument);
+      word_buffer_load_mcf(mcf_filename, mcd_struct); /* return value not stored; mcd_struct is NULL unless load_mcd ran */
+      continue;
+    }
+
+    if(!strcmp(commande, "open_mcf")){
+      mcf_filename = strdup(argument);
+      mcf_file = myfopen(mcf_filename, "r"); 
+      continue;
+    }
+    /* config_new receives mcf_file and mcd_struct as they are now (NULL unless open_mcf / load_mcd ran) */
+    if(!strcmp(commande, "config_new")){
+      c = config_new(mcf_file, mcd_struct, 5);    
+      continue;
+    }
+    /* from here on c is passed as is: it is still NULL if config_new was never issued */
+    if(!strcmp(commande, "config_print")){
+      config_print(stdout, c);
+      continue;
+    }
+
+    /* movements */
+
+    if(!strcmp(commande, "shift")){
+      movement_shift(c, 0);
+      config_print(stdout, c);
+      continue;
+    }
+
+    if(!strcmp(commande, "shift_undo")){
+      movement_shift_undo(c);
+      config_print(stdout, c);
+      continue;
+    }
+
+    /* any other command falls through to here and is silently ignored */
+
+    
+  }
+}
diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c
new file mode 100644
index 0000000..2cda79f
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_lemmatizer.c
@@ -0,0 +1,198 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include<ctype.h>
+
+#include"context.h"
+#include"dico.h"
+
+void maca_lemmatizer_help_message(context *ctx) /* usage message: chains the context_*_help_message printers */
+{
+  context_general_help_message(ctx);
+  context_beam_help_message(ctx);
+  context_conll_help_message(ctx);
+  fprintf(stderr, "INPUT\n"); /* heading written before the two calls below */
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+}
+
+/* Print the usage message and terminate (exit status 1) when help was requested. */
+void maca_lemmatizer_check_options(context *ctx)
+{
+  if(!ctx->help) return;
+  maca_lemmatizer_help_message(ctx);
+  exit(1);
+}
+
+/* Give ctx->fplm_filename its default value when none was supplied:
+   <data dir>/<language>/bin/DEFAULT_FPLM_FILENAME, where <data dir> is
+   ctx->maca_data_path if set, else the MACAON_DIR environment variable.
+   Exits with status 1 when that default is needed but cannot be built. */
+void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx)
+{
+  char absolute_filename[500];
+  const char *data_dir = ctx->maca_data_path;
+  int len;
+
+  if(!ctx->fplm_filename){
+    if(data_dir == NULL) data_dir = getenv("MACAON_DIR"); /* NULL when the variable is unset */
+    if(data_dir == NULL){
+      fprintf(stderr, "no data path given and MACAON_DIR is not set, cannot locate the fplm file\n");
+      exit(1);
+    }
+    len = snprintf(absolute_filename, sizeof(absolute_filename), "%s/%s/bin/%s", data_dir, ctx->language, DEFAULT_FPLM_FILENAME);
+    if(len < 0 || (size_t)len >= sizeof(absolute_filename)){
+      fprintf(stderr, "fplm file name is too long\n");
+      exit(1);
+    }
+    ctx->fplm_filename = strdup(absolute_filename);
+  }
+  if(ctx->verbose)
+    fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename);
+}
+
+/* Load the fplm lexicon: lines of the form form<TAB>pos<TAB>lemma<TAB>morpho.
+   The k-th accepted entry stores its lemma at index k of the returned array and
+   maps the key "form/pos" to k in form_pos_ht. Other lines are skipped. */
+char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode)
+{
+  char form[1000];
+  char pos[1000];
+  char lemma[1000];
+  char morpho[1000];
+  char form_pos[2000];  /* "form/pos": 999 + 1 + 999 chars and the final NUL */
+  int num = 0;
+  char **lemma_array, **bigger;
+  int lemma_array_size = 10000;
+  char buffer[10000];
+  int fields_nb;
+  FILE *f= myfopen(fplm_filename, "r");
+
+  lemma_array = (char **)memalloc(lemma_array_size * sizeof(char *));
+
+  while(fgets(buffer, sizeof(buffer), f)){
+    form[0] = pos[0] = lemma[0] = morpho[0] = '\0'; /* defined content for the debug trace */
+    fields_nb = sscanf(buffer, "%999[^\t]\t%999s\t%999[^\t]\t%999s\n", form, pos, lemma, morpho);
+    if(fields_nb != 4){
+      if(debug_mode){
+	fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); 
+	fprintf(stderr, "incorrect fplm entry, skipping it\n");
+      }
+      continue;
+    }
+    if(num >= lemma_array_size){
+      lemma_array_size = 2 * (lemma_array_size) + 1;
+      bigger = realloc(lemma_array, (lemma_array_size) * sizeof(char *));
+      if(bigger == NULL){ fprintf(stderr, "read_fplm_file: out of memory\n"); exit(1); }
+      lemma_array = bigger;
+    }
+    snprintf(form_pos, sizeof(form_pos), "%s/%s", form, pos);
+    hash_add(form_pos_ht, strdup(form_pos), num);
+    lemma_array[num++] = strdup(lemma);
+  }
+  fclose(f);
+  return lemma_array;
+}
+
+
+/* Return the lemma recorded for the couple form/pos.
+   The key "form/pos" is tried first with the form as given, then with the form
+   lower-cased (pos is never modified). When neither key is in form_pos_ht the
+   form itself is returned, after a message on stderr if verbose is set. */
+char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose)
+{
+  char key[1000];
+  int entry;
+  int attempt;
+
+  for(attempt = 0; attempt < 2; attempt++){
+    strcpy(key, form);
+    if(attempt == 1)
+      to_lower_string(key);     /* second try: lower-cased form */
+    strcat(key, "/");
+    strcat(key, pos);
+
+    entry = hash_get_val(form_pos_ht, key);
+    if(entry != HASH_INVALID_VAL)
+      return lemma_array[entry];
+  }
+
+  /* unknown couple, even in lower case: fall back on the form */
+  if(verbose)
+    fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
+
+  return form;
+}
+
+/* a bit messy */
+/* Print the input line of w on stdout with lemma in the lemma column.
+   - no lemma column in mcd_struct: the line is printed followed by a tab and lemma;
+   - otherwise the content of the lemma column is replaced by lemma, or lemma is
+     appended as one more column when the line stops before the lemma column.
+   Columns are delimited by every single tab, so empty columns are preserved. */
+void print_word(word *w, mcd *mcd_struct, char *lemma)
+{
+  int lemma_col = mcd_get_lemma_col(mcd_struct);
+  const char *col_start = w->input;
+  size_t col_len;
+  int col_nb = 0;
+
+  if(lemma_col == -1){
+    printf("%s\t%s\n", w->input, lemma);
+    return;
+  }
+  for(;;){
+    col_len = strcspn(col_start, "\t");   /* length of the current column */
+    if(col_nb == lemma_col) printf("%s", lemma);
+    else printf("%.*s", (int)col_len, col_start);
+    col_nb++;
+    if(col_start[col_len] == '\0') break; /* last column of the line */
+    printf("\t");
+    col_start += col_len + 1;
+  }
+  if(col_nb <= lemma_col) printf("\t%s", lemma);
+  printf("\n");
+}
+
+
+int main(int argc, char *argv[]) /* lemmatizer: prints one line per word read */
+{
+  char lemma[200];
+  char form[200];
+  char pos[200];
+  char *chosen;
+  char **lemma_array = NULL;
+  word *b0;
+  config *c;
+  FILE *f;
+  context *ctx = context_read_options(argc, argv);
+  hash *form_pos_ht = hash_new(1000000);
+
+  maca_lemmatizer_check_options(ctx);
+  maca_lemmatizer_set_linguistic_resources_filenames(ctx);
+  lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode);
+
+  /* words are read from the input file when one is given, from stdin otherwise */
+  if(ctx->input_filename)
+    f = myfopen(ctx->input_filename, "r");
+  else
+    f = stdin;
+  c = config_new(f, ctx->mcd_struct, 5);
+  while(!config_is_terminal(c)){
+    b0 = word_buffer_b0(c->bf);
+    word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
+    word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
+    word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
+
+    /* a lemma found in the input (non empty and not "_") is kept, otherwise it is looked up */
+    chosen = (lemma[0] != '\0' && strcmp(lemma, "_") != 0) ? lemma : lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose);
+    print_word(b0, ctx->mcd_struct, chosen);
+    word_buffer_move_right(c->bf);
+  }
+  config_free(c);
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/maca_trans_parser.c b/maca_trans_parser/src/maca_trans_parser.c
index 6a3e6d7..d48a4d8 100644
--- a/maca_trans_parser/src/maca_trans_parser.c
+++ b/maca_trans_parser/src/maca_trans_parser.c
@@ -5,12 +5,10 @@
 #include<getopt.h>
 #include"context.h"
 #include"movement_parser.h"
-#include"oracle_parser.h"
 #include"oracle_parser_arc_eager.h"
 #include"feat_fct.h"
 #include"feature_table.h"
 #include"dico.h"
-#include"beam.h"
 #include"simple_decoder_parser_arc_eager.h"
 /*#include"dnn_decoder.h"*/
 #include"config2feat_vec.h"
diff --git a/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c
index b990c79..f0a16c0 100644
--- a/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c
+++ b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c
@@ -42,7 +42,7 @@ void maca_trans_parser_mcf2cff_check_options(context *ctx)
   }
 }
 
-void generate_training_file_stream(FILE *output_file, context *ctx)
+void generate_training_file(FILE *output_file, context *ctx)
 {
   config *c;
   int mvt_code;
@@ -66,14 +66,10 @@ void generate_training_file_stream(FILE *output_file, context *ctx)
   c = config_new(mcf_file, mcd_struct_hyp, 5);
   
   while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){
-    /*printf("************ REF ************\n");
-    word_buffer_print(stdout, ref);
-    printf("*****************************\n");*/
-    
     mvt_code = oracle_parser_arc_eager(c, ref, root_label);
     mvt_type = movement_parser_type(mvt_code);
     mvt_label = movement_parser_label(mvt_code);
-
+    
     if(ctx->debug_mode){
       config_print(stdout,c);           
       movement_parser_print(stdout, mvt_code, ctx->dico_labels);        
@@ -92,40 +88,32 @@ void generate_training_file_stream(FILE *output_file, context *ctx)
       config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
       feat_vec_print(output_file, fv);
     }
-    
-    if(mvt_type == MVT_PARSER_EOS){
+
+    switch(mvt_type){
+    case MVT_PARSER_EOS :
       movement_parser_eos(c);
       sentence_nb++;
-      fprintf(stderr, "sentence %d\n", sentence_nb);
-      if(word_buffer_is_last(ref))
-	break;
-    }
-    
-    if(mvt_type == MVT_PARSER_LEFT){
+      if((sentence_nb % 100) == 0)
+	fprintf(stderr, "sentence %d\n", sentence_nb);
+      /* if(word_buffer_is_last(ref)) */
+      break;
+    case MVT_PARSER_LEFT :
       movement_parser_left_arc(c, mvt_label);
-      continue;
-    }
-    
-    if(mvt_type == MVT_PARSER_RIGHT){
+      break;    
+    case MVT_PARSER_RIGHT :
       movement_parser_right_arc(c, mvt_label);
       word_buffer_move_right(ref);
-      continue;
-    }
-    
-    if(mvt_type == MVT_PARSER_REDUCE){
+      break;    
+    case MVT_PARSER_REDUCE :
       movement_parser_reduce(c);
-      continue;
-    }
-   
-    if(mvt_type == MVT_PARSER_ROOT){
+      break;   
+    case MVT_PARSER_ROOT :
       movement_parser_root(c, root_label);
-      continue;
-    }
-
-    if(mvt_type == MVT_PARSER_SHIFT){
+      break;
+    case MVT_PARSER_SHIFT :
       movement_parser_shift(c);
       word_buffer_move_right(ref);
-      continue;
+      break;
     }
   }
 }
@@ -174,7 +162,7 @@ int main(int argc, char *argv[])
   /* open output file */
   output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout;
   
-  generate_training_file_stream(output_file, ctx);
+  generate_training_file(output_file, ctx);
 
   if(ctx->mode == TRAIN_MODE)
     dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c
index 93f990a..01342e6 100644
--- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c
+++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c
@@ -12,17 +12,46 @@
 #include"word_emb.h"
 #include"config2feat_vec.h"
 
+#if 1
 void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
 {
   int i;
   word *w;
+  char lower_form[100]; /* scratch copy of the form for the lower-case retry */
 
   for(i = word_buffer_get_nbelem(bf) - 1; i >=0  ; i--){
     w = word_buffer_get_word_n(bf, i);
     if(word_get_signature(w) != -1) break;
     w->signature = form2pos_get_signature(f2p, w->form);
+    if((w->signature == -1) && (strlen(w->form) < sizeof(lower_form))){ /* retry in lower case, only if the form fits */
+      strcpy(lower_form, w->form);
+      to_lower_string(lower_form);
+      w->signature = form2pos_get_signature(f2p, lower_form);
+    }
   }
 }
+#endif
+
+#if 0
+/* Variant compiled out by the enclosing #if 0: besides storing the signature,
+   it sets the pos of every form that form2pos reports as non ambiguous. */
+void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p, dico *dico_pos)
+{
+  char *pos;
+  word *w;
+  int i = word_buffer_get_nbelem(bf);
+
+  /* scan from the last word backwards, stop at the first word whose signature is not -1 */
+  while(--i >= 0){
+    w = word_buffer_get_word_n(bf, i);
+    if(word_get_signature(w) != -1) break;
+    w->signature = form2pos_get_signature(f2p, w->form);
+    if(!form2pos_word_is_non_ambiguous(f2p, w->form, &pos))
+      continue;
+    word_set_pos(w, dico_string2int(dico_pos, pos));
+  }
+}
+#endif
 
 void maca_trans_parser_mcf2cff_help_message(context *ctx)
 {
@@ -56,64 +85,30 @@ void maca_trans_parser_mcf2cff_check_options(context *ctx)
   }
 }
 
-void generate_training_file_stream(FILE *output_file, context *ctx)
+void generate_training_file(FILE *output_file, context *ctx)
 {  
   config *c;
   feat_vec *fv = feat_vec_new(feature_types_nb);
   FILE *conll_file = myfopen(ctx->input_filename, "r");
   int postag;
-
+  /* dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); -- only needed by the disabled 3-argument add_signature variant */
+  /* Until the configuration is terminal: write the oracle tag code, then the feature vector, to output_file. */
   c = config_new(conll_file, ctx->mcd_struct, 5);
 
   while(!config_is_terminal(c)){
     /* config_print(stdout,c);         */
     if(ctx->f2p)
-      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);
+      /* disabled variant: add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */
+ add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); 
     config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
     postag = oracle_tagger(c, NULL);
     fprintf(output_file, "%d", postag);
     feat_vec_print(output_file, fv);
-    int res = movement_tagger(c, postag, 0, 1);
+    int res = movement_tagger(c, postag);
     if(res == 0) break;
   }
 }
 
-void generate_training_file_buffer(FILE *output_file, context *ctx)
-{  
-  config *c;
-  feat_vec *fv = feat_vec_new(feature_types_nb);
-  sentence *ref = NULL;
-  int sentence_nb = 0;
-  FILE *conll_file = myfopen(ctx->input_filename, "r");
-  FILE *conll_file_ref = myfopen(ctx->input_filename, "r");
-  int postag;
-  c = config_new(conll_file, ctx->mcd_struct, 0);
-
-  while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ 
-    /* sentence_print(stdout, ref, NULL); */
-    word_buffer_read_sentence(c->bf);
- /* get rid of dummy token */
-    /* queue_remove(c->bf); */
-
-    if(ctx->f2p)
-      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);
-
-    while(!config_is_terminal(c)){
-      /* config_print(stdout, c);  */
-      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); 
-      postag = oracle_tagger(c, ref);
-      fprintf(output_file, "%d", postag);
-      feat_vec_print(output_file, fv);
-
-      if(postag != -1)
-	movement_tagger(c, postag, 0, 0);
-    }
-    config_free(c); 
-    c = config_new(conll_file, ctx->mcd_struct, 0);
-    sentence_nb++;
-  }
-}
-
 int main(int argc, char *argv[])
 {
   context *ctx;
@@ -152,12 +147,9 @@ int main(int argc, char *argv[])
     output_file = myfopen(ctx->cff_filename, "w");
   else
     output_file = stdout;
-
-  if(ctx->stream_mode)
-    generate_training_file_stream(output_file, ctx);
-  else
-    generate_training_file_buffer(output_file, ctx);
   
+  generate_training_file(output_file, ctx);
+    
   if(ctx->mode == TRAIN_MODE){
     /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
     dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
diff --git a/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c
index fdadaee..11c163c 100644
--- a/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c
+++ b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c
@@ -54,7 +54,7 @@ void maca_trans_parser_mcf2cff_check_options(context *ctx)
   }
 }
 
-void generate_training_file_stream(FILE *output_file, context *ctx)
+void generate_training_file(FILE *output_file, context *ctx)
 {
   config *c;
   int mvt_code;
@@ -112,43 +112,33 @@ void generate_training_file_stream(FILE *output_file, context *ctx)
       feat_vec_print(output_file, fv);
     }
     
-    if(mvt_type == MVT_TAGPARSER_EOS){
+    switch(mvt_type){
+    case MVT_TAGPARSER_EOS :
       movement_tagparser_eos(c);
       sentence_nb++;
-      if(word_buffer_is_last(ref))
-	break;
-    }
-    
-    if(mvt_type == MVT_TAGPARSER_POSTAG){
+      if((sentence_nb % 100) == 0)
+	fprintf(stderr, "sentence %d\n", sentence_nb);
+      break;
+    case MVT_TAGPARSER_POSTAG :
       movement_tagparser_add_pos(c, mvt_label);
-      continue;
-    }
-
-    if(mvt_type == MVT_TAGPARSER_LEFT){
+      break;
+    case MVT_TAGPARSER_LEFT :
       movement_tagparser_left_arc(c, mvt_label);
-      continue;
-    }
-    
-    if(mvt_type == MVT_TAGPARSER_RIGHT){
+      break;    
+    case MVT_TAGPARSER_RIGHT :
       movement_tagparser_right_arc(c, mvt_label);
       word_buffer_move_right(ref);
-      continue;
-    }
-    
-    if(mvt_type == MVT_TAGPARSER_REDUCE){
+      break;    
+    case MVT_TAGPARSER_REDUCE :
       movement_tagparser_reduce(c);
-      continue;
-    }
-   
-    if(mvt_type == MVT_TAGPARSER_ROOT){
+      break;   
+    case MVT_TAGPARSER_ROOT :
       movement_tagparser_root(c, root_label);
-      continue;
-    }
-
-    if(mvt_type == MVT_TAGPARSER_SHIFT){
+      break;
+    case MVT_TAGPARSER_SHIFT :
       movement_tagparser_shift(c);
       word_buffer_move_right(ref);
-      continue;
+      break;
     }
   }
 }
@@ -173,7 +163,6 @@ int main(int argc, char *argv[])
   }
 
   ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
-
   
   if(ctx->dico_labels == NULL){
     fprintf(stderr, "cannot find label names\n");
@@ -198,7 +187,7 @@ int main(int argc, char *argv[])
   /* open output file */
   output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout;
   
-  generate_training_file_stream(output_file, ctx);
+  generate_training_file(output_file, ctx);
 
   if(ctx->mode == TRAIN_MODE)
     dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
diff --git a/maca_trans_parser/src/movement_tagger.c b/maca_trans_parser/src/movement_tagger.c
index 5d33788..8e7d896 100644
--- a/maca_trans_parser/src/movement_tagger.c
+++ b/maca_trans_parser/src/movement_tagger.c
@@ -4,10 +4,8 @@
 #include"util.h"
 #include"movement_tagger.h"
 
-int movement_tagger(config *c, int postag, float score, int stream)
+int movement_tagger(config *c, int postag)
 {
-  if(word_buffer_is_last(c->bf)) return 0;
-
   word_set_pos(word_buffer_b0(c->bf), postag); 
   word_buffer_move_right(c->bf);
 
diff --git a/maca_trans_parser/src/movement_tagger.h b/maca_trans_parser/src/movement_tagger.h
index 7168f5a..1b7dfbe 100644
--- a/maca_trans_parser/src/movement_tagger.h
+++ b/maca_trans_parser/src/movement_tagger.h
@@ -3,6 +3,6 @@
 
 #include"config.h"
 #include"feat_vec.h"
-int movement_tagger(config *c, int postag, float score, int stream);
+int movement_tagger(config *c, int postag);
 
 #endif
diff --git a/maca_trans_parser/src/movements.c b/maca_trans_parser/src/movements.c
index cccdf08..a10e62b 100644
--- a/maca_trans_parser/src/movements.c
+++ b/maca_trans_parser/src/movements.c
@@ -8,8 +8,10 @@ int movement_eos(config *c, int movement_code)
 {
   if(stack_is_empty(config_get_stack(c))) return 0;
   word *s0 = stack_top(config_get_stack(c));
+
+  if(word_get_sent_seg(s0) == 1) return 0;
   
-  /* word on the top of the stack is sent_seg */
+  /* set word on the top of the stack to sent_seg */
   word_set_sent_seg(s0, 1); 
 
   config_push_mvt(c, movement_code, s0, NULL);
diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
index 3a81fc7..42fee6d 100644
--- a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
+++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
@@ -44,7 +44,7 @@ void simple_decoder_parser_arc_eager(context *ctx)
   feat_vec *fv = feat_vec_new(feature_types_nb);
   config *c = NULL;
   int result;
-  float entropy;
+  /* float entropy; */
   /* float delta; */
   int argmax1, argmax2;
   float max1, max2;
@@ -74,24 +74,6 @@ void simple_decoder_parser_arc_eager(context *ctx)
 
     }
     
-    if(ctx->debug_mode){
-      fprintf(stdout, "***********************************\n");
-      config_print(stdout, c);      
-      entropy = feature_table_entropy(fv, ft);
-      /* delta = feature_table_diff_scores(fv, ft); */
-      feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2);
-      movement_parser_print(stdout, argmax1, ctx->dico_labels);         
-      printf(":\t%f\n", max1);
-      movement_parser_print(stdout, argmax2, ctx->dico_labels);         
-      printf(":\t%f\n", max2);
-      printf("delta = %f\n", max1 - max2);
-
-      /* delta = feature_table_first_second(fv, ft); */
-       /* printf("entropy = %f delta = %f\n", entropy, delta);  */
-       printf("entropy = %f\n",entropy); 
-      
-      /* movement_parser_print(stdout, mvt_code, ctx->dico_labels);          */
-    }
     result = 0;
     switch(mvt_type){
     case MVT_PARSER_LEFT :
diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c
index bad70b6..9a50f63 100644
--- a/maca_trans_parser/src/simple_decoder_tagger.c
+++ b/maca_trans_parser/src/simple_decoder_tagger.c
@@ -3,22 +3,82 @@
 #include<string.h>
 #include<unistd.h>
 #include<getopt.h>
+#include<ctype.h>
+
 #include"context.h"
 #include"movement_tagger.h"
 #include"feat_fct.h"
 #include"config2feat_vec.h"
 #include"feature_table.h"
 #include"dico.h"
+#include"mcd.h"
 
+#if 1
 void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
 {
   int i;
   word *w;
+  char lower_form[100]; /* scratch copy of the form, used only for the lowercase retry */
 
   for(i = word_buffer_get_nbelem(bf) - 1; i >=0  ; i--){
     w = word_buffer_get_word_n(bf, i);
     if(word_get_signature(w) != -1) break;
     w->signature = form2pos_get_signature(f2p, w->form);
+    if(w->signature == -1 && strlen(w->form) < sizeof(lower_form)){ /* no signature as written; forms too long for lower_form are skipped */
+      strcpy(lower_form, w->form); /* cannot overflow: length checked above */
+      to_lower_string(lower_form);
+      w->signature = form2pos_get_signature(f2p, lower_form); /* retry with the lowercased form */
+    }
+  }
+}
+#endif
+
+#if 0
+void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p, dico *dico_pos) /* compiled out by the enclosing #if 0 */
+{
+  int i;
+  word *w;
+  int signature;
+  char *pos; /* presumably set by form2pos_word_is_non_ambiguous() -- TODO confirm */
+  for(i = word_buffer_get_nbelem(bf) - 1; i >=0  ; i--){ /* walk the buffer from its last word backwards */
+    w = word_buffer_get_word_n(bf, i);
+    if(word_get_signature(w) != -1) break; /* stop at the first word that already has a signature */
+    signature = form2pos_get_signature(f2p, w->form);
+    w->signature = signature;
+    if(form2pos_word_is_non_ambiguous(f2p, w->form, &pos)){
+      /* debug trace (disabled): printf("%s unambiguous code = %d \n", pos, dico_string2int(dico_pos, pos)); */
+      word_set_pos(w, dico_string2int(dico_pos, pos));
+      /* NOTE(review): looks like this presets the POS when the lexicon leaves no choice -- confirm */
+    }
+  }
+}
+#endif
+
+/* Print w on stdout as one tab-separated line whose POS column holds the label of postag. */
+void print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag)
+{
+  int pos_col = mcd_get_pos_col(mcd_struct); /* looked up once instead of on every column */
+  char *buffer = NULL;
+  char *token = NULL;
+  int col_nb = 0;
+  if(pos_col == -1){ /* mcd reports no POS column: append the label to the raw input line */
+    printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag));
+  }
+  else if((buffer = strdup(w->input)) == NULL){ /* strtok(NULL, ...) with no prior state would be undefined */
+    fprintf(stderr, "print_word: memory allocation error\n");
+    exit(1);
+  }
+  else{
+    /* the copy is tokenized only to count the columns; their text is printed by word_print_col_n */
+    for(token = strtok(buffer, "\t"); token; token = strtok(NULL, "\t"), col_nb++){
+      if(col_nb != 0) printf("\t");
+      if(col_nb == pos_col) printf("%s", dico_int2string(dico_pos, postag));
+      else word_print_col_n(stdout, w, col_nb);
+    }
+    /* input has too few columns to reach the POS column: the label becomes an extra last column */
+    if(col_nb <= pos_col) printf("\t%s", dico_int2string(dico_pos, postag));
+    printf("\n");
+    free(buffer);
   }
 }
 
@@ -30,32 +90,48 @@ void simple_decoder_tagger(context *ctx)
   feature_table *ft =  feature_table_load(ctx->perc_model_filename, ctx->verbose);
   int postag;
   float max;
-  word *w;
+  word *b0;
   dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
-  int res;
+
   c = config_new(f, ctx->mcd_struct, 5); 
 
-  while(1){
+  while(!config_is_terminal(c)){
     if(ctx->f2p)
-      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);
-    /* config_print(stdout, c); */
-    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
-    
-    /* feat_vec_print(stdout, fv); */
-    postag = feature_table_argmax(fv, ft, &max);
-    /* printf("postag = %d\n", postag); */
-    
-    w = word_buffer_b0(c->bf);
-    printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag));
+      /* add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */
+      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); 
+
+    b0 = word_buffer_b0(c->bf);
+    postag = word_get_pos(b0);
+
+    if(ctx->debug_mode){
+      fprintf(stderr, "***********************************\n");
+      config_print(stderr, c);
+    }
     
-    res = movement_tagger(c, postag, max, 1);
+    /* if postag is not specified in input it is predicted */
+    if(postag == -1){
+      /* config_print(stdout, c); */
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+      
+      /* feat_vec_print(stdout, fv); */
+      postag = feature_table_argmax(fv, ft, &max);
+      /* printf("postag = %d\n", postag); */
 
-    /* printf(" current index = %d nb elem = %d\n", c->bf->current_index, c->bf->nbelem);  */
+      if(ctx->debug_mode){
+	vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+	for(int i=0; i < 3; i++){
+	  fprintf(stderr, "%d\t", i);
+	  fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
+	}
+	free(vcode_array);
+      }
+    }
 
-    if(res == 0) break;
-  }
+    print_word(b0, ctx->mcd_struct, dico_pos, postag);
+    
+    movement_tagger(c, postag);
 
+  }
   /* config_print(stdout, c);  */
-  /* config_free(c); */
+   config_free(c); 
 }
-
diff --git a/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c
index 4b6aac1..15b8767 100644
--- a/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c
+++ b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c
@@ -63,7 +63,7 @@ void simple_decoder_tagparser_arc_eager(context *ctx)
   feat_vec *fv = feat_vec_new(feature_types_nb);
   config *c = NULL;
   int result;
-  float entropy;
+  /* float entropy; */
   /* float delta; */
   int argmax1, argmax2;
   float max1, max2;
@@ -99,6 +99,19 @@ void simple_decoder_tagparser_arc_eager(context *ctx)
     if(ctx->debug_mode){
       fprintf(stdout, "***********************************\n");
       config_print(stdout, c);      
+
+      vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+
+      for(int i=0; i < 5; i++){
+	printf("%d\t", i);
+	movement_tagparser_print(stdout, vcode_array[i].class_code, ctx->dico_labels, ctx->dico_postags);
+	printf("\t%.4f\n", vcode_array[i].score);
+      }
+      free(vcode_array);
+
+
+#if 0
+
       entropy = feature_table_entropy(fv, ft);
       /* delta = feature_table_diff_scores(fv, ft); */
       feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2);
@@ -113,11 +126,17 @@ void simple_decoder_tagparser_arc_eager(context *ctx)
        printf("entropy = %f\n",entropy); 
       
       /* movement_tagparser_print(stdout, mvt_code, ctx->dico_labels);          */
+#endif
     }
     result = 0;
     switch(mvt_type){
     case MVT_TAGPARSER_POSTAG :
       result = movement_tagparser_add_pos(c, mvt_label);
+      /*      if(result){
+	int code_pos = word_get_pos(word_buffer_b0(config_get_buffer(c)));
+	int code_form = word_get_form(word_buffer_b0(config_get_buffer(c)));
+	printf("code pos = %d code form = %d\n", code_pos, code_form);
+	}*/
       break;
     case MVT_TAGPARSER_LEFT :
       result = movement_tagparser_left_arc(c, mvt_label);
diff --git a/perceptron/exec/cff_cutoff.c b/perceptron/exec/cff_cutoff.c
index 029701b..64ad7e0 100644
--- a/perceptron/exec/cff_cutoff.c
+++ b/perceptron/exec/cff_cutoff.c
@@ -174,9 +174,9 @@ int main(int argc, char *argv[])
   fprintf(stderr, "after thresholding          : %d\n", n_feat - feat_removed);
   fprintf(stderr, "ratio                       : %.3f\n\n", (float)(n_feat - feat_removed) / n_feat);
 
-  fprintf(stderr, "total number of feature occurrences : %d\n", f_occ);
+  /*  fprintf(stderr, "total number of feature occurrences : %d\n", f_occ);
   fprintf(stderr, "atfer thresholding                  : %d\n", f_occ - occ_removed);
-  fprintf(stderr, "ratio                               : %.3f\n", (float)(f_occ - occ_removed) / f_occ);
+  fprintf(stderr, "ratio                               : %.3f\n", (float)(f_occ - occ_removed) / f_occ);*/
 
 
   dico_vec_replace_dico(ctx->vocabs, old_d_feat, new_d_feat);
diff --git a/perceptron/lib/src/feature_table.c b/perceptron/lib/src/feature_table.c
index 82c6be2..99ac711 100644
--- a/perceptron/lib/src/feature_table.c
+++ b/perceptron/lib/src/feature_table.c
@@ -194,6 +194,7 @@ float feature_table_argmax_1_2(feat_vec *fv, feature_table *ft, int *argmax1, fl
   return (*max1 - *max2);
 }
 
+
 float feature_table_entropy(feat_vec *fv, feature_table *ft)
 {
   float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));
-- 
GitLab