From f50e7d93c0512664a9b6d287f9f1871e22946643 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Thu, 11 Jan 2018 15:56:12 +0100
Subject: [PATCH] fixed a few little bugs

---
 maca_common/include/word.h                    |   7 +-
 maca_common/include/word_buffer.h             |  14 +--
 maca_trans_parser/CMakeLists.txt              |   7 +-
 maca_trans_parser/src/cff2fann.c              | 103 +++++++++++-------
 maca_trans_parser/src/maca_trans_lemmatizer.c |  10 +-
 .../src/movement_parser_arc_eager.h           |   6 +-
 6 files changed, 86 insertions(+), 61 deletions(-)

diff --git a/maca_common/include/word.h b/maca_common/include/word.h
index 88f32b6..7e8f30b 100644
--- a/maca_common/include/word.h
+++ b/maca_common/include/word.h
@@ -1,6 +1,6 @@
 #ifndef __WORD__
 #define __WORD__
-
+#include<ctype.h>
 #include "mcd.h"
 #include "char16.h"
 
@@ -28,6 +28,7 @@ typedef struct _word {
 #define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5])
 #define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6])
 */
+
 #define word_get_s1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 1])
 #define word_get_s2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 2])
 #define word_get_s3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 3])
@@ -51,8 +52,8 @@ typedef struct _word {
 #define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[5])
 
 #define word_get_id(w)             (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID])
-#define word_get_offset(w)             (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_OFFSET])
-#define word_get_length(w)           (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LENGTH])
+#define word_get_offset(w)         (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_OFFSET])
+#define word_get_length(w)         (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LENGTH])
 #define word_get_form(w)           (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM])
 #define word_get_lemma(w)          (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LEMMA])
 #define word_get_cpos(w)           (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_CPOS])
diff --git a/maca_common/include/word_buffer.h b/maca_common/include/word_buffer.h
index eb995bd..136b560 100644
--- a/maca_common/include/word_buffer.h
+++ b/maca_common/include/word_buffer.h
@@ -32,13 +32,13 @@
 #define word_buffer_is_empty(wb) (((wb)->nbelem == 0)? 1 : 0)
 
 typedef struct {
-  int size;           /* size of the array used to store words */
-  int nbelem;         /* number of words in the buffer */
-  int lookahead;      /* number of words between the current word and the last word of the buffer */
-  int current_index;  /* position of the current word */
-  word **array;       /* array to store words */
-  FILE *input_file;   /* file to read the words from */
-  mcd *mcd_struct;    /* mcd describing the format of input_file */
+  int    size;           /* size of the array used to store words */
+  int    nbelem;         /* number of words in the buffer */
+  int    lookahead;      /* number of words between the current word and the last word of the buffer */
+  int    current_index;  /* position of the current word */
+  word **array;          /* array to store words */
+  FILE  *input_file;     /* file to read the words from */
+  mcd   *mcd_struct;     /* mcd describing the format of input_file */
 } word_buffer;
 
 
diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt
index a3cded9..fe60c66 100644
--- a/maca_trans_parser/CMakeLists.txt
+++ b/maca_trans_parser/CMakeLists.txt
@@ -210,9 +210,10 @@ install (TARGETS maca_trans_lemmatizer DESTINATION bin)
 #target_link_libraries(test_w2v transparse)
 #install (TARGETS test_w2v DESTINATION bin)
 
-#add_executable(w2v_filter ./src/w2v_filter.c)
-#target_link_libraries(w2v_filter transparse)
-#install (TARGETS w2v_filter DESTINATION bin)
+add_executable(w2v_filter ./src/w2v_filter.c)
+target_link_libraries(w2v_filter transparse)
+target_link_libraries(w2v_filter maca_common)
+install (TARGETS w2v_filter DESTINATION bin)
 
 #add_executable(test_word_emb ./src/test_word_emb.c)
 #target_link_libraries(test_word_emb transparse)
diff --git a/maca_trans_parser/src/cff2fann.c b/maca_trans_parser/src/cff2fann.c
index 294d821..bcdf01f 100644
--- a/maca_trans_parser/src/cff2fann.c
+++ b/maca_trans_parser/src/cff2fann.c
@@ -52,6 +52,20 @@ void one_hot_print(FILE *f, int val, int dim)
     fprintf(f, "%d ", (i == val)? 1  : 0);
 }
 
+/* Exit with status 1 if any feature descriptor in fm holds more than one element (a "complex" feature). */
+void check_feature_model(feat_model *fm)
+{
+  int i;
+  feat_desc *fd;
+  for(i=0; i <fm->nbelem; i++){
+    fd = fm->array[i];
+    if(fd->nbelem > 1){ /* descriptor with several elements: report its index on stderr and stop */
+      fprintf(stderr, "feature %d is a complex feature, aborting\n", i);
+      exit(1);
+    }
+  }
+}
+
 void print_header(mcd *m, feat_model *fm)
 {
   int i;
@@ -62,33 +76,24 @@ void print_header(mcd *m, feat_model *fm)
 
   for(i=0; i <fm->nbelem; i++){
     fd = fm->array[i];
-    if(fd->nbelem > 1){
-      printf("feature %d is a complex feature, skipping it\n", i);
-    }
-    else{
-      sfd = fd->array[0];
-      printf("\t%s", sfd->name);
-    }
+    sfd = fd->array[0];
+    printf("\t%s", sfd->name);
   }
 
   printf("\n");
   printf("OUT");
   for(i=0; i <fm->nbelem; i++){
     fd = fm->array[i];
-    if(fd->nbelem > 1){
-      printf("feature %d is a complex feature, skipping it\n", i);
-    }
-    else{
-      sfd = fd->array[0];
-      if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;}
-      if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;}
-      if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;}
-      if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;}
-      if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;}
-      if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;}
-      printf("\tUNK");
-    }
+    sfd = fd->array[0];
+    if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;}
+    if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;}
+    if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;}
+    if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;}
+    if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;}
+    if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;}
+    printf("\tUNK");
   }
+
   printf("\n");
   /*
   for(i=0; i < m->nb_col; i++){
@@ -127,6 +132,7 @@ void cff2fann(context *ctx)
   char feature_type[64];
   int feature_valindex;
   int count = 0;
+  char *feat_str = NULL;
   
   vocab = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
 
@@ -142,34 +148,46 @@ void cff2fann(context *ctx)
     if (count % 100 == 0)
       fprintf(stderr, "%d\r", count);
     while(token){
-      /* printf("col = %d token = %s max = %d\n", col_nb, token, max_array[col_nb]);  */
+      /* printf("col = %d token = %s\n", col_nb, token); */
       val = atoi(token);
       if(col_nb == 0){
         /* one_hot_print(stdout, val, ctx->mvt_nb);  */
         /* printf("\n"); */
 	printf("%d", val);
       } else {
-        sscanf(dico_int2string(vocab, val), "%[^==]==%d", feature_type, &feature_valindex);
-        /* printf("feature_type = %s\n", feature_type); */
-        feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1);
-	/* printf("feat_type = %d\n", feat_type);  */
-	/* printf("%d: ", col_nb); */
-        int mcd_col = m->wf2col[feat_type];
-        /* printf("representation = %d\n", m->representation[mcd_col]); */
-        if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){
-          /* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */
-          /* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */
-          /* printf("\n"); */
-	  printf("\t%d", feature_valindex);
-
-        } else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){
-          /* printf("it is a vocab\n"); */
-          /* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem);  */
-          /* printf("\n"); */
+	feat_str = dico_int2string(vocab, val);
+	if(feat_str){
+	  /* printf("feat str = %s\n", feat_str); */
+	  sscanf(feat_str, "%[^==]==%d", feature_type, &feature_valindex);
+	  /* printf("feature_type = %s\n", feature_type); */
+	  feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1);
+	  /* printf("feat_type = %d\n", feat_type);  */
+	  /* printf("%d: ", col_nb); */
+	  int mcd_col = m->wf2col[feat_type];
+	
+	  /* printf("representation = %d\n", m->representation[mcd_col]); */
+	  if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){
+	    /* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */
+	    /* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */
+	    /* printf("\n"); */
+	    printf("\t%d", feature_valindex);
+	    
+	  } else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){
+	    /* printf("it is a vocab\n"); */
+	    /* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem);  */
+	    /* printf("\n"); */
+	    printf("\t%d", feature_valindex);
+	  } else {
+	    printf("\t%d", feature_valindex);
+	  }
+	}
+	else{
+	  fprintf(stderr, "WARNING cannot find the description of feature : %d\n", val);
+	  feature_valindex = -1;
+	  
 	  printf("\t%d", feature_valindex);
-        } else {
-          printf("\t%d", feature_valindex);
-        }
+	  
+	}
       }
       col_nb++;
       token = strtok(NULL , "\t");
@@ -193,6 +211,9 @@ int main(int argc, char *argv[])
 
   ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
 
+
+  check_feature_model(ctx->features_model);
+  
   look_for_number_of_features_and_classes(ctx->cff_filename, &nb_feat, &nb_class);
   ctx->mvt_nb = nb_class;
 
diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c
index 645be94..641ea6c 100644
--- a/maca_trans_parser/src/maca_trans_lemmatizer.c
+++ b/maca_trans_parser/src/maca_trans_lemmatizer.c
@@ -158,7 +158,8 @@ int main(int argc, char *argv[])
       lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose);
       if(lemma_from_fplm){
 	//	printf("lemma %s found in exceptions file\n", lemma_from_fplm);
-	print_word(b0, ctx->mcd_struct, lemma_from_fplm);
+	//		print_word(b0, ctx->mcd_struct, to_lower_string(lemma_from_fplm));
+		print_word(b0, ctx->mcd_struct, lemma_from_fplm);
       }
     // if lemma is not found in exception file, predict an l_rule 
       else{
@@ -185,15 +186,16 @@ int main(int argc, char *argv[])
 	    if(l_rule_is_applicable(form, l_rule)){
 	      char *transformed_lemma = apply_l_rule(form, l_rule);
 	      //	printf("transformed_lemma = %s\n", transformed_lemma);
-	      //	    print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma));
-	      print_word(b0, ctx->mcd_struct, transformed_lemma);
+	      //	      	    print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma));
+		    print_word(b0, ctx->mcd_struct, transformed_lemma);
 	      free(transformed_lemma);
 	      break;
 	    }
 	  }
 	  /* no rule applied */
 	  if(i == 10){
-	    print_word(b0, ctx->mcd_struct, form);
+	    //	        print_word(b0, ctx->mcd_struct, to_lower_string(form));
+		print_word(b0, ctx->mcd_struct, form);
 	  }
 	  free(vcode_array);
 	}
diff --git a/maca_trans_parser/src/movement_parser_arc_eager.h b/maca_trans_parser/src/movement_parser_arc_eager.h
index 3a040d5..2b11bf0 100644
--- a/maca_trans_parser/src/movement_parser_arc_eager.h
+++ b/maca_trans_parser/src/movement_parser_arc_eager.h
@@ -7,9 +7,9 @@
 #define MVT_PARSER_SHIFT 0
 #define MVT_PARSER_REDUCE 1
 #define MVT_PARSER_ROOT 2
-#define MVT_PARSER_EOS -1
-#define MVT_PARSER_LEFT 3
-#define MVT_PARSER_RIGHT 4
+#define MVT_PARSER_EOS 3
+#define MVT_PARSER_LEFT 4
+#define MVT_PARSER_RIGHT 5
 
 /* even movements are left movements (except 0, which is shift and 2 which is root) */
 #define movement_parser_left_code(label) (2 * (label) + 4)
-- 
GitLab