From ce4645960e291b77553f30fe856954a0c46055bb Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Thu, 23 Mar 2017 12:01:27 +0100
Subject: [PATCH] fixed a few bugs in maca_tokenizer

---
 maca_common/src/trie.c      |   4 +-
 maca_lexer/src/maca_lexer.c | 132 ++++++++++++++++++++++++++----------
 2 files changed, 97 insertions(+), 39 deletions(-)

diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c
index 150bdae..b25bca3 100644
--- a/maca_common/src/trie.c
+++ b/maca_common/src/trie.c
@@ -142,8 +142,8 @@ int trie_lookup(trie *t, int *word, int length)
 	break;
       }
     }
-     if(trans == NULL)
-       return 0;
+    if(trans == NULL)
+      return 0;
   }
   return t->states[current_state]->is_accept;
 }
diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c
index b096645..03eceab 100644
--- a/maca_lexer/src/maca_lexer.c
+++ b/maca_lexer/src/maca_lexer.c
@@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in
   return -1;
 }
 
+void print_states_array(char *buffer, context *ctx, trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_index)
+{ /* Print to stdout the first path_index tokens stored in symbols_array (the pending path); buffer is not used in this function. */
+  int i;
+  if(path_index == 0) return; /* empty path: nothing to print */
+  int accept_state_index =  look_for_accept_state_in_path(mwe_trie, states_array, path_index); /* compared against -1 below; presumably -1 means no accepting state in the path -- confirm in look_for_accept_state_in_path */
+  /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe; this loop does not run when accept_state_index is -1 */
+  for(i=0; i <= accept_state_index; i++){ 
+    if(ctx->paste){ /* paste mode: the tokens of the mwe go on a single line, joined by ctx->mwe_tokens_separator */
+      if(i > 0) printf("%s", ctx->mwe_tokens_separator); 
+      printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); 
+    }
+    else{ /* otherwise: one token per line followed by a tab and a flag, 1 for the first token of the mwe and 0 for the following ones */
+      if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+      else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+    }
+  }
+  if(ctx->paste)
+    if(accept_state_index != -1) printf("\n"); /* terminate the pasted mwe line, only when the loop above printed something */
+  /* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe, they are just printed one per line (with a tab and the flag 1 when not in paste mode) */
+  for(i = accept_state_index + 1; i < path_index; i++){
+    if(ctx->paste)
+      printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); 
+    else
+      printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+  }
+}
+
 
 int main(int argc, char *argv[])
 {
@@ -48,10 +75,10 @@ int main(int argc, char *argv[])
   FILE *f = NULL;
   trie *mwe_trie;
   dico *d_mwe_tokens = NULL;
-  int states_array[100];
+  int states_array[100]; /* an array in which we store the states we have traversed in the trie */
   int symbols_array[100];
   int path_index = 0;
-  int i;
+  int next_state;
   
   ctx = context_read_options(argc, argv);
    maca_lexer_check_options(ctx);
@@ -74,77 +101,108 @@ int main(int argc, char *argv[])
   d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5);
 
   /* trie_print(stdout, mwe_trie); */
-
   
-  /* look for a valid word */
   while(fgets(buffer, 10000, f)){
-    if(feof(f)) return 0; /* no more words to read */
+    /* a line starting with a newline, a space or a tab is not a word: print an empty line and read the next one */
     if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){
       printf("\n");
       continue;
     }
-    
     buffer[strlen(buffer)-1] = '\0';
+    /* look up the dictionary code of the word just read */
     form_code = dico_string2int(d_mwe_tokens, buffer);
+
+    if(form_code == -1){
+      print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+      path_index = 0;
+      /* print the current token */
+      if(ctx->paste)
+	printf("%s\n", buffer);
+      else
+	printf("%s\t1\n", buffer);
+      continue;
+    }
+
+    next_state = trie_destination_state(mwe_trie, (path_index == 0) ? 0: states_array[path_index - 1], form_code);
+    if(next_state != 0){
+      symbols_array[path_index] = form_code;
+      states_array[path_index] = next_state;
+      path_index++;
+      continue;
+    }
+
+    print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+    if(path_index != 0)
+      next_state = trie_destination_state(mwe_trie, 0, form_code);
+    path_index = 0;
+    if(next_state){
+      symbols_array[path_index] = form_code;
+      states_array[path_index] = next_state;
+      path_index++;
+      continue;
+    }
+    
+    if(ctx->paste)
+      printf("%s\n", buffer);
+    else
+      printf("%s\t1\n", buffer);
+  
+
+#if 0
+    
     symbols_array[path_index]  = form_code;
-    states_array[path_index] = (form_code == -1)? 0
-      : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code);
-    /*
-    printf("buffer = %s ", buffer);
-    printf("code  = %d\n", form_code);
+    states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
+      : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
     
+    /*    printf("buffer = %s ", buffer);
+    printf("code  = %d\n", form_code);
     
+    printf("states array :");
     for(i=0; i <= path_index; i++){
       printf("%d ", states_array[i]);
     }
     printf("\n");
+    printf("symbols array :");
     for(i=0; i <= path_index; i++){
       printf("%d ", symbols_array[i]);
     }
-    printf("\n");
+    printf("\n**********************\n");
     */
+    
     if(states_array[path_index] == 0){ /* in initial state of trie */
-      /* nothing has been recognized */
+      /* nothing has been recognized, just print current word */
       if(path_index == 0)
 	if(ctx->paste)
-	 printf("%s\n", buffer);
+	  printf("%s\n", buffer);
 	else
 	  printf("%s\t1\n", buffer);
       else{ /* there is something in the path */
-	int accept_state_index =  look_for_accept_state_in_path(mwe_trie, states_array, path_index);
-	/* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
-	for(i=0; i <= accept_state_index; i++){ 
-	  if(ctx->paste){
-	  if(i > 0) printf("%s", ctx->mwe_tokens_separator); 
-	  printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); 
-	  }
-	  else{
-	    if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
-	    else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
-	  }
-	}
-	if(ctx->paste)
-	  if(accept_state_index != -1) printf("\n"); 
-	/* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
-	for(i = accept_state_index + 1; i < path_index; i++){
-	  if(ctx->paste)
-	    printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); 
-	  else
-	    printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
-	}
+	print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+	path_index = 0;
+
+	states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
+	  : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
+
+	
 	/* do not forget to print the current token */
 	if(ctx->paste)
 	  printf("%s\n", buffer);
 	else
 	  printf("%s\t1\n", buffer);
-	path_index = 0;
       }
     }
     /* not in state 0 of trie we are processing tokens of a potential mwe */
     else{ 
       path_index++;
     }
-    
+    #endif
+  }
+  
+  if(path_index != 0){ /* there is something in states array */
+    print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
+    path_index = 0;
   }
+  
   return 0;
 }
+
-- 
GitLab