From b564c719fd3060bafdf569a175897dec122b7508 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Mon, 6 Nov 2017 14:37:48 +0100
Subject: [PATCH] modified the tokenizer so that it outputs the last token
 (with the help of yywrap)

---
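Note for reviewers (illustration only, not part of the commit): before this
change, a token that reaches end of input without a trailing separator was
never printed, because maca_tokenizer_segment() is only called when a rule
matches a separator. Re-enabling yywrap lets enwrap()/frwrap() flush that
last token when flex reaches EOF. Below is a minimal, self-contained C sketch
of the same flush-at-EOF pattern; names such as flush_token are invented for
the example and are not part of the tokenizer's API.

#include <stdio.h>

static char token[1024];
static int  token_length = 0;

/* Print the pending token, if any, and reset the buffer
   (the role maca_tokenizer_segment("", "") plays in enwrap/frwrap). */
static void flush_token(void)
{
  if (token_length != 0) {
    printf("%s\n", token);
    token_length = 0;
    token[0] = 0;
  }
}

int main(void)
{
  int c;
  while ((c = getchar()) != EOF) {
    if (c == ' ' || c == '\t' || c == '\n') {
      flush_token();                     /* separator seen: emit the token */
    } else if (token_length < (int) sizeof(token) - 1) {
      token[token_length++] = (char) c;  /* accumulate the current token */
      token[token_length] = 0;
    }
  }
  flush_token();  /* end of input: emit the last token, as yywrap now triggers */
  return 0;
}

Running it on input such as "a b c" (no trailing newline) prints all three
tokens; without the final flush_token() call, "c" would be lost, which is
exactly the behaviour this patch fixes in the real tokenizer.
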
 maca_tokenizer/src/en_tok_rules.l               |  8 +++++++-
 maca_tokenizer/src/fr_tok_rules.l               | 17 +++++++++++++----
 .../src/maca_tokenizer_functions_for_lex.c      |  6 ++++--
 .../src/maca_tokenizer_functions_for_lex.h      |  1 +
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l
index 07b169c..4e684f1 100644
--- a/maca_tokenizer/src/en_tok_rules.l
+++ b/maca_tokenizer/src/en_tok_rules.l
@@ -9,7 +9,7 @@ extern char *token;
 
 %option prefix="en"
 
-%option noyywrap
+/*%option noyywrap*/
 %%
 
 \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
@@ -44,3 +44,9 @@ wanna printf("want\nto");
 .        {maca_tokenizer_add_char_to_token(yytext[0]);}
 
 %%
+
+int enwrap(void)
+{
+  maca_tokenizer_segment((char *)"", (char *)"");  /* flush the last pending token */
+  return 1;                                        /* no further input to scan */
+}
diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l
index 025b94b..63af3d2 100644
--- a/maca_tokenizer/src/fr_tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -9,6 +9,9 @@ extern int defait_amalgames;
 extern int offset;
 extern int token_length;
 extern char *token;
+
+
+
 %}
 separ [ \t\n]
 nosepar [^ \t\n]
@@ -16,7 +19,7 @@ nosepar [^ \t\n]
 %option prefix="fr"
 /*%option outfile="fr_lex.c"*/
 
-%option noyywrap
+/*%option noyywrap*/
 %s state_defait_amalgames
 
 %%
@@ -35,9 +38,9 @@ nosepar [^ \t\n]
 {separ}*…    {maca_tokenizer_segment((char *)"…", yytext);}
 {separ}*\)   {maca_tokenizer_segment((char *)")", yytext);}
 {separ}*»    {maca_tokenizer_segment((char *)"»", yytext);}
-\(       {maca_tokenizer_segment((char *)"((", yytext);}
-\"	 {maca_tokenizer_segment((char *)"\"", yytext);}
-«	 {maca_tokenizer_segment((char *)"«", yytext);}
+\(           {maca_tokenizer_segment((char *)"(", yytext);}
+\"           {maca_tokenizer_segment((char *)"\"", yytext);}
+«            {maca_tokenizer_segment((char *)"«", yytext);}
 
 {nosepar}*'   {maca_tokenizer_segment((char *)yytext, yytext);}
 {nosepar}*’   {maca_tokenizer_segment((char *)yytext, yytext);}
@@ -66,3 +69,9 @@ nosepar [^ \t\n]
 " aux " printf("\nà\nles\n");
 }
 %%
+
+int frwrap(void)
+{
+  maca_tokenizer_segment((char *)"", (char *)"");  /* flush the last pending token */
+  return 1;                                        /* no further input to scan */
+}
diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
index 8f058c9..55b7635 100644
--- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
+++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
@@ -9,7 +9,7 @@ extern char token[];
 extern int print_offset;
 extern int print_token_length;
 
-void maca_tokenizer_segment(char *separator, char *xx){
+void maca_tokenizer_segment(char *separator, char *text_matched){
   if(token_length != 0){
     printf("%s", token);
     if(print_offset)
@@ -21,6 +21,7 @@ void maca_tokenizer_segment(char *separator, char *xx){
   
   offset += utf8_strlen(token);
   token_length = 0;
+  token[0] = 0;
   
   if(strlen(separator) != 0){
     printf("%s", separator);
@@ -30,7 +31,7 @@ void maca_tokenizer_segment(char *separator, char *xx){
       printf("\t%d", (int) strlen(separator));
     printf("\n");
   }
-  offset += strlen(xx); 
+  offset += utf8_strlen(text_matched); 
 }
 
 
@@ -40,3 +41,4 @@ void maca_tokenizer_add_char_to_token(char c)
   token_length++;
   token[token_length] = 0;
 }
+
diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h
index a57ffe6..3e2669e 100644
--- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h
+++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h
@@ -1,3 +1,4 @@
 
 void maca_tokenizer_segment(char *separator, char *xx);
 void maca_tokenizer_add_char_to_token(char c);
+
-- 
GitLab