From b564c719fd3060bafdf569a175897dec122b7508 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Mon, 6 Nov 2017 14:37:48 +0100 Subject: [PATCH] modified tokenizer so that it outputs last token (with the help of yywrap) --- maca_tokenizer/src/en_tok_rules.l | 8 +++++++- maca_tokenizer/src/fr_tok_rules.l | 17 +++++++++++++---- .../src/maca_tokenizer_functions_for_lex.c | 6 ++++-- .../src/maca_tokenizer_functions_for_lex.h | 1 + 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l index 07b169c..4e684f1 100644 --- a/maca_tokenizer/src/en_tok_rules.l +++ b/maca_tokenizer/src/en_tok_rules.l @@ -9,7 +9,7 @@ extern char *token; %option prefix="en" -%option noyywrap +/*%option noyywrap*/ %% \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} @@ -44,3 +44,9 @@ wanna printf("want\nto"); . {maca_tokenizer_add_char_to_token(yytext[0]);} %% + +int enwrap(void) +{ +maca_tokenizer_segment((char *)"", (char *)""); +return 1; +} diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index 025b94b..63af3d2 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -9,6 +9,9 @@ extern int defait_amalgames; extern int offset; extern int token_length; extern char *token; + + + %} separ [ \t\n] nosepar [^ \t\n] @@ -16,7 +19,7 @@ nosepar [^ \t\n] %option prefix="fr" /*%option outfile="fr_lex.c"*/ -%option noyywrap +/*%option noyywrap*/ %s state_defait_amalgames %% @@ -35,9 +38,9 @@ nosepar [^ \t\n] {separ}*… {maca_tokenizer_segment((char *)"…", yytext);} {separ}*\) {maca_tokenizer_segment((char *)")", yytext);} {separ}*» {maca_tokenizer_segment((char *)"»", yytext);} -\( {maca_tokenizer_segment((char *)"((", yytext);} -\" {maca_tokenizer_segment((char *)"\"", yytext);} -« {maca_tokenizer_segment((char *)"«", yytext);} +\( {maca_tokenizer_segment((char *)"(", yytext);} +\" {maca_tokenizer_segment((char *)"\"", yytext);} +« {maca_tokenizer_segment((char *)"«", yytext);} {nosepar}*' {maca_tokenizer_segment((char *)yytext, yytext);} {nosepar}*’ {maca_tokenizer_segment((char *)yytext, yytext);} @@ -66,3 +69,9 @@ nosepar [^ \t\n] " aux " printf("\nà\nles\n"); } %% + +int frwrap(void) +{ +maca_tokenizer_segment((char *)"", (char *)""); +return 1; +} diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c index 8f058c9..55b7635 100644 --- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c +++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c @@ -9,7 +9,7 @@ extern char token[]; extern int print_offset; extern int print_token_length; -void maca_tokenizer_segment(char *separator, char *xx){ +void maca_tokenizer_segment(char *separator, char *text_matched){ if(token_length != 0){ printf("%s", token); if(print_offset) @@ -21,6 +21,7 @@ void maca_tokenizer_segment(char *separator, char *xx){ offset += utf8_strlen(token); token_length = 0; + token[0] = 0; if(strlen(separator) != 0){ printf("%s", separator); @@ -30,7 +31,7 @@ void maca_tokenizer_segment(char *separator, char *xx){ printf("\t%d", (int) strlen(separator)); printf("\n"); } - offset += strlen(xx); + offset += utf8_strlen(text_matched); } @@ -40,3 +41,4 @@ void maca_tokenizer_add_char_to_token(char c) token_length++; token[token_length] = 0; } + diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h index a57ffe6..3e2669e 100644 --- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h +++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h @@ -1,3 +1,4 @@ void maca_tokenizer_segment(char *separator, char *xx); void maca_tokenizer_add_char_to_token(char c); + -- GitLab