diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l index 07b169cc0bcd706f97fc7d62d86e953a3d028f17..4e684f1af5d6ee62260a6bc3ebf1ab8a1af6b5ed 100644 --- a/maca_tokenizer/src/en_tok_rules.l +++ b/maca_tokenizer/src/en_tok_rules.l @@ -9,7 +9,7 @@ extern char *token; %option prefix="en" -%option noyywrap +/*%option noyywrap*/ %% \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} @@ -44,3 +44,9 @@ wanna printf("want\nto"); . {maca_tokenizer_add_char_to_token(yytext[0]);} %% + +int enwrap(void) +{ +maca_tokenizer_segment((char *)"", (char *)""); +return 1; +} diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index 025b94bdfd752d11175be6f762cec774bb9c24dd..63af3d2e44fc0579e91c7eedf8283bbb4b3e8372 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -9,6 +9,9 @@ extern int defait_amalgames; extern int offset; extern int token_length; extern char *token; + + + %} separ [ \t\n] nosepar [^ \t\n] @@ -16,7 +19,7 @@ nosepar [^ \t\n] %option prefix="fr" /*%option outfile="fr_lex.c"*/ -%option noyywrap +/*%option noyywrap*/ %s state_defait_amalgames %% @@ -35,9 +38,9 @@ nosepar [^ \t\n] {separ}*… {maca_tokenizer_segment((char *)"…", yytext);} {separ}*\) {maca_tokenizer_segment((char *)")", yytext);} {separ}*» {maca_tokenizer_segment((char *)"»", yytext);} -\( {maca_tokenizer_segment((char *)"((", yytext);} -\" {maca_tokenizer_segment((char *)"\"", yytext);} -« {maca_tokenizer_segment((char *)"«", yytext);} +\( {maca_tokenizer_segment((char *)"(", yytext);} +\" {maca_tokenizer_segment((char *)"\"", yytext);} +« {maca_tokenizer_segment((char *)"«", yytext);} {nosepar}*' {maca_tokenizer_segment((char *)yytext, yytext);} {nosepar}*’ {maca_tokenizer_segment((char *)yytext, yytext);} @@ -66,3 +69,9 @@ nosepar [^ \t\n] " aux " printf("\nà\nles\n"); } %% + +int frwrap(void) +{ +maca_tokenizer_segment((char *)"", (char *)""); +return 1; +} diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c index 8f058c970808911af8ac0f13b0591fe865c86819..55b7635560267030afbd2ab206c2a2de875655d8 100644 --- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c +++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c @@ -9,7 +9,7 @@ extern char token[]; extern int print_offset; extern int print_token_length; -void maca_tokenizer_segment(char *separator, char *xx){ +void maca_tokenizer_segment(char *separator, char *text_matched){ if(token_length != 0){ printf("%s", token); if(print_offset) @@ -21,6 +21,7 @@ void maca_tokenizer_segment(char *separator, char *xx){ offset += utf8_strlen(token); token_length = 0; + token[0] = 0; if(strlen(separator) != 0){ printf("%s", separator); @@ -30,7 +31,7 @@ void maca_tokenizer_segment(char *separator, char *xx){ printf("\t%d", (int) strlen(separator)); printf("\n"); } - offset += strlen(xx); + offset += utf8_strlen(text_matched); } @@ -40,3 +41,4 @@ void maca_tokenizer_add_char_to_token(char c) token_length++; token[token_length] = 0; } + diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h index a57ffe63c753d77c3122fcbdfcc91c68a5fa5717..3e2669effee5fa638e9ca9f353608bfad5ebb270 100644 --- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h +++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h @@ -1,3 +1,4 @@ void maca_tokenizer_segment(char *separator, char *xx); void maca_tokenizer_add_char_to_token(char c); +