diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index 8478402b957317a95cff5b19e2b475fc4801b880..025b94bdfd752d11175be6f762cec774bb9c24dd 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -10,33 +10,37 @@ extern int offset; extern int token_length; extern char *token; %} +separ [ \t\n] +nosepar [^ \t\n] %option prefix="fr" /*%option outfile="fr_lex.c"*/ %option noyywrap %s state_defait_amalgames + %% if(defait_amalgames){ BEGIN(state_defait_amalgames); - } -\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} -[ \t]+ {maca_tokenizer_segment((char *)"", yytext);} -[ ]*\. {maca_tokenizer_segment((char *)".", yytext);} -[ ]*\? {maca_tokenizer_segment((char *)"?", yytext);} -[ ]*\! {maca_tokenizer_segment((char *)"!", yytext);} -[ ]*, {maca_tokenizer_segment((char *)",", yytext);} -[ ]*: {maca_tokenizer_segment((char *)":", yytext);} -[ ]*; {maca_tokenizer_segment((char *)";", yytext);} -[ ]*… {maca_tokenizer_segment((char *)"…", yytext);} -[ ]*\) {maca_tokenizer_segment((char *)")", yytext);} -[ ]*» {maca_tokenizer_segment((char *)"»", yytext);} + } + +\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} +{separ}+ {maca_tokenizer_segment((char *)"", yytext);} +{separ}*\. {maca_tokenizer_segment((char *)".", yytext);} +{separ}*\? {maca_tokenizer_segment((char *)"?", yytext);} +{separ}*\! {maca_tokenizer_segment((char *)"!", yytext);} +{separ}*, {maca_tokenizer_segment((char *)",", yytext);} +{separ}*: {maca_tokenizer_segment((char *)":", yytext);} +{separ}*; {maca_tokenizer_segment((char *)";", yytext);} +{separ}*… {maca_tokenizer_segment((char *)"…", yytext);} +{separ}*\) {maca_tokenizer_segment((char *)")", yytext);} +{separ}*» {maca_tokenizer_segment((char *)"»", yytext);} \( {maca_tokenizer_segment((char *)"((", yytext);} \" {maca_tokenizer_segment((char *)"\"", yytext);} « {maca_tokenizer_segment((char *)"«", yytext);} -[^ ]*' {maca_tokenizer_segment((char *)yytext, yytext);} -[^ ]*’ {maca_tokenizer_segment((char *)yytext, yytext);} +{nosepar}*' {maca_tokenizer_segment((char *)yytext, yytext);} +{nosepar}*’ {maca_tokenizer_segment((char *)yytext, yytext);} [0-9]+,[0-9]+ {maca_tokenizer_segment(yytext, yytext);} @@ -52,7 +56,7 @@ extern char *token; -elle {maca_tokenizer_segment((char *)"-elle", yytext);} -t-elles {maca_tokenizer_segment((char *)"-t-elles", yytext);} -elles {maca_tokenizer_segment((char *)"-elles", yytext);} -\n+ {maca_tokenizer_segment((char *)"", yytext);} +-là {maca_tokenizer_segment((char *)"-là", yytext);} . {maca_tokenizer_add_char_to_token(yytext[0]);} <state_defait_amalgames>{