diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index 63af3d2e44fc0579e91c7eedf8283bbb4b3e8372..907beaaed67fccb2883872a844d1feaa36b5f8b1 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -29,15 +29,15 @@ nosepar [^ \t\n] \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} {separ}+ {maca_tokenizer_segment((char *)"", yytext);} -{separ}*\. {maca_tokenizer_segment((char *)".", yytext);} -{separ}*\? {maca_tokenizer_segment((char *)"?", yytext);} -{separ}*\! {maca_tokenizer_segment((char *)"!", yytext);} -{separ}*, {maca_tokenizer_segment((char *)",", yytext);} -{separ}*: {maca_tokenizer_segment((char *)":", yytext);} -{separ}*; {maca_tokenizer_segment((char *)";", yytext);} -{separ}*… {maca_tokenizer_segment((char *)"…", yytext);} -{separ}*\) {maca_tokenizer_segment((char *)")", yytext);} -{separ}*» {maca_tokenizer_segment((char *)"»", yytext);} +\. {maca_tokenizer_segment((char *)".", yytext);} +\? {maca_tokenizer_segment((char *)"?", yytext);} +\! {maca_tokenizer_segment((char *)"!", yytext);} +, {maca_tokenizer_segment((char *)",", yytext);} +: {maca_tokenizer_segment((char *)":", yytext);} +; {maca_tokenizer_segment((char *)";", yytext);} +… {maca_tokenizer_segment((char *)"…", yytext);} +\) {maca_tokenizer_segment((char *)")", yytext);} +» {maca_tokenizer_segment((char *)"»", yytext);} \( {maca_tokenizer_segment((char *)"(", yytext);} \" {maca_tokenizer_segment((char *)"\"", yytext);} « {maca_tokenizer_segment((char *)"«", yytext);} diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c index 55b7635560267030afbd2ab206c2a2de875655d8..6e503c4503a106c788a30ebbaba0cf6d7fa76b13 100644 --- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c +++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c @@ -28,7 +28,7 @@ void maca_tokenizer_segment(char *separator, char *text_matched){ if(print_offset) printf("\t%d", offset); if(print_token_length) - printf("\t%d", (int) strlen(separator)); + printf("\t%d", (int) utf8_strlen(separator)); printf("\n"); } offset += utf8_strlen(text_matched);