From 55796a1c92366bcc7b74cb43f06ad72dab85376e Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Fri, 10 Nov 2017 10:19:46 +0100
Subject: [PATCH] fixed bug in tokenization rules for french

---
 maca_tokenizer/src/fr_tok_rules.l              | 18 +++++++++---------
 .../src/maca_tokenizer_functions_for_lex.c     |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l
index 63af3d2..907beaa 100644
--- a/maca_tokenizer/src/fr_tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -29,15 +29,15 @@ nosepar [^ \t\n]
 \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
 {separ}+ {maca_tokenizer_segment((char *)"", yytext);}
-{separ}*\. {maca_tokenizer_segment((char *)".", yytext);}
-{separ}*\? {maca_tokenizer_segment((char *)"?", yytext);}
-{separ}*\! {maca_tokenizer_segment((char *)"!", yytext);}
-{separ}*, {maca_tokenizer_segment((char *)",", yytext);}
-{separ}*: {maca_tokenizer_segment((char *)":", yytext);}
-{separ}*; {maca_tokenizer_segment((char *)";", yytext);}
-{separ}*… {maca_tokenizer_segment((char *)"…", yytext);}
-{separ}*\) {maca_tokenizer_segment((char *)")", yytext);}
-{separ}*» {maca_tokenizer_segment((char *)"»", yytext);}
+\. {maca_tokenizer_segment((char *)".", yytext);}
+\? {maca_tokenizer_segment((char *)"?", yytext);}
+\! {maca_tokenizer_segment((char *)"!", yytext);}
+, {maca_tokenizer_segment((char *)",", yytext);}
+: {maca_tokenizer_segment((char *)":", yytext);}
+; {maca_tokenizer_segment((char *)";", yytext);}
+… {maca_tokenizer_segment((char *)"…", yytext);}
+\) {maca_tokenizer_segment((char *)")", yytext);}
+» {maca_tokenizer_segment((char *)"»", yytext);}
 \( {maca_tokenizer_segment((char *)"(", yytext);}
 \" {maca_tokenizer_segment((char *)"\"", yytext);}
 « {maca_tokenizer_segment((char *)"«", yytext);}
 
diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
index 55b7635..6e503c4 100644
--- a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
+++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
@@ -28,7 +28,7 @@ void maca_tokenizer_segment(char *separator, char *text_matched){
     if(print_offset)
       printf("\t%d", offset);
     if(print_token_length)
-      printf("\t%d", (int) strlen(separator));
+      printf("\t%d", (int) utf8_strlen(separator));
     printf("\n");
   }
   offset += utf8_strlen(text_matched);
-- 
GitLab