Skip to content
Snippets Groups Projects
Commit 55796a1c authored by Alexis Nasr
Browse files

fixed bug in tokenization rules for French

parent b564c719
No related branches found
No related tags found
No related merge requests found
...@@ -29,15 +29,15 @@ nosepar [^ \t\n] ...@@ -29,15 +29,15 @@ nosepar [^ \t\n]
\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} \<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
{separ}+ {maca_tokenizer_segment((char *)"", yytext);} {separ}+ {maca_tokenizer_segment((char *)"", yytext);}
{separ}*\. {maca_tokenizer_segment((char *)".", yytext);} \. {maca_tokenizer_segment((char *)".", yytext);}
{separ}*\? {maca_tokenizer_segment((char *)"?", yytext);} \? {maca_tokenizer_segment((char *)"?", yytext);}
{separ}*\! {maca_tokenizer_segment((char *)"!", yytext);} \! {maca_tokenizer_segment((char *)"!", yytext);}
{separ}*, {maca_tokenizer_segment((char *)",", yytext);} , {maca_tokenizer_segment((char *)",", yytext);}
{separ}*: {maca_tokenizer_segment((char *)":", yytext);} : {maca_tokenizer_segment((char *)":", yytext);}
{separ}*; {maca_tokenizer_segment((char *)";", yytext);} ; {maca_tokenizer_segment((char *)";", yytext);}
{separ}*… {maca_tokenizer_segment((char *)"…", yytext);} … {maca_tokenizer_segment((char *)"…", yytext);}
{separ}*\) {maca_tokenizer_segment((char *)")", yytext);} \) {maca_tokenizer_segment((char *)")", yytext);}
{separ}*» {maca_tokenizer_segment((char *)"»", yytext);} » {maca_tokenizer_segment((char *)"»", yytext);}
\( {maca_tokenizer_segment((char *)"(", yytext);} \( {maca_tokenizer_segment((char *)"(", yytext);}
\" {maca_tokenizer_segment((char *)"\"", yytext);} \" {maca_tokenizer_segment((char *)"\"", yytext);}
« {maca_tokenizer_segment((char *)"«", yytext);} « {maca_tokenizer_segment((char *)"«", yytext);}
......
...@@ -28,7 +28,7 @@ void maca_tokenizer_segment(char *separator, char *text_matched){ ...@@ -28,7 +28,7 @@ void maca_tokenizer_segment(char *separator, char *text_matched){
if(print_offset) if(print_offset)
printf("\t%d", offset); printf("\t%d", offset);
if(print_token_length) if(print_token_length)
printf("\t%d", (int) strlen(separator)); printf("\t%d", (int) utf8_strlen(separator));
printf("\n"); printf("\n");
} }
offset += utf8_strlen(text_matched); offset += utf8_strlen(text_matched);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment