diff --git a/build/debug.sh b/build/debug.sh
deleted file mode 100755
index ec037e3c185c39ac81f861094f0c7bb033df1a78..0000000000000000000000000000000000000000
--- a/build/debug.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-cmake -DCMAKE_BUILD_TYPE=Debug ..
-make
-sudo make install
diff --git a/build/script.sh b/build/script.sh
deleted file mode 100755
index 9485f62e027a988d84b15033b788b4b2d5d1970a..0000000000000000000000000000000000000000
--- a/build/script.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-cmake ..
-make
-sudo make install
diff --git a/maca_tokenizer/CMakeLists.txt b/maca_tokenizer/CMakeLists.txt
index b2e874ca1fb06951ff72b2a18986de00673e3992..0952a2b1400718b74b095ef9aeb76e18976c014a 100644
--- a/maca_tokenizer/CMakeLists.txt
+++ b/maca_tokenizer/CMakeLists.txt
@@ -2,6 +2,7 @@ FLEX_TARGET(fr_tok_rules ./src/fr_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/fr_lex
 FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex.c)
 set(SOURCES ./src/context.c
+  ./src/maca_tokenizer_functions_for_lex.c
   ${FLEX_fr_tok_rules_OUTPUTS}
   ${FLEX_en_tok_rules_OUTPUTS})
diff --git a/maca_tokenizer/src/context.c b/maca_tokenizer/src/context.c
index 9c9236bab08ac75200c457759ded3a78611046e3..949bbcc201f1f26eafd7462a89e3750fa57287b0 100644
--- a/maca_tokenizer/src/context.c
+++ b/maca_tokenizer/src/context.c
@@ -32,18 +32,11 @@ context *context_new(void)
   ctx->maca_data_path = NULL;
   ctx->input_filename = NULL;
   ctx->output_filename = NULL;
+  ctx->print_offset = 0;
+  ctx->print_token_length = 0;
   return ctx;
 }
 
-void context_general_help_message(context *ctx)
-{
-  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
-  fprintf(stderr, "Options:\n");
-  fprintf(stderr, "\t-h --help : print this message\n");
-  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
-  fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n");
-}
-
 void context_input_help_message(context *ctx){
   fprintf(stderr, "\t-i --input <file> : input mcf file name\n");
 }
@@ -56,6 +49,22 @@ void context_language_help_message(context *ctx){
   fprintf(stderr, "\t-L --language : identifier of the language to use\n");
 }
 
+void context_print_offset_message(context *ctx){
+  fprintf(stderr, "\t-p --print_offset : print offset and token length\n");
+}
+
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n");
+  context_print_offset_message(ctx);
+}
+
+
+
 context *context_read_options(int argc, char *argv[])
 {
   int c;
@@ -64,11 +73,12 @@ context *context_read_options(int argc, char *argv[])
 
   ctx->program_name = strdup(argv[0]);
 
-  static struct option long_options[8] =
+  static struct option long_options[9] =
     {
      {"help", no_argument, 0, 'h'},
      {"verbose", no_argument, 0, 'v'},
      {"debug", no_argument, 0, 'd'},
+     {"print_offset", no_argument, 0, 'p'},
      {"input", required_argument, 0, 'i'},
      {"output", required_argument, 0, 'o'},
      {"mcd", required_argument, 0, 'C'},
@@ -78,7 +88,7 @@
   optind = 0;
   opterr = 0;
 
-  while ((c = getopt_long (argc, argv, "hvdi:o:C:L:D:", long_options, &option_index)) != -1){
+  while ((c = getopt_long (argc, argv, "hvdpi:o:C:L:D:", long_options, &option_index)) != -1){
    switch (c)
      {
      case 'd':
@@ -105,6 +115,10 @@ context *context_read_options(int argc, char *argv[])
      case 'D':
       ctx->maca_data_path = strdup(optarg);
       break;
+     case 'p':
+      ctx->print_offset = 1;
+      ctx->print_token_length = 1;
+      break;
      }
  }
 
diff --git a/maca_tokenizer/src/context.h b/maca_tokenizer/src/context.h
index ce7e8f1d68cfd1f1137a59906142e631e0343f95..0ab89a195b33cd5b9c359a73821ae84ca0113c7b 100644
--- a/maca_tokenizer/src/context.h
+++ b/maca_tokenizer/src/context.h
@@ -18,6 +18,8 @@ typedef struct {
   mcd *mcd_struct;
   char *input_filename;
   char *output_filename;
+  int print_offset;
+  int print_token_length;
 } context;
 
 context *context_new(void);
diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l
index 59b1909acc4c347676e79d025d2efb993d1c6af3..0f2b0cb8665e406e9c4c58eff105f603f1282bbb 100644
--- a/maca_tokenizer/src/fr_tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -1,6 +1,13 @@
 %{
-#include <stdio.h>
+#include<stdio.h>
+#include"maca_tokenizer_functions_for_lex.h"
+
 extern int defait_amalgames;
+/*extern int print_offset;
+extern int print_token_length;*/
+int offset = 0;
+int token_length = 0;
+char token[10000];
 %}
 
 %option prefix="fr"
@@ -12,36 +19,39 @@ extern int defait_amalgames;
 if(defait_amalgames){
   BEGIN(state_defait_amalgames);
  }
+\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);}
+[ \t]+ {maca_tokenizer_segment((char *)"", yytext);}
+[ ]*\. {maca_tokenizer_segment((char *)".", yytext);}
+[ ]*\? {maca_tokenizer_segment((char *)"?", yytext);}
+[ ]*\! {maca_tokenizer_segment((char *)"!", yytext);}
+[ ]*, {maca_tokenizer_segment((char *)",", yytext);}
+[ ]*: {maca_tokenizer_segment((char *)":", yytext);}
+[ ]*; {maca_tokenizer_segment((char *)";", yytext);}
+[ ]*… {maca_tokenizer_segment((char *)"…", yytext);}
+[ ]*\) {maca_tokenizer_segment((char *)")", yytext);}
+[ ]*» {maca_tokenizer_segment((char *)"»", yytext);}
+\( {maca_tokenizer_segment((char *)"((", yytext);}
+' {maca_tokenizer_segment((char *)"'", yytext);}
+’ {maca_tokenizer_segment((char *)"'", yytext);}
+\" {maca_tokenizer_segment((char *)"\"", yytext);}
+« {maca_tokenizer_segment((char *)"«", yytext);}
+
+[0-9]+,[0-9]+ {maca_tokenizer_segment(yytext, yytext);}
 
-[0-9]+,[0-9]+ printf("%s", yytext);
-[ \t]+ printf("\n");
-[ ]*\. printf("\n.");
-[ ]*\? printf("\n?");
-[ ]*\! printf("\n!");
-[ ]*, printf("\n,");
-[ ]*: printf("\n:");
-[ ]*; printf("\n;");
-[ ]*… printf("\n…");
-[ ]*\) printf("\n)");
-[ ]*» printf("\n»");
-\( printf("(\n");
-' printf("'\n");
-’ printf("'\n");
-\" printf("\"\n");
-« printf("»\n");
--je printf("\n-je");
--tu printf("\n-tu");
--on printf("\n-on");
--ce printf("\n-ce");
--t-il printf("\n-t-il");
--il printf("\n-il");
--t-ils printf("\n-t-ils");
--ils printf("\n-ils");
--t-elle printf("\n-t-elle");
--elle printf("\n-elle");
--t-elles printf("\n-t-elles");
--elles printf("\n-elles");
-\n+ printf("\n");
+-je {maca_tokenizer_segment((char *)"-je", yytext);}
+-tu {maca_tokenizer_segment((char *)"-tu", yytext);}
+-on {maca_tokenizer_segment((char *)"-on", yytext);}
+-ce {maca_tokenizer_segment((char *)"-ce", yytext);}
+-t-il {maca_tokenizer_segment((char *)"-t-il", yytext);}
+-il {maca_tokenizer_segment((char *)"-il", yytext);}
+-t-ils {maca_tokenizer_segment((char *)"-t-ils", yytext);}
+-ils {maca_tokenizer_segment((char *)"-ils", yytext);}
+-t-elle {maca_tokenizer_segment((char *)"-t-elle", yytext);}
+-elle {maca_tokenizer_segment((char *)"-elle", yytext);}
+-t-elles {maca_tokenizer_segment((char *)"-t-elles", yytext);}
+-elles {maca_tokenizer_segment((char *)"-elles", yytext);}
+\n+ {maca_tokenizer_segment((char *)"", yytext);}
+. {maca_tokenizer_add_char_to_token(yytext[0]);}
 
 <state_defait_amalgames>{
 " du " printf("\nde\nle\n");
diff --git a/maca_tokenizer/src/maca_tokenizer.c b/maca_tokenizer/src/maca_tokenizer.c
index 6768e5da317a68bb98da83331151f63d61f1ffcc..524baa61360d1191c4d25aeef7794bb91c36769e 100644
--- a/maca_tokenizer/src/maca_tokenizer.c
+++ b/maca_tokenizer/src/maca_tokenizer.c
@@ -7,6 +7,8 @@ int enlex(void);
 int frlex(void);
 
 int defait_amalgames = 0;
+int print_offset = 0;
+int print_token_length = 0;
 
 void maca_tokenizer_help_message(context *ctx)
 {
@@ -32,6 +34,9 @@ int main(int argc, char* argv[])
 
   ctx = context_read_options(argc, argv);
   maca_tokenizer_check_options(ctx);
+
+  print_offset = ctx->print_offset;
+  print_token_length = ctx->print_token_length;
 
   if(!strcmp(ctx->language, "en"))
     enlex() ;
diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
new file mode 100644
index 0000000000000000000000000000000000000000..8f058c970808911af8ac0f13b0591fe865c86819
--- /dev/null
+++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.c
@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <string.h>
+#include "char16.h"
+
+extern int offset;
+extern int token_length;
+extern char token[];
+/*extern char *yytext;*/
+extern int print_offset;
+extern int print_token_length;
+
+void maca_tokenizer_segment(char *separator, char *xx){
+  if(token_length != 0){
+    printf("%s", token);
+    if(print_offset)
+      printf("\t%d", offset);
+    if(print_token_length)
+      printf("\t%d", utf8_strlen(token));
+    printf("\n");
+  }
+
+  offset += utf8_strlen(token);
+  token_length = 0;
+
+  if(strlen(separator) != 0){
+    printf("%s", separator);
+    if(print_offset)
+      printf("\t%d", offset);
+    if(print_token_length)
+      printf("\t%d", (int) strlen(separator));
+    printf("\n");
+  }
+  offset += strlen(xx);
+}
+
+
+void maca_tokenizer_add_char_to_token(char c)
+{
+  token[token_length] = c;
+  token_length++;
+  token[token_length] = 0;
+}
diff --git a/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h
new file mode 100644
index 0000000000000000000000000000000000000000..a57ffe63c753d77c3122fcbdfcc91c68a5fa5717
--- /dev/null
+++ b/maca_tokenizer/src/maca_tokenizer_functions_for_lex.h
@@ -0,0 +1,3 @@
+
+void maca_tokenizer_segment(char *separator, char *xx);
+void maca_tokenizer_add_char_to_token(char c);
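
Illustrative harness (not part of the patch above): a minimal sketch of how the two new helpers are meant to be driven, mimicking what the rewritten fr_tok_rules.l rules do once -p / --print_offset has switched both flags on. The file name, the build line, and the globals defined here (normally owned by fr_tok_rules.l and maca_tokenizer.c) are assumptions, and utf8_strlen() must come from the project code behind char16.h, which this diff does not contain.

/* demo_print_offset.c -- hypothetical example, not shipped with the patch.
 * Suggested build (assumption): compile together with
 * maca_tokenizer_functions_for_lex.c and the file that implements
 * utf8_strlen() declared in char16.h, e.g.
 *   gcc demo_print_offset.c maca_tokenizer_functions_for_lex.c char16.c
 */
#include "maca_tokenizer_functions_for_lex.h"

/* Globals normally defined in fr_tok_rules.l and maca_tokenizer.c. */
int offset = 0;
int token_length = 0;
char token[10000];
int print_offset = 1;        /* set to 1 by the new -p option */
int print_token_length = 1;  /* -p turns on both flags        */

/* Mimics the catch-all "." rule, which accumulates one character at a time. */
static void add_word(const char *w)
{
  for (; *w; w++)
    maca_tokenizer_add_char_to_token(*w);
}

int main(void)
{
  add_word("Bonjour");
  maca_tokenizer_segment((char *)",", (char *)",");   /* like the "[ ]*," rule  */
  add_word("monde");
  maca_tokenizer_segment((char *)".", (char *)".");   /* like the "[ ]*\." rule */
  /* Each flush prints the pending token and then the separator, one per line,
   * each followed by a tab-separated offset and length, e.g.
   * "Bonjour<TAB>0<TAB>7" then ",<TAB>7<TAB>1". */
  return 0;
}

In the real binary these two flags are not set by hand: context_read_options() stores them in the context when -p is parsed, and main() copies them into the globals read by maca_tokenizer_segment().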