From 0442bfb6e491fcf250ecc480b450aff7feea2175 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Mon, 21 Nov 2016 22:26:32 -0500
Subject: [PATCH] defined separate tokenization rules for french and english.
 Fixed some bugs in maca_lexer.

---
 maca_common/include/trie.h                    |  13 ++
 maca_lexer/src/context.c                      |   1 +
 maca_lexer/src/context.h                      |   1 +
 maca_lexer/src/maca_lexer.c                   |  31 +++-
 maca_tokenizer/CMakeLists.txt                 |  18 ++-
 maca_tokenizer/main.c                         |  10 --
 maca_tokenizer/src/context.c                  | 159 ++++++++++++++++++
 maca_tokenizer/src/context.h                  |  35 ++++
 maca_tokenizer/src/en_tok_rules.l             |  27 ++++
 .../{tok_rules.l => src/fr_tok_rules.l}       |   4 +
 maca_tokenizer/src/maca_tokenizer.c           |  51 +++++
 11 files changed, 330 insertions(+), 20 deletions(-)
 delete mode 100644 maca_tokenizer/main.c
 create mode 100644 maca_tokenizer/src/context.c
 create mode 100644 maca_tokenizer/src/context.h
 create mode 100644 maca_tokenizer/src/en_tok_rules.l
 rename maca_tokenizer/{tok_rules.l => src/fr_tok_rules.l} (90%)
 create mode 100644 maca_tokenizer/src/maca_tokenizer.c

diff --git a/maca_common/include/trie.h b/maca_common/include/trie.h
index 0ec45ad..697c6e9 100644
--- a/maca_common/include/trie.h
+++ b/maca_common/include/trie.h
@@ -21,6 +21,19 @@ typedef struct {
   int states_nb;
 } trie;
 
+typedef struct {
+  int state;
+  int symbol;
+} state_symbol;
+
+typedef struct {
+  int size;
+  state_symbol *array;
+  int nbelem;
+} trie_path;
+
+
+
 trie_state *trie_state_new(trie_trans *transitions, int is_accept);
 void trie_state_free(trie_state *state);
 
diff --git a/maca_lexer/src/context.c b/maca_lexer/src/context.c
index b6d2b5e..bce96ef 100644
--- a/maca_lexer/src/context.c
+++ b/maca_lexer/src/context.c
@@ -40,6 +40,7 @@ context *context_new(void)
   ctx->mwe_filename = NULL;
   ctx->mwe_tokens_dico_filename = NULL;
   ctx->mwe_tokens_separator = strdup(" ");
+  ctx->paste = 1;
 
   return ctx;
 }
diff --git a/maca_lexer/src/context.h b/maca_lexer/src/context.h
index 376f1b6..1ad410d 100644
--- a/maca_lexer/src/context.h
+++ b/maca_lexer/src/context.h
@@ -22,6 +22,7 @@ typedef struct {
   char *mwe_filename;
   char *mwe_tokens_dico_filename;
   char *mwe_tokens_separator;
+  int paste;
 } context;
 
 context *context_new(void);
diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c
index da78d6f..04e56bf 100644
--- a/maca_lexer/src/maca_lexer.c
+++ b/maca_lexer/src/maca_lexer.c
@@ -22,7 +22,6 @@ void maca_lexer_help_message(context *ctx)
   context_vocab_help_message(ctx);
 }
 
-
 void maca_lexer_check_options(context *ctx){
   if(ctx->help){
     maca_lexer_help_message(ctx);
@@ -111,25 +110,41 @@ int main(int argc, char *argv[])
     if(states_array[path_index] == 0){ /* in initial state of trie */
       /* nothing has been recognized */
       if(path_index == 0)
-        printf("%s\n", buffer);
+        if(ctx->paste)
+          printf("%s\n", buffer);
+        else
+          printf("%s\t1\n", buffer);
       else{ /* there is something in the path */
         int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index);
         /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
         for(i=0; i <= accept_state_index; i++){
-          if(i > 0) printf("%s", ctx->mwe_tokens_separator);
-          printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
+          if(ctx->paste){
+            if(i > 0) printf("%s", ctx->mwe_tokens_separator);
+            printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
+          }
+          else{
+            if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+            else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+          }
         }
-        if(accept_state_index != -1) printf("\n");
+        if(ctx->paste)
+          if(accept_state_index != -1) printf("\n");
         /* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
         for(i = accept_state_index + 1; i < path_index; i++){
-          printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+          if(ctx->paste)
+            printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
+          else
+            printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
         }
         /* do not forget to print the current token */
-        printf("%s\n", buffer);
+        if(ctx->paste)
+          printf("%s\n", buffer);
+        else
+          printf("%s\t1\n", buffer);
         path_index = 0;
       }
     }
-    /* not in state 0 of trie */
+    /* not in state 0 of trie we are processing tokens of a potential mwe */
     else{
       path_index++;
     }
diff --git a/maca_tokenizer/CMakeLists.txt b/maca_tokenizer/CMakeLists.txt
index d524f50..f100c0b 100644
--- a/maca_tokenizer/CMakeLists.txt
+++ b/maca_tokenizer/CMakeLists.txt
@@ -1,4 +1,18 @@
-FLEX_TARGET(tokenizer tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/maca_tokenizer.c)
+FLEX_TARGET(fr_tok_rules ./src/fr_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/fr_lex.c)
+FLEX_TARGET(en_tok_rules ./src/en_tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/en_lex.c)
+
+set(SOURCES ./src/context.c
+ ${FLEX_fr_tok_rules_OUTPUTS}
+ ${FLEX_en_tok_rules_OUTPUTS})
+##compiling library
+include_directories(./src)
+
+add_library(maca_tokenizer_lib STATIC ${SOURCES})
+
+
+
+
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
-add_executable(maca_tokenizer main.c ${FLEX_tokenizer_OUTPUTS})
+add_executable(maca_tokenizer ./src/maca_tokenizer.c)
+target_link_libraries(maca_tokenizer maca_tokenizer_lib maca_common)
 install (TARGETS maca_tokenizer DESTINATION bin)
diff --git a/maca_tokenizer/main.c b/maca_tokenizer/main.c
deleted file mode 100644
index f3586e0..0000000
--- a/maca_tokenizer/main.c
+++ /dev/null
@@ -1,10 +0,0 @@
-int defait_amalgames = 0;
-
-int main(int argc, char* argv[]) {
-
-  if(argc > 1) defait_amalgames = 1;
-  yylex() ;
-
-  return 0;
-}
-
diff --git a/maca_tokenizer/src/context.c b/maca_tokenizer/src/context.c
new file mode 100644
index 0000000..25a3414
--- /dev/null
+++ b/maca_tokenizer/src/context.c
@@ -0,0 +1,159 @@
+#include<stdlib.h>
+#include<stdio.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include "context.h"
+#include "util.h"
+
+
+void context_set_linguistic_resources_filenames(context *ctx);
+
+/* Release a context and the strings it owns.
+   NOTE(review): ctx->mcd_struct is not released here; confirm which
+   mcd.h routine is responsible for freeing it. */
+void context_free(context *ctx)
+{
+  free(ctx->program_name);
+  free(ctx->input_filename);
+  free(ctx->output_filename);
+  free(ctx->language);
+  free(ctx->maca_data_path);
+  free(ctx->mcd_filename); /* strdup'ed by the -C option */
+  free(ctx);
+}
+
+/* Allocate a context filled with default values (language is "fr"). */
+context *context_new(void)
+{
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->debug_mode = 0;
+  ctx->program_name = NULL;
+  ctx->mcd_filename = NULL;
+  ctx->mcd_struct = NULL;
+  ctx->language = strdup("fr");
+  ctx->maca_data_path = NULL;
+  ctx->input_filename = NULL;
+  ctx->output_filename = NULL;
+  return ctx;
+}
+
+/* Print the usage line and the general options on stderr. */
+void context_general_help_message(context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-d --debug : activate debug mode\n");
+}
+
+/* Print the help line of the -i option on stderr. */
+void context_input_help_message(context *ctx){
+  fprintf(stderr, "\t-i --input <file> : input mcf file name\n");
+}
+
+/* Print the help line of the -C option on stderr. */
+void context_mcd_help_message(context *ctx){
+  fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n");
+}
+
+/* Print the help line of the -L option on stderr. */
+void context_language_help_message(context *ctx){
+  fprintf(stderr, "\t-L --language : identifier of the language to use\n");
+}
+
+/* Build a context from the command line options.
+   Unrecognized options are ignored: opterr is 0 and '?' has no case.
+   Without -C, the structure returned by mcd_build_wplgf() is used. */
+context *context_read_options(int argc, char *argv[])
+{
+  int c;
+  int option_index = 0;
+  context *ctx = context_new();
+
+  ctx->program_name = strdup(argv[0]);
+
+  /* getopt_long() requires the array to end with an all-zero element */
+  static struct option long_options[] =
+    {
+      {"help", no_argument, 0, 'h'},
+      {"verbose", no_argument, 0, 'v'},
+      {"debug", no_argument, 0, 'd'},
+      {"input", required_argument, 0, 'i'},
+      {"output", required_argument, 0, 'o'},
+      {"mcd", required_argument, 0, 'C'},
+      {"language", required_argument, 0, 'L'},
+      {"maca_data_path", required_argument, 0, 'D'},
+      {0, 0, 0, 0}
+    };
+  optind = 0;
+  opterr = 0;
+
+  while ((c = getopt_long (argc, argv, "hvdi:o:C:L:D:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'd':
+        ctx->debug_mode = 1;
+        break;
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'i':
+        ctx->input_filename = strdup(optarg);
+        break;
+      case 'o':
+        ctx->output_filename = strdup(optarg);
+        break;
+      case 'C':
+        ctx->mcd_filename = strdup(optarg);
+        break;
+      case 'L':
+        free(ctx->language); /* drop the default set by context_new() */
+        ctx->language = strdup(optarg);
+        break;
+      case 'D':
+        ctx->maca_data_path = strdup(optarg);
+        break;
+      }
+  }
+
+  context_set_linguistic_resources_filenames(ctx);
+
+
+  if(ctx->mcd_filename)
+    ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose);
+
+
+  if(ctx->mcd_filename == NULL)
+    /* ctx->mcd_struct = mcd_build_conll07(); */
+    ctx->mcd_struct = mcd_build_wplgf();
+
+  return ctx;
+}
+
+/* Compute the language specific resource directory,
+   <maca_data_path or $MACAON_DIR>/<language>/bin/.
+   The path is built in a local buffer and is not used afterwards. */
+void context_set_linguistic_resources_filenames(context *ctx)
+{
+  char absolute_path[500];
+  const char *base = ctx->maca_data_path;
+
+  if(base == NULL){
+    base = getenv("MACAON_DIR");
+    if(base == NULL){
+      fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n");
+      base = "";
+    }
+  }
+
+  /* snprintf truncates instead of overflowing when the directory name is long */
+  snprintf(absolute_path, sizeof(absolute_path), "%s/%s/bin/", base, ctx->language);
+}
diff --git a/maca_tokenizer/src/context.h b/maca_tokenizer/src/context.h
new file mode 100644
index 0000000..f9c3ce0
--- /dev/null
+++ b/maca_tokenizer/src/context.h
@@ -0,0 +1,35 @@
+#ifndef MACA_TOKENIZER_CONTEXT_H
+#define MACA_TOKENIZER_CONTEXT_H
+
+#include "mcd.h"
+#include <stdlib.h>
+
+#define DEFAULT_MWE_TOKENS_DICO_FILENAME "d_tokens.dico"
+#define DEFAULT_MWE_FILENAME "mwe"
+
+/* Command line settings of maca_tokenizer */
+typedef struct {
+  int help; /* set by -h */
+  int verbose; /* set by -v */
+  int debug_mode; /* set by -d */
+  char *program_name; /* copy of argv[0] */
+  char *language; /* -L, defaults to "fr" */
+  char *maca_data_path; /* -D */
+  char *mcd_filename; /* -C */
+  mcd *mcd_struct;
+  char *input_filename; /* -i */
+  char *output_filename; /* -o */
+} context;
+
+context *context_new(void);
+void context_free(context *ctx);
+
+context *context_read_options(int argc, char *argv[]);
+void context_general_help_message(context *ctx);
+void context_conll_help_message(context *ctx);
+void context_input_help_message(context *ctx);
+void context_language_help_message(context *ctx);
+void context_maca_data_path_help_message(context *ctx);
+void context_mcd_help_message(context *ctx);
+
+#endif
diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l
new file mode 100644
index 0000000..0e8ea5b
--- /dev/null
+++ b/maca_tokenizer/src/en_tok_rules.l
@@ -0,0 +1,27 @@
+%{
+#include <stdio.h>
+extern int defait_amalgames;
+%}
+
+%option prefix="en"
+
+%option noyywrap
+%%
+[0-9]+\.[0-9]+ printf("%s", yytext);
+[ \t]+ printf("\n");
+\. printf("\n.");
+\, printf("\n,");
+don't printf("do\nnot");
+don’t printf("do\nnot");
+doesn't printf("does\nnot");
+doesn’t printf("does\nnot");
+won't printf("will\nnot");
+won’t printf("will\nnot");
+cannot printf("can\nnot");
+wanna printf("want\nto");
+'s printf("\n's");
+’s printf("\n's");
+\n+ printf("\n");
+
+
+%%
diff --git a/maca_tokenizer/tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l
similarity index 90%
rename from maca_tokenizer/tok_rules.l
rename to maca_tokenizer/src/fr_tok_rules.l
index 4cd823d..dd6055b 100644
--- a/maca_tokenizer/tok_rules.l
+++ b/maca_tokenizer/src/fr_tok_rules.l
@@ -2,6 +2,10 @@
 #include <stdio.h>
 extern int defait_amalgames;
 %}
+
+%option prefix="fr"
+/*%option outfile="fr_lex.c"*/
+
 %option noyywrap
 %s state_defait_amalgames
 %s state_num
diff --git a/maca_tokenizer/src/maca_tokenizer.c b/maca_tokenizer/src/maca_tokenizer.c
new file mode 100644
index 0000000..d9e9a18
--- /dev/null
+++ b/maca_tokenizer/src/maca_tokenizer.c
@@ -0,0 +1,51 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"context.h"
+
+/* scanners generated by flex from en_tok_rules.l and fr_tok_rules.l (%option prefix) */
+int enlex(void);
+int frlex(void);
+
+int defait_amalgames = 0;
+
+/* Print the help message of maca_tokenizer on stderr. */
+void maca_tokenizer_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  fprintf(stderr, "INPUT\n");
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_language_help_message(ctx);
+}
+
+/* Print the help message and exit when -h was given. */
+void maca_tokenizer_check_options(context *ctx){
+  if(ctx->help){
+    maca_tokenizer_help_message(ctx);
+    exit(1);
+  }
+}
+
+
+/* Run the English scanner when the language is "en", the French one otherwise. */
+int main(int argc, char* argv[])
+{
+
+  context *ctx;
+
+  ctx = context_read_options(argc, argv);
+  maca_tokenizer_check_options(ctx);
+
+  if(!strcmp(ctx->language, "en"))
+    enlex() ;
+  else
+    frlex() ;
+
+
+  /* if(argc > 1) defait_amalgames = 1; */
+
+  context_free(ctx);
+  return 0;
+}
+
-- 
GitLab