/*
 * Patch summary (unified diff, three files under maca_lexer/src).
 *
 * context.c
 *   - Removes all fplm_filename handling: its free() in context_free(), its
 *     NULL initialisation in context_new(), context_fplm_help_message(), the
 *     "fplm" long option, the "f:" short option and the 'f' switch case.
 *   - Adds mwe_tokens_separator: freed in context_free(), initialised to
 *     strdup(" ") in context_new(), and settable with -s / --mwe_sep.
 *   - Adds the help printers context_mwe_token_separator_help_message(),
 *     context_mwe_filename_help_message() and context_vocab_help_message().
 *   - long_options is resized from 12 to 13 slots.
 *
 * context.h
 *   - DEFAULT_MWE_TOKENS_DICO_FILENAME changes from "mwe_tokens" to
 *     "d_tokens.dico".
 *   - The fplm_filename field and the context_fplm_help_message() prototype
 *     are removed; the mwe_tokens_separator field is added.
 *
 * maca_lexer.c
 *   - Adds maca_lexer_help_message(), maca_lexer_check_options() (prints the
 *     help text and calls exit(1) when ctx->help is set) and
 *     look_for_accept_state_in_path(), which scans states_array from
 *     path_index - 1 down to 0 and returns the first index whose trie state
 *     has is_accept set, or -1 when there is none.
 *   - main() now calls maca_lexer_check_options(), reports the mwe and
 *     vocabulary file names on stderr when ctx->verbose is set, and, when the
 *     trie falls back to state 0 with a non-empty path, prints the tokens up
 *     to the last accept state joined by ctx->mwe_tokens_separator on one
 *     line, then the remaining path tokens and the current token one per line.
 *
 * NOTE(review): case 's' assigns strdup(optarg) over the strdup(" ") default
 *   set in context_new() without freeing it first; looks like a small leak.
 * NOTE(review): the help text advertises "-M" for both --maca_data_path and
 *   --mwe, while the option table maps maca_data_path to 'D' and mwe to 'M'.
 * NOTE(review): the context.h hunks shown here add no prototypes for the three
 *   new help printers called from maca_lexer.c; confirm they are declared.
 * NOTE(review): on fallback, tokens after the last accept state and the
 *   current token are printed as-is and not re-matched from the trie root;
 *   confirm that an expression starting at one of them may be skipped.
 * NOTE(review): nothing visible between the end of the loop and "return 0"
 *   prints tokens still buffered in the path; confirm end-of-input handling.
 */
diff --git a/maca_lexer/src/context.c b/maca_lexer/src/context.c index 6a281826a707db0dab8c1375d4ca6c8a962595c9..b6d2b5e7b6e1c2e4da60fe277590e49f0f202751 100644 --- a/maca_lexer/src/context.c +++ b/maca_lexer/src/context.c @@ -14,11 +14,11 @@ void context_free(context *ctx) if(ctx->program_name) free(ctx->program_name); if(ctx->input_filename) free(ctx->input_filename); if(ctx->output_filename) free(ctx->output_filename); - if(ctx->fplm_filename) free(ctx->fplm_filename); if(ctx->language) free(ctx->language); if(ctx->maca_data_path) free(ctx->maca_data_path); if(ctx->mwe_filename) free(ctx->mwe_filename); if(ctx->mwe_tokens_dico_filename) free(ctx->mwe_tokens_dico_filename); + if(ctx->mwe_tokens_separator) free(ctx->mwe_tokens_separator); free(ctx); } @@ -30,7 +30,6 @@ context *context_new(void) ctx->verbose = 0; ctx->debug_mode = 0; ctx->program_name = NULL; - ctx->fplm_filename = NULL; ctx->mcd_filename = NULL; ctx->mcd_struct = NULL; ctx->language = strdup("fr"); @@ -40,6 +39,7 @@ context *context_new(void) ctx->output_filename = NULL; ctx->mwe_filename = NULL; ctx->mwe_tokens_dico_filename = NULL; + ctx->mwe_tokens_separator = strdup(" "); return ctx; } @@ -60,10 +60,6 @@ void context_form_column_help_message(context *ctx){ fprintf(stderr, "\t-F --form_column <int> : column containing form\n"); } -void context_fplm_help_message(context *ctx){ - fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n"); -} - void context_mcd_help_message(context *ctx){ fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n"); } @@ -76,6 +72,18 @@ void context_maca_data_path_help_message(context *ctx){ fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n"); } +void context_mwe_token_separator_help_message(context *ctx){ + fprintf(stderr, "\t-s --mwe_sep <string> : multi word expression tokens separator (default is space character)\n"); +} + +void context_mwe_filename_help_message(context *ctx){ + fprintf(stderr, "\t-M --mwe 
<filename> : multi word expression file\n"); +} + +void context_vocab_help_message(context *ctx){ + fprintf(stderr, "\t-V --vocab <filename> : multi word expression tokens vocabulary file\n"); +} + context *context_read_options(int argc, char *argv[]) { int c; @@ -84,7 +92,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[12] = + static struct option long_options[13] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -93,16 +101,16 @@ context *context_read_options(int argc, char *argv[]) {"output", required_argument, 0, 'o'}, {"mcd", required_argument, 0, 'C'}, {"language", required_argument, 0, 'L'}, - {"fplm", required_argument, 0, 'f'}, {"form_column", required_argument, 0, 'F'}, {"maca_data_path", required_argument, 0, 'D'}, {"mwe", required_argument, 0, 'M'}, - {"vocab", required_argument, 0, 'V'} + {"vocab", required_argument, 0, 'V'}, + {"mwe_sep", required_argument, 0, 's'} }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "hvdi:o:f:C:L:M:F:D:V:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hvdi:o:C:L:M:F:D:V:s:", long_options, &option_index)) != -1){ switch (c) { case 'd': @@ -117,9 +125,6 @@ context *context_read_options(int argc, char *argv[]) case 'F': ctx->form_column = atoi(optarg) - 1; break; - case 'f': - ctx->fplm_filename = strdup(optarg); - break; case 'i': ctx->input_filename = strdup(optarg); break; @@ -141,6 +146,9 @@ context *context_read_options(int argc, char *argv[]) case 'M': ctx->mwe_filename = strdup(optarg); break; + case 's': + ctx->mwe_tokens_separator = strdup(optarg); + break; } } diff --git a/maca_lexer/src/context.h b/maca_lexer/src/context.h index a1898faf3588e159738c450d898f69e6e8999596..376f1b6cea8e28dd7ddf78e2d7ca19db0974d273 100644 --- a/maca_lexer/src/context.h +++ b/maca_lexer/src/context.h @@ -4,7 +4,7 @@ #include "mcd.h" #include <stdlib.h> -#define 
DEFAULT_MWE_TOKENS_DICO_FILENAME "mwe_tokens" +#define DEFAULT_MWE_TOKENS_DICO_FILENAME "d_tokens.dico" #define DEFAULT_MWE_FILENAME "mwe" typedef struct { @@ -12,7 +12,6 @@ typedef struct { int verbose; int debug_mode; char *program_name; - char *fplm_filename; char *language; char *maca_data_path; char *mcd_filename; @@ -22,6 +21,7 @@ typedef struct { char *output_filename; char *mwe_filename; char *mwe_tokens_dico_filename; + char *mwe_tokens_separator; } context; context *context_new(void); @@ -31,7 +31,6 @@ context *context_read_options(int argc, char *argv[]); void context_general_help_message(context *ctx); void context_conll_help_message(context *ctx); void context_language_help_message(context *ctx); -void context_fplm_help_message(context *ctx); void context_maca_data_path_help_message(context *ctx); void context_mcd_help_message(context *ctx); void context_form_column_help_message(context *ctx); diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index 5308fac5d036510738c80a77457d555225ae456f..da78d6f4d9f7b292f7f794476bbbd2a5bd2bbeb3 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -7,6 +7,39 @@ #include"util.h" #include"context.h" + +void maca_lexer_help_message(context *ctx) +{ + context_general_help_message(ctx); + fprintf(stderr, "INPUT\n"); + context_input_help_message(ctx); + context_mcd_help_message(ctx); + context_language_help_message(ctx); + context_maca_data_path_help_message(ctx); + context_form_column_help_message(ctx); + context_mwe_token_separator_help_message(ctx); + context_mwe_filename_help_message(ctx); + context_vocab_help_message(ctx); +} + + +void maca_lexer_check_options(context *ctx){ + if(ctx->help){ + maca_lexer_help_message(ctx); + exit(1); + } +} + +int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_index) +{ + int i; + for(i=path_index - 1; i >= 0; i--){ + if(mwe_trie->states[states_array[i]]->is_accept) return i; + } + return -1; +} + + int main(int 
argc, char *argv[]) { char buffer[10000]; @@ -26,7 +59,7 @@ int main(int argc, char *argv[]) int i; ctx = context_read_options(argc, argv); - /* maca_lexer_check_options(ctx); */ + maca_lexer_check_options(ctx); if(ctx->form_column != -1) @@ -39,7 +72,10 @@ int main(int argc, char *argv[]) else f = myfopen(ctx->input_filename, "r"); + if(ctx->verbose) fprintf(stderr, "reading mwe list from file : %s\n", ctx->mwe_filename); mwe_trie = trie_build_from_collection(ctx->mwe_filename); + + if(ctx->verbose) fprintf(stderr, "reading mwe tokens vocabulary from file : %s\n", ctx->mwe_tokens_dico_filename); d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5); /* trie_print(stdout, mwe_trie); */ @@ -58,7 +94,8 @@ int main(int argc, char *argv[]) symbols_array[path_index] = form_code; states_array[path_index] = (form_code == -1)? 0 : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); - /* printf("buffer = %s ", buffer); + /* + printf("buffer = %s ", buffer); printf("code = %d\n", form_code); @@ -72,31 +109,31 @@ int main(int argc, char *argv[]) printf("\n"); */ if(states_array[path_index] == 0){ /* in initial state of trie */ - if(path_index == 0){ /* nothing has been recognized */ + /* nothing has been recognized */ + if(path_index == 0) printf("%s\n", buffer); - } - else{ - if(mwe_trie->states[states_array[path_index - 1]]->is_accept){ - for(i=0; i < path_index; i++){ - if(i > 0) printf("#"); - printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); - } - printf("\n"); + else{ /* there is something in the path */ + int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index); + /* all tokens in path s.t. 
0 <= token_index <= accept_state_index form an mwe */ + for(i=0; i <= accept_state_index; i++){ + if(i > 0) printf("%s", ctx->mwe_tokens_separator); + printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); } - else{ - for(i=0; i < path_index; i++){ - printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - } + if(accept_state_index != -1) printf("\n"); + /* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */ + for(i = accept_state_index + 1; i < path_index; i++){ + printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); } + /* do not forget to print the current token */ printf("%s\n", buffer); + path_index = 0; } - path_index = 0; } - else{ + /* not in state 0 of trie */ + else{ path_index++; } } return 0; } -