From ce4645960e291b77553f30fe856954a0c46055bb Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Thu, 23 Mar 2017 12:01:27 +0100 Subject: [PATCH] fixed few bugs in maca_tokenizer --- maca_common/src/trie.c | 4 +- maca_lexer/src/maca_lexer.c | 132 ++++++++++++++++++++++++++---------- 2 files changed, 97 insertions(+), 39 deletions(-) diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c index 150bdae..b25bca3 100644 --- a/maca_common/src/trie.c +++ b/maca_common/src/trie.c @@ -142,8 +142,8 @@ int trie_lookup(trie *t, int *word, int length) break; } } - if(trans == NULL) - return 0; + if(trans == NULL) + return 0; } return t->states[current_state]->is_accept; } diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index b096645..03eceab 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in return -1; } +void print_states_array(char *buffer, context *ctx, trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_index) +{ + int i; + if(path_index == 0) return; + int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index); + /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */ + for(i=0; i <= accept_state_index; i++){ + if(ctx->paste){ + if(i > 0) printf("%s", ctx->mwe_tokens_separator); + printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); + } + else{ + if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + } + } + if(ctx->paste) + if(accept_state_index != -1) printf("\n"); + /* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe, they are just printed */ + for(i = accept_state_index + 1; i < path_index; i++){ + if(ctx->paste) + printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + else + printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + } +} + int main(int argc, char *argv[]) { @@ -48,10 +75,10 @@ int main(int argc, char *argv[]) FILE *f = NULL; trie *mwe_trie; dico *d_mwe_tokens = NULL; - int states_array[100]; + int states_array[100]; /* an array in which we store the states we have traversed in the trie */ int symbols_array[100]; int path_index = 0; - int i; + int next_state; ctx = context_read_options(argc, argv); maca_lexer_check_options(ctx); @@ -74,77 +101,108 @@ int main(int argc, char *argv[]) d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5); /* trie_print(stdout, mwe_trie); */ - - /* look for a valid word */ while(fgets(buffer, 10000, f)){ - if(feof(f)) return 0; /* no more words to read */ + /* look for a valid word */ if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){ printf("\n"); continue; } - buffer[strlen(buffer)-1] = '\0'; + /* look for code of word read */ form_code = dico_string2int(d_mwe_tokens, buffer); + + if(form_code == -1){ + print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + path_index = 0; + /* print the current token */ + if(ctx->paste) + printf("%s\n", buffer); + else + printf("%s\t1\n", buffer); + continue; + } + + next_state = trie_destination_state(mwe_trie, (path_index == 0) ? 0: states_array[path_index - 1], form_code); + if(next_state != 0){ + symbols_array[path_index] = form_code; + states_array[path_index] = next_state; + path_index++; + continue; + } + + print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + if(path_index != 0) + next_state = trie_destination_state(mwe_trie, 0, form_code); + path_index = 0; + if(next_state){ + symbols_array[path_index] = form_code; + states_array[path_index] = next_state; + path_index++; + continue; + } + + if(ctx->paste) + printf("%s\n", buffer); + else + printf("%s\t1\n", buffer); + + +#if 0 + symbols_array[path_index] = form_code; - states_array[path_index] = (form_code == -1)? 0 - : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); - /* - printf("buffer = %s ", buffer); - printf("code = %d\n", form_code); + states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */ + : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */ + /* printf("buffer = %s ", buffer); + printf("code = %d\n", form_code); + printf("states array :"); for(i=0; i <= path_index; i++){ printf("%d ", states_array[i]); } printf("\n"); + printf("symbols array :"); for(i=0; i <= path_index; i++){ printf("%d ", symbols_array[i]); } - printf("\n"); + printf("\n**********************\n"); */ + if(states_array[path_index] == 0){ /* in initial state of trie */ - /* nothing has been recognized */ + /* nothing has been recognized, just print current word */ if(path_index == 0) if(ctx->paste) - printf("%s\n", buffer); + printf("%s\n", buffer); else printf("%s\t1\n", buffer); else{ /* there is something in the path */ - int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index); - /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */ - for(i=0; i <= accept_state_index; i++){ - if(ctx->paste){ - if(i > 0) printf("%s", ctx->mwe_tokens_separator); - printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); - } - else{ - if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - } - } - if(ctx->paste) - if(accept_state_index != -1) printf("\n"); - /* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */ - for(i = accept_state_index + 1; i < path_index; i++){ - if(ctx->paste) - printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - else - printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - } + print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + path_index = 0; + + states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */ + : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */ + + /* do not forget to print the current token */ if(ctx->paste) printf("%s\n", buffer); else printf("%s\t1\n", buffer); - path_index = 0; } } /* not in state 0 of trie we are processing tokens of a potential mwe */ else{ path_index++; } - + #endif + } + + if(path_index != 0){ /* there is something in states array */ + print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + path_index = 0; } + return 0; } + -- GitLab