diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index 03eceab8d59f86c227efb426b93127a81f982f1a..d80e7cf332474e9c2a11ba25c80f7bf1dfd2b345 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -103,19 +103,20 @@ int main(int argc, char *argv[]) /* trie_print(stdout, mwe_trie); */ while(fgets(buffer, 10000, f)){ - /* look for a valid word */ + /* look for a valid form */ if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){ printf("\n"); continue; } buffer[strlen(buffer)-1] = '\0'; - /* look for code of word read */ + /* look for code of form read */ form_code = dico_string2int(d_mwe_tokens, buffer); if(form_code == -1){ + /* if form has no code, it cannot be part of a mwe, print the potential mwe discovered so far */ print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); path_index = 0; - /* print the current token */ + /* print the current form */ if(ctx->paste) printf("%s\n", buffer); else @@ -123,82 +124,42 @@ int main(int argc, char *argv[]) continue; } + /* look for the next state in the trie */ next_state = trie_destination_state(mwe_trie, (path_index == 0) ? 0: states_array[path_index - 1], form_code); + if(next_state != 0){ + /* the path is growing */ symbols_array[path_index] = form_code; states_array[path_index] = next_state; path_index++; continue; } - + /* print the potential mwe discovered so far */ print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + if(path_index != 0) + /* if there was a path that aborted, see if there is a valid transition from state 0 with form */ next_state = trie_destination_state(mwe_trie, 0, form_code); + path_index = 0; if(next_state){ + /* such a transition exists */ symbols_array[path_index] = form_code; states_array[path_index] = next_state; path_index++; continue; } + /* such a transition does not exist, just print the form */ if(ctx->paste) printf("%s\n", buffer); else printf("%s\t1\n", buffer); - - -#if 0 - - symbols_array[path_index] = form_code; - states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */ - : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */ - - /* printf("buffer = %s ", buffer); - printf("code = %d\n", form_code); - - printf("states array :"); - for(i=0; i <= path_index; i++){ - printf("%d ", states_array[i]); - } - printf("\n"); - printf("symbols array :"); - for(i=0; i <= path_index; i++){ - printf("%d ", symbols_array[i]); - } - printf("\n**********************\n"); - */ - - if(states_array[path_index] == 0){ /* in initial state of trie */ - /* nothing has been recognized, just print current word */ - if(path_index == 0) - if(ctx->paste) - printf("%s\n", buffer); - else - printf("%s\t1\n", buffer); - else{ /* there is something in the path */ - print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); - path_index = 0; - - states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */ - : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */ - - - /* do not forget to print the current token */ - if(ctx->paste) - printf("%s\n", buffer); - else - printf("%s\t1\n", buffer); - } - } - /* not in state 0 of trie we are processing tokens of a potential mwe */ - else{ - path_index++; - } - #endif } - if(path_index != 0){ /* there is something in states array */ + if(path_index != 0){ + /* there is something in states array */ + /* print the potential mwe discovered so far */ print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); path_index = 0; }