Commit ce464596 authored by Alexis Nasr
Browse files

Fixed a few bugs in maca_tokenizer

parent d8392458
......@@ -142,8 +142,8 @@ int trie_lookup(trie *t, int *word, int length)
break;
}
}
if(trans == NULL)
return 0;
if(trans == NULL)
return 0;
}
return t->states[current_state]->is_accept;
}
......
......@@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in
return -1;
}
/*
 * Write to stdout the tokens stored in the first path_index entries of
 * symbols_array, then return. Does nothing when path_index is 0.
 *
 * look_for_accept_state_in_path() is asked for the index of the accepting
 * state in the path; -1 is treated as "no multi-word expression found".
 * Tokens at indices 0..that index form a multi-word expression (mwe), the
 * remaining ones (up to path_index - 1) are emitted as ordinary tokens.
 *
 * Output format depends on ctx->paste:
 *   - set:   the mwe tokens are joined on one line with
 *            ctx->mwe_tokens_separator, every other token gets its own line;
 *   - unset: one token per line followed by a tab and a flag, 1 for the first
 *            token of the mwe and for every ordinary token, 0 for the
 *            following tokens of the mwe.
 *
 * buffer is not referenced by this function.
 * The caller is responsible for resetting path_index afterwards.
 */
void print_states_array(char *buffer, context *ctx, trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_index)
{
  int last_mwe_token;
  int k;

  /* empty path: nothing to flush */
  if(path_index == 0)
    return;

  last_mwe_token = look_for_accept_state_in_path(mwe_trie, states_array, path_index);

  if(ctx->paste){
    /* paste mode: tokens of the mwe are glued together on a single line */
    for(k = 0; k <= last_mwe_token; k++){
      if(k != 0)
        printf("%s", ctx->mwe_tokens_separator);
      printf("%s", dico_int2string(d_mwe_tokens, symbols_array[k]));
    }
    /* close the mwe line only if an mwe was actually written */
    if(last_mwe_token != -1)
      printf("\n");
    /* leftover tokens of the path, one per line */
    for(k = last_mwe_token + 1; k < path_index; k++)
      printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
    return;
  }

  /* column mode: the first token of the mwe is flagged 1, the next ones 0 */
  for(k = 0; k <= last_mwe_token; k++){
    if(k == 0)
      printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
    else
      printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
  }
  /* leftover tokens of the path are each flagged 1 */
  for(k = last_mwe_token + 1; k < path_index; k++)
    printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
}
int main(int argc, char *argv[])
{
......@@ -48,10 +75,10 @@ int main(int argc, char *argv[])
FILE *f = NULL;
trie *mwe_trie;
dico *d_mwe_tokens = NULL;
int states_array[100];
int states_array[100]; /* an array in which we store the states we have traversed in the trie */
int symbols_array[100];
int path_index = 0;
int i;
int next_state;
ctx = context_read_options(argc, argv);
maca_lexer_check_options(ctx);
......@@ -74,77 +101,108 @@ int main(int argc, char *argv[])
d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5);
/* trie_print(stdout, mwe_trie); */
/* look for a valid word */
while(fgets(buffer, 10000, f)){
if(feof(f)) return 0; /* no more words to read */
/* look for a valid word */
if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){
printf("\n");
continue;
}
buffer[strlen(buffer)-1] = '\0';
/* look for code of word read */
form_code = dico_string2int(d_mwe_tokens, buffer);
if(form_code == -1){
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
/* print the current token */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
continue;
}
next_state = trie_destination_state(mwe_trie, (path_index == 0) ? 0: states_array[path_index - 1], form_code);
if(next_state != 0){
symbols_array[path_index] = form_code;
states_array[path_index] = next_state;
path_index++;
continue;
}
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
if(path_index != 0)
next_state = trie_destination_state(mwe_trie, 0, form_code);
path_index = 0;
if(next_state){
symbols_array[path_index] = form_code;
states_array[path_index] = next_state;
path_index++;
continue;
}
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
#if 0
symbols_array[path_index] = form_code;
states_array[path_index] = (form_code == -1)? 0
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code);
/*
printf("buffer = %s ", buffer);
printf("code = %d\n", form_code);
states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
/* printf("buffer = %s ", buffer);
printf("code = %d\n", form_code);
printf("states array :");
for(i=0; i <= path_index; i++){
printf("%d ", states_array[i]);
}
printf("\n");
printf("symbols array :");
for(i=0; i <= path_index; i++){
printf("%d ", symbols_array[i]);
}
printf("\n");
printf("\n**********************\n");
*/
if(states_array[path_index] == 0){ /* in initial state of trie */
/* nothing has been recognized */
/* nothing has been recognized, just print current word */
if(path_index == 0)
if(ctx->paste)
printf("%s\n", buffer);
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
else{ /* there is something in the path */
int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index);
/* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
for(i=0; i <= accept_state_index; i++){
if(ctx->paste){
if(i > 0) printf("%s", ctx->mwe_tokens_separator);
printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
else{
if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
}
if(ctx->paste)
if(accept_state_index != -1) printf("\n");
/* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
for(i = accept_state_index + 1; i < path_index; i++){
if(ctx->paste)
printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else
printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
/* do not forget to print the current token */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
path_index = 0;
}
}
/* not in state 0 of trie we are processing tokens of a potential mwe */
else{
path_index++;
}
#endif
}
if(path_index != 0){ /* there is something in states array */
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
}
return 0;
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment