Skip to content
Snippets Groups Projects
Commit ce464596 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

fixed few bugs in maca_tokenizer

parent d8392458
Branches
No related tags found
No related merge requests found
...@@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in ...@@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in
return -1; return -1;
} }
/*
 * Print on stdout the tokens stored in symbols_array[0 .. path_index - 1].
 *
 * The index returned by look_for_accept_state_in_path() splits the path:
 *   - tokens 0 .. accept index are printed as one multi-word expression:
 *       paste mode : all on one line, joined with ctx->mwe_tokens_separator,
 *                    followed by a newline when the accept index is not -1;
 *       otherwise  : one token per line, the first tagged "\t1" and the
 *                    following ones tagged "\t0";
 *   - tokens after the accept index are printed one per line (tagged "\t1"
 *     when not in paste mode).
 *
 * Nothing is printed when path_index is 0. The buffer argument is not read
 * by this function; mwe_trie and states_array are only forwarded to
 * look_for_accept_state_in_path().
 */
void print_states_array(char *buffer, context *ctx, trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_index)
{
  int pos;
  int last_mwe_pos;

  /* empty path: nothing to flush */
  if(path_index == 0) return;

  last_mwe_pos = look_for_accept_state_in_path(mwe_trie, states_array, path_index);

  if(ctx->paste){
    /* tokens of the mwe are glued together on a single line */
    for(pos = 0; pos <= last_mwe_pos; pos++){
      if(pos > 0) printf("%s", ctx->mwe_tokens_separator);
      printf("%s", dico_int2string(d_mwe_tokens, symbols_array[pos]));
    }
    /* terminate the glued line only if an mwe was actually found */
    if(last_mwe_pos != -1) printf("\n");

    /* leftover tokens of the path are plain tokens, one per line */
    for(pos = last_mwe_pos + 1; pos < path_index; pos++){
      printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[pos]));
    }
  }
  else{
    /* one token per line: 1 marks the first token of the mwe, 0 the others */
    for(pos = 0; pos <= last_mwe_pos; pos++){
      if(pos == 0){
        printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[pos]));
      }
      else{
        printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[pos]));
      }
    }

    /* leftover tokens of the path each get the mark 1 */
    for(pos = last_mwe_pos + 1; pos < path_index; pos++){
      printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[pos]));
    }
  }
}
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
...@@ -48,10 +75,10 @@ int main(int argc, char *argv[]) ...@@ -48,10 +75,10 @@ int main(int argc, char *argv[])
FILE *f = NULL; FILE *f = NULL;
trie *mwe_trie; trie *mwe_trie;
dico *d_mwe_tokens = NULL; dico *d_mwe_tokens = NULL;
int states_array[100]; int states_array[100]; /* an array in which we store the states we have traversed in the trie */
int symbols_array[100]; int symbols_array[100];
int path_index = 0; int path_index = 0;
int i; int next_state;
ctx = context_read_options(argc, argv); ctx = context_read_options(argc, argv);
maca_lexer_check_options(ctx); maca_lexer_check_options(ctx);
...@@ -75,76 +102,107 @@ int main(int argc, char *argv[]) ...@@ -75,76 +102,107 @@ int main(int argc, char *argv[])
/* trie_print(stdout, mwe_trie); */ /* trie_print(stdout, mwe_trie); */
/* look for a valid word */
while(fgets(buffer, 10000, f)){ while(fgets(buffer, 10000, f)){
if(feof(f)) return 0; /* no more words to read */ /* look for a valid word */
if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){ if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){
printf("\n"); printf("\n");
continue; continue;
} }
buffer[strlen(buffer)-1] = '\0'; buffer[strlen(buffer)-1] = '\0';
/* look for code of word read */
form_code = dico_string2int(d_mwe_tokens, buffer); form_code = dico_string2int(d_mwe_tokens, buffer);
if(form_code == -1){
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
/* print the current token */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
continue;
}
next_state = trie_destination_state(mwe_trie, (path_index == 0) ? 0: states_array[path_index - 1], form_code);
if(next_state != 0){
symbols_array[path_index] = form_code; symbols_array[path_index] = form_code;
states_array[path_index] = (form_code == -1)? 0 states_array[path_index] = next_state;
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); path_index++;
/* continue;
printf("buffer = %s ", buffer); }
printf("code = %d\n", form_code);
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
if(path_index != 0)
next_state = trie_destination_state(mwe_trie, 0, form_code);
path_index = 0;
if(next_state){
symbols_array[path_index] = form_code;
states_array[path_index] = next_state;
path_index++;
continue;
}
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
#if 0
symbols_array[path_index] = form_code;
states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
/* printf("buffer = %s ", buffer);
printf("code = %d\n", form_code);
printf("states array :");
for(i=0; i <= path_index; i++){ for(i=0; i <= path_index; i++){
printf("%d ", states_array[i]); printf("%d ", states_array[i]);
} }
printf("\n"); printf("\n");
printf("symbols array :");
for(i=0; i <= path_index; i++){ for(i=0; i <= path_index; i++){
printf("%d ", symbols_array[i]); printf("%d ", symbols_array[i]);
} }
printf("\n"); printf("\n**********************\n");
*/ */
if(states_array[path_index] == 0){ /* in initial state of trie */ if(states_array[path_index] == 0){ /* in initial state of trie */
/* nothing has been recognized */ /* nothing has been recognized, just print current word */
if(path_index == 0) if(path_index == 0)
if(ctx->paste) if(ctx->paste)
printf("%s\n", buffer); printf("%s\n", buffer);
else else
printf("%s\t1\n", buffer); printf("%s\t1\n", buffer);
else{ /* there is something in the path */ else{ /* there is something in the path */
int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index); print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
/* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */ path_index = 0;
for(i=0; i <= accept_state_index; i++){
if(ctx->paste){ states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
if(i > 0) printf("%s", ctx->mwe_tokens_separator); : trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
else{
if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
}
if(ctx->paste)
if(accept_state_index != -1) printf("\n");
/* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
for(i = accept_state_index + 1; i < path_index; i++){
if(ctx->paste)
printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else
printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
/* do not forget to print the current token */ /* do not forget to print the current token */
if(ctx->paste) if(ctx->paste)
printf("%s\n", buffer); printf("%s\n", buffer);
else else
printf("%s\t1\n", buffer); printf("%s\t1\n", buffer);
path_index = 0;
} }
} }
/* not in state 0 of trie we are processing tokens of a potential mwe */ /* not in state 0 of trie we are processing tokens of a potential mwe */
else{ else{
path_index++; path_index++;
} }
#endif
}
if(path_index != 0){ /* there is something in states array */
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
} }
return 0; return 0;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment