Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Alexis Nasr
macaon2
Commits
ce464596
Commit
ce464596
authored
Mar 23, 2017
by
Alexis Nasr
Browse files
fixed few bugs in maca_tokenizer
parent
d8392458
Changes
2
Hide whitespace changes
Inline
Side-by-side
maca_common/src/trie.c
View file @
ce464596
...
...
@@ -142,8 +142,8 @@ int trie_lookup(trie *t, int *word, int length)
break
;
}
}
if
(
trans
==
NULL
)
return
0
;
if
(
trans
==
NULL
)
return
0
;
}
return
t
->
states
[
current_state
]
->
is_accept
;
}
...
...
maca_lexer/src/maca_lexer.c
View file @
ce464596
...
...
@@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in
return
-
1
;
}
/*
 * Print the tokens stored in the current trie path, then return.
 *
 * The path holds path_index entries: states_array[k] is the trie state
 * reached after reading the token whose code is symbols_array[k].
 * look_for_accept_state_in_path() gives the index of the token that ends
 * the multi-word expression (mwe) found in the path; the value -1 is
 * handled here as "no mwe in the path".
 *
 * Output goes to stdout:
 *  - paste mode (ctx->paste set): the mwe tokens are written on a single
 *    line, joined with ctx->mwe_tokens_separator; every remaining token is
 *    written alone on its own line.
 *  - otherwise: one token per line followed by a tab and a flag, 1 for the
 *    first token of the mwe and for every token outside the mwe, 0 for the
 *    other tokens of the mwe.
 *
 * buffer is accepted for the callers' convenience but is not read here.
 * Nothing is printed when path_index is 0. The caller is responsible for
 * resetting path_index afterwards.
 */
void print_states_array(char *buffer, context *ctx, trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_index)
{
  int mwe_end; /* index of the last token of the mwe, -1 if there is none */
  int tok;     /* index of the token being printed */

  /* empty path: nothing to print */
  if (path_index == 0)
    return;

  mwe_end = look_for_accept_state_in_path(mwe_trie, states_array, path_index);

  /* tokens 0 .. mwe_end form an mwe */
  tok = 0;
  while (tok <= mwe_end) {
    if (!ctx->paste) {
      /* only the first token of the mwe is flagged with 1 */
      if (tok != 0)
        printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[tok]));
      else
        printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[tok]));
    } else {
      /* the separator goes between tokens, never before the first one */
      if (tok > 0)
        printf("%s", ctx->mwe_tokens_separator);
      printf("%s", dico_int2string(d_mwe_tokens, symbols_array[tok]));
    }
    tok++;
  }

  /* in paste mode the mwe line is still open: close it if an mwe was printed */
  if (ctx->paste && mwe_end != -1)
    printf("\n");

  /* tokens mwe_end + 1 .. path_index - 1 are not part of an mwe: print them as is */
  for (tok = mwe_end + 1; tok < path_index; tok++) {
    if (!ctx->paste)
      printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[tok]));
    else
      printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[tok]));
  }
}
int
main
(
int
argc
,
char
*
argv
[])
{
...
...
@@ -48,10 +75,10 @@ int main(int argc, char *argv[])
FILE
*
f
=
NULL
;
trie
*
mwe_trie
;
dico
*
d_mwe_tokens
=
NULL
;
int
states_array
[
100
];
int
states_array
[
100
];
/* an array in which we store the states we have traversed in the trie */
int
symbols_array
[
100
];
int
path_index
=
0
;
int
i
;
int
next_state
;
ctx
=
context_read_options
(
argc
,
argv
);
maca_lexer_check_options
(
ctx
);
...
...
@@ -74,77 +101,108 @@ int main(int argc, char *argv[])
d_mwe_tokens
=
dico_read
(
ctx
->
mwe_tokens_dico_filename
,
0
.
5
);
/* trie_print(stdout, mwe_trie); */
/* look for a valid word */
while
(
fgets
(
buffer
,
10000
,
f
)){
if
(
feof
(
f
))
return
0
;
/* no more words to read */
/* look for a valid word */
if
((
buffer
[
0
]
==
'\n'
)
||
(
buffer
[
0
]
==
' '
)
||
(
buffer
[
0
]
==
'\t'
)){
printf
(
"
\n
"
);
continue
;
}
buffer
[
strlen
(
buffer
)
-
1
]
=
'\0'
;
/* look for code of word read */
form_code
=
dico_string2int
(
d_mwe_tokens
,
buffer
);
if
(
form_code
==
-
1
){
print_states_array
(
buffer
,
ctx
,
mwe_trie
,
d_mwe_tokens
,
states_array
,
symbols_array
,
path_index
);
path_index
=
0
;
/* print the current token */
if
(
ctx
->
paste
)
printf
(
"%s
\n
"
,
buffer
);
else
printf
(
"%s
\t
1
\n
"
,
buffer
);
continue
;
}
next_state
=
trie_destination_state
(
mwe_trie
,
(
path_index
==
0
)
?
0
:
states_array
[
path_index
-
1
],
form_code
);
if
(
next_state
!=
0
){
symbols_array
[
path_index
]
=
form_code
;
states_array
[
path_index
]
=
next_state
;
path_index
++
;
continue
;
}
print_states_array
(
buffer
,
ctx
,
mwe_trie
,
d_mwe_tokens
,
states_array
,
symbols_array
,
path_index
);
if
(
path_index
!=
0
)
next_state
=
trie_destination_state
(
mwe_trie
,
0
,
form_code
);
path_index
=
0
;
if
(
next_state
){
symbols_array
[
path_index
]
=
form_code
;
states_array
[
path_index
]
=
next_state
;
path_index
++
;
continue
;
}
if
(
ctx
->
paste
)
printf
(
"%s
\n
"
,
buffer
);
else
printf
(
"%s
\t
1
\n
"
,
buffer
);
#if 0
symbols_array[path_index] = form_code;
states_array
[
path_index
]
=
(
form_code
==
-
1
)
?
0
:
trie_destination_state
(
mwe_trie
,
(
path_index
==
0
)
?
0
:
states_array
[
path_index
-
1
],
form_code
);
/*
printf("buffer = %s ", buffer);
printf("code = %d\n", form_code);
states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
/* printf("buffer = %s ", buffer);
printf("code = %d\n", form_code);
printf("states array :");
for(i=0; i <= path_index; i++){
printf("%d ", states_array[i]);
}
printf("\n");
printf("symbols array :");
for(i=0; i <= path_index; i++){
printf("%d ", symbols_array[i]);
}
printf("\n");
printf("\n
**********************\n
");
*/
if(states_array[path_index] == 0){ /* in initial state of trie */
/* nothing has been recognized */
/* nothing has been recognized
, just print current word
*/
if(path_index == 0)
if(ctx->paste)
printf
(
"%s
\n
"
,
buffer
);
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
else{ /* there is something in the path */
int
accept_state_index
=
look_for_accept_state_in_path
(
mwe_trie
,
states_array
,
path_index
);
/* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
for
(
i
=
0
;
i
<=
accept_state_index
;
i
++
){
if
(
ctx
->
paste
){
if
(
i
>
0
)
printf
(
"%s"
,
ctx
->
mwe_tokens_separator
);
printf
(
"%s"
,
dico_int2string
(
d_mwe_tokens
,
symbols_array
[
i
]));
}
else
{
if
(
i
==
0
)
printf
(
"%s
\t
1
\n
"
,
dico_int2string
(
d_mwe_tokens
,
symbols_array
[
i
]));
else
printf
(
"%s
\t
0
\n
"
,
dico_int2string
(
d_mwe_tokens
,
symbols_array
[
i
]));
}
}
if
(
ctx
->
paste
)
if
(
accept_state_index
!=
-
1
)
printf
(
"
\n
"
);
/* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
for
(
i
=
accept_state_index
+
1
;
i
<
path_index
;
i
++
){
if
(
ctx
->
paste
)
printf
(
"%s
\n
"
,
dico_int2string
(
d_mwe_tokens
,
symbols_array
[
i
]));
else
printf
(
"%s
\t
1
\n
"
,
dico_int2string
(
d_mwe_tokens
,
symbols_array
[
i
]));
}
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
states_array[path_index] = (form_code == -1)? 0 /* if word has invalid code, go to initial state */
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code); /* otherwise try to move forward in the trie */
/* do not forget to print the current token */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
path_index
=
0
;
}
}
/* not in state 0 of trie we are processing tokens of a potential mwe */
else{
path_index++;
}
#endif
}
if
(
path_index
!=
0
){
/* there is something in states array */
print_states_array
(
buffer
,
ctx
,
mwe_trie
,
d_mwe_tokens
,
states_array
,
symbols_array
,
path_index
);
path_index
=
0
;
}
return
0
;
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment