Commit b0825cf4 authored by Jeremy Auguste
Browse files

Merge branch 'master' into ssrnn

parents 89c594c6 d5460fe9
......@@ -142,8 +142,8 @@ int trie_lookup(trie *t, int *word, int length)
break;
}
}
if(trans == NULL)
return 0;
if(trans == NULL)
return 0;
}
return t->states[current_state]->is_accept;
}
......
......@@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in
return -1;
}
/*
 * Prints the tokens recorded in symbols_array[0 .. path_index-1] on stdout.
 *
 * look_for_accept_state_in_path() is asked for the index of the last token
 * that closes a multi-word expression (mwe) in the path; -1 means none.
 *   - tokens 0 .. that index are printed as one mwe,
 *   - the remaining tokens up to path_index-1 are printed one per line.
 *
 * When ctx->paste is set, the tokens of the mwe are written on a single line,
 * joined with ctx->mwe_tokens_separator. Otherwise every token gets its own
 * line followed by a tab and a flag: 1 for the first token of the mwe and for
 * every token outside the mwe, 0 for the other tokens of the mwe.
 *
 * Nothing is printed when path_index is 0. The buffer parameter is not used.
 */
void print_states_array(char *buffer, context *ctx, trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_index)
{
  int last_mwe_token;
  int k;

  if(path_index == 0) return;

  last_mwe_token = look_for_accept_state_in_path(mwe_trie, states_array, path_index);

  if(ctx->paste){
    /* mwe tokens glued together on one line */
    for(k = 0; k <= last_mwe_token; k++){
      if(k > 0) printf("%s", ctx->mwe_tokens_separator);
      printf("%s", dico_int2string(d_mwe_tokens, symbols_array[k]));
    }
    /* close the mwe line only if an mwe was actually printed */
    if(last_mwe_token != -1) printf("\n");

    /* leftover tokens of the path, one per line */
    for(k = last_mwe_token + 1; k < path_index; k++){
      printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
    }
    return;
  }

  /* one token per line: the first token of the mwe is flagged 1, the others 0 */
  for(k = 0; k <= last_mwe_token; k++){
    if(k == 0)
      printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
    else
      printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
  }

  /* leftover tokens of the path are each flagged 1 */
  for(k = last_mwe_token + 1; k < path_index; k++){
    printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[k]));
  }
}
int main(int argc, char *argv[])
{
......@@ -48,10 +75,10 @@ int main(int argc, char *argv[])
FILE *f = NULL;
trie *mwe_trie;
dico *d_mwe_tokens = NULL;
int states_array[100];
int states_array[100]; /* an array in which we store the states we have traversed in the trie */
int symbols_array[100];
int path_index = 0;
int i;
int next_state;
ctx = context_read_options(argc, argv);
maca_lexer_check_options(ctx);
......@@ -74,77 +101,69 @@ int main(int argc, char *argv[])
d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5);
/* trie_print(stdout, mwe_trie); */
/* look for a valid word */
while(fgets(buffer, 10000, f)){
if(feof(f)) return 0; /* no more words to read */
/* look for a valid form */
if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){
printf("\n");
continue;
}
buffer[strlen(buffer)-1] = '\0';
/* look for code of form read */
form_code = dico_string2int(d_mwe_tokens, buffer);
symbols_array[path_index] = form_code;
states_array[path_index] = (form_code == -1)? 0
: trie_destination_state(mwe_trie, (path_index == 0) ? 0 : states_array[path_index - 1], form_code);
/*
printf("buffer = %s ", buffer);
printf("code = %d\n", form_code);
for(i=0; i <= path_index; i++){
printf("%d ", states_array[i]);
}
printf("\n");
for(i=0; i <= path_index; i++){
printf("%d ", symbols_array[i]);
if(form_code == -1){
/* if form has no code, it cannot be part of a mwe, print the potential mwe discovered so far */
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
/* print the current form */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
continue;
}
printf("\n");
*/
if(states_array[path_index] == 0){ /* in initial state of trie */
/* nothing has been recognized */
if(path_index == 0)
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
else{ /* there is something in the path */
int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index);
/* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */
for(i=0; i <= accept_state_index; i++){
if(ctx->paste){
if(i > 0) printf("%s", ctx->mwe_tokens_separator);
printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
else{
if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
}
if(ctx->paste)
if(accept_state_index != -1) printf("\n");
/* all tokens in path s.t. accept_state_index < token_index < path_index do not form an mwe */
for(i = accept_state_index + 1; i < path_index; i++){
if(ctx->paste)
printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
else
printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i]));
}
/* do not forget to print the current token */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
path_index = 0;
}
/* look for the next state in the trie */
next_state = trie_destination_state(mwe_trie, (path_index == 0) ? 0: states_array[path_index - 1], form_code);
if(next_state != 0){
/* the path is growing */
symbols_array[path_index] = form_code;
states_array[path_index] = next_state;
path_index++;
continue;
}
/* not in state 0 of trie we are processing tokens of a potential mwe */
else{
/* print the potential mwe discovered so far */
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
if(path_index != 0)
/* if there was a path that aborted, see if there is a valid transition from state 0 with form */
next_state = trie_destination_state(mwe_trie, 0, form_code);
path_index = 0;
if(next_state){
/* such a transition exists */
symbols_array[path_index] = form_code;
states_array[path_index] = next_state;
path_index++;
continue;
}
/* such a transition does not exist, just print the form */
if(ctx->paste)
printf("%s\n", buffer);
else
printf("%s\t1\n", buffer);
}
if(path_index != 0){
/* there is something in states array */
/* print the potential mwe discovered so far */
print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index);
path_index = 0;
}
return 0;
}
......@@ -435,6 +435,26 @@ int ldep_s0r(void *input){
return -1;
}
/*
 * ldep_s0p: walks the word buffer leftwards, starting just before the word
 * returned by stack_s0(), and returns word_get_pos() of the first word whose
 * word_get_gov() value equals its distance to that word (presumably the
 * closest left dependent of s0 -- confirm against the *_r variants).
 *
 * Returns -1 when s0 is absent, when a word with word_get_sent_seg() == 1 is
 * reached first, or when no such word is found. Index 0 is never examined.
 */
int ldep_s0p(void *input){
  config *cfg = input;
  word *gov = stack_s0(config_get_stack((config *) cfg));
  word *cand;
  int pos;
  int offset;

  if(!gov) return -1;

  pos = word_get_index(gov) - 1;
  while(pos > 0){
    cand = word_buffer_get_word_n(config_get_buffer((config *) cfg), pos);
    /* stop the search at a word flagged as a sentence segmentation point */
    if(word_get_sent_seg(cand) == 1) return -1;
    offset = word_get_index(gov) - pos;
    if(word_get_gov(cand) == offset){
      return word_get_pos(cand);
    }
    pos--;
  }
  return -1;
}
int ldep_s1r(void *input){
config *c = input;
word *gov = stack_s1(config_get_stack((config *) c));
......@@ -455,6 +475,26 @@ int ldep_s1r(void *input){
return -1;
}
/*
 * ldep_s1p: walks the word buffer leftwards, starting just before the word
 * returned by stack_s1(), and returns word_get_pos() of the first word whose
 * word_get_gov() value equals its distance to that word (presumably the
 * closest left dependent of s1 -- confirm against the *_r variants).
 *
 * Returns -1 when s1 is absent, when a word with word_get_sent_seg() == 1 is
 * reached first, or when no such word is found. Index 0 is never examined.
 */
int ldep_s1p(void *input){
  config *cfg = input;
  word *gov = stack_s1(config_get_stack((config *) cfg));
  word *cand;
  int pos;
  int offset;

  if(!gov) return -1;

  pos = word_get_index(gov) - 1;
  while(pos > 0){
    cand = word_buffer_get_word_n(config_get_buffer((config *) cfg), pos);
    /* stop the search at a word flagged as a sentence segmentation point */
    if(word_get_sent_seg(cand) == 1) return -1;
    offset = word_get_index(gov) - pos;
    if(word_get_gov(cand) == offset){
      return word_get_pos(cand);
    }
    pos--;
  }
  return -1;
}
int ldep_b0r(void *input){
config *c = input;
word *gov = word_buffer_b0(config_get_buffer((config *) c));
......@@ -475,6 +515,26 @@ int ldep_b0r(void *input){
return -1;
}
/*
 * ldep_b0p: walks the word buffer leftwards, starting just before the word
 * returned by word_buffer_b0(), and returns word_get_pos() of the first word
 * whose word_get_gov() value equals its distance to that word (presumably the
 * closest left dependent of b0 -- confirm against the *_r variants).
 *
 * Returns -1 when b0 is absent, when a word with word_get_sent_seg() == 1 is
 * reached first, or when no such word is found. Index 0 is never examined.
 */
int ldep_b0p(void *input){
  config *cfg = input;
  word *gov = word_buffer_b0(config_get_buffer((config *) cfg));
  word *cand;
  int pos;
  int offset;

  if(!gov) return -1;

  pos = word_get_index(gov) - 1;
  while(pos > 0){
    cand = word_buffer_get_word_n(config_get_buffer((config *) cfg), pos);
    /* stop the search at a word flagged as a sentence segmentation point */
    if(word_get_sent_seg(cand) == 1) return -1;
    offset = word_get_index(gov) - pos;
    if(word_get_gov(cand) == offset){
      return word_get_pos(cand);
    }
    pos--;
  }
  return -1;
}
int rdep_s0r(void *input){
config *c = input;
word *gov = stack_s0(config_get_stack((config *) c));
......@@ -495,6 +555,46 @@ int rdep_s0r(void *input){
return -1;
}
/*
 * rdep_s0p: walks the word buffer rightwards, starting just after the word
 * returned by stack_s0(), and returns word_get_pos() of the first word whose
 * word_get_gov() value equals minus its distance to that word (presumably the
 * closest right dependent of s0 -- confirm against the *_r variants).
 *
 * Returns -1 when s0 is absent, when the scan reaches the index of b0
 * (word_buffer_b0()), when the end of the buffer is reached, or when no such
 * word is found.
 */
int rdep_s0p(void *input){
  config *cfg = input;
  word *gov = stack_s0(config_get_stack((config *) cfg));
  word *cand;
  int pos;
  int offset;

  if(!gov) return -1;

  pos = word_get_index(gov) + 1;
  while(pos < word_buffer_get_nbelem(config_get_buffer((config *) cfg))){
    cand = word_buffer_get_word_n(config_get_buffer((config *) cfg), pos);
    /* only words located strictly before b0 are considered */
    if(pos >= word_get_index(word_buffer_b0(config_get_buffer((config *) cfg)))) return -1;
    offset = pos - word_get_index(gov);
    if(word_get_gov(cand) == - offset){
      return word_get_pos(cand);
    }
    pos++;
  }
  return -1;
}
/*
 * rdep_s1p: walks the word buffer rightwards, starting just after the word
 * returned by stack_s1(), and returns word_get_pos() of the first word whose
 * word_get_gov() value equals minus its distance to that word (presumably the
 * closest right dependent of s1 -- confirm against the *_r variants).
 *
 * Returns -1 when s1 is absent, when the scan reaches the index of b0
 * (word_buffer_b0()), when the end of the buffer is reached, or when no such
 * word is found.
 */
int rdep_s1p(void *input){
  config *cfg = input;
  word *gov = stack_s1(config_get_stack((config *) cfg));
  word *cand;
  int pos;
  int offset;

  if(!gov) return -1;

  pos = word_get_index(gov) + 1;
  while(pos < word_buffer_get_nbelem(config_get_buffer((config *) cfg))){
    cand = word_buffer_get_word_n(config_get_buffer((config *) cfg), pos);
    /* only words located strictly before b0 are considered */
    if(pos >= word_get_index(word_buffer_b0(config_get_buffer((config *) cfg)))) return -1;
    offset = pos - word_get_index(gov);
    if(word_get_gov(cand) == - offset){
      return word_get_pos(cand);
    }
    pos++;
  }
  return -1;
}
int rdep_s1r(void *input){
config *c = input;
word *gov = stack_s1(config_get_stack((config *) c));
......@@ -535,6 +635,26 @@ int rdep_b0r(void *input){
return -1;
}
/*
 * rdep_b0p: walks the word buffer rightwards, starting just after the word
 * returned by word_buffer_b0(), looking for the first word whose
 * word_get_gov() value equals minus its distance to that word, and returns
 * its word_get_pos() value.
 *
 * Returns -1 when b0 is absent, when the scan reaches the index of b0, when
 * the end of the buffer is reached, or when no such word is found.
 *
 * NOTE(review): the scan starts at index(b0) + 1 and is abandoned as soon as
 * the position is >= index(b0), so this function looks like it always
 * returns -1 -- confirm whether that is intended.
 */
int rdep_b0p(void *input){
  config *cfg = input;
  word *gov = word_buffer_b0(config_get_buffer((config *) cfg));
  word *cand;
  int pos;
  int offset;

  if(!gov) return -1;

  pos = word_get_index(gov) + 1;
  while(pos < word_buffer_get_nbelem(config_get_buffer((config *) cfg))){
    cand = word_buffer_get_word_n(config_get_buffer((config *) cfg), pos);
    /* only words located strictly before b0 are considered */
    if(pos >= word_get_index(word_buffer_b0(config_get_buffer((config *) cfg)))) return -1;
    offset = pos - word_get_index(gov);
    if(word_get_gov(cand) == - offset){
      return word_get_pos(cand);
    }
    pos++;
  }
  return -1;
}
int ndep_b0(void *input){
config *c = input;
......@@ -1118,10 +1238,16 @@ feat_lib *feat_lib_build(void)
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s0r", ldep_s0r);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s0r", rdep_s0r);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s0p", ldep_s0p);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s0p", rdep_s0p);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s1r", ldep_s1r);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s1r", rdep_s1r);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s1p", ldep_s1p);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s1p", rdep_s1p);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_b0r", ldep_b0r);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_b0r", rdep_b0r);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_b0p", ldep_b0p);
feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_b0p", rdep_b0p);
feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_b0", ndep_b0);
feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_s0", ndep_s0);
......
......@@ -423,12 +423,18 @@ int gs0p(void *input);
int ldep_s0r(void *input);
int rdep_s0r(void *input);
int ldep_s0p(void *input);
int rdep_s0p(void *input);
int ldep_s1r(void *input);
int rdep_s1r(void *input);
int ldep_s1p(void *input);
int rdep_s1p(void *input);
int ndep_b0(void *input);
int ndep_s0(void *input);
int ldep_b0r(void *input);
int rdep_b0r(void *input);
int ldep_b0p(void *input);
int rdep_b0p(void *input);
/* distance features */
......
......@@ -152,6 +152,7 @@ int main(int argc, char *argv[])
ctx = context_read_options(argc, argv);
maca_trans_parser_mcf2cff_check_options(ctx);
ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
if(ctx->mode == TRAIN_MODE){
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment