diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c index 150bdae8858af9b8f214ed1b0ad6866702fe0b64..b25bca3df0e2b0a87abe36d74c5c6ed692db65b6 100644 --- a/maca_common/src/trie.c +++ b/maca_common/src/trie.c @@ -142,8 +142,8 @@ int trie_lookup(trie *t, int *word, int length) break; } } - if(trans == NULL) - return 0; + if(trans == NULL) + return 0; } return t->states[current_state]->is_accept; } diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index b0966453a6050b4febc293302ccb742427884cfd..d80e7cf332474e9c2a11ba25c80f7bf1dfd2b345 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -38,6 +38,33 @@ int look_for_accept_state_in_path(trie *mwe_trie, int *states_array, int path_in return -1; } +void print_states_array(char *buffer, context *ctx, trie *mwe_trie, dico *d_mwe_tokens, int *states_array, int *symbols_array, int path_index) +{ /* Prints the tokens stored in symbols_array[0 .. path_index-1] on stdout; does nothing when path_index is 0. The tokens up to the index returned by look_for_accept_state_in_path() are output as one mwe: when ctx->paste is set they are written on one line joined by ctx->mwe_tokens_separator, otherwise one per line followed by a tab and 1 for the first token and a tab and 0 for the following ones. The tokens after that index are printed one per line (followed by a tab and 1 when ctx->paste is not set). The buffer parameter is not read by this function. */ + int i; + if(path_index == 0) return; + int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index); + /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */ + for(i=0; i <= accept_state_index; i++){ + if(ctx->paste){ + if(i > 0) printf("%s", ctx->mwe_tokens_separator); + printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); + } + else{ + if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + } + } + if(ctx->paste) + if(accept_state_index != -1) printf("\n"); /* ends the pasted mwe line; skipped when accept_state_index is -1, in which case the loop above printed nothing */ + /* all tokens in path s.t. 
accept_state_index < token_index < path_index do not form an mwe, they are just printed */ + for(i = accept_state_index + 1; i < path_index; i++){ + if(ctx->paste) + printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + else + printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); + } +} + int main(int argc, char *argv[]) { @@ -48,10 +75,10 @@ int main(int argc, char *argv[]) FILE *f = NULL; trie *mwe_trie; dico *d_mwe_tokens = NULL; - int states_array[100]; + int states_array[100]; /* an array in which we store the states we have traversed in the trie */ int symbols_array[100]; int path_index = 0; - int i; + int next_state; ctx = context_read_options(argc, argv); maca_lexer_check_options(ctx); @@ -74,77 +101,69 @@ int main(int argc, char *argv[]) d_mwe_tokens = dico_read(ctx->mwe_tokens_dico_filename, 0.5); /* trie_print(stdout, mwe_trie); */ - - /* look for a valid word */ while(fgets(buffer, 10000, f)){ - if(feof(f)) return 0; /* no more words to read */ + /* look for a valid form */ if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')){ printf("\n"); continue; } - buffer[strlen(buffer)-1] = '\0'; + /* look for code of form read */ form_code = dico_string2int(d_mwe_tokens, buffer); - symbols_array[path_index] = form_code; - states_array[path_index] = (form_code == -1)? 0 - : trie_destination_state(mwe_trie, (path_index == 0) ? 
0 : states_array[path_index - 1], form_code); - /* - printf("buffer = %s ", buffer); - printf("code = %d\n", form_code); - - - for(i=0; i <= path_index; i++){ - printf("%d ", states_array[i]); - } - printf("\n"); - for(i=0; i <= path_index; i++){ - printf("%d ", symbols_array[i]); + + if(form_code == -1){ + /* if form has no code, it cannot be part of a mwe, print the potential mwe discovered so far */ + print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + path_index = 0; + /* print the current form */ + if(ctx->paste) + printf("%s\n", buffer); + else + printf("%s\t1\n", buffer); + continue; } - printf("\n"); - */ - if(states_array[path_index] == 0){ /* in initial state of trie */ - /* nothing has been recognized */ - if(path_index == 0) - if(ctx->paste) - printf("%s\n", buffer); - else - printf("%s\t1\n", buffer); - else{ /* there is something in the path */ - int accept_state_index = look_for_accept_state_in_path(mwe_trie, states_array, path_index); - /* all tokens in path s.t. 0 <= token_index <= accept_state_index form an mwe */ - for(i=0; i <= accept_state_index; i++){ - if(ctx->paste){ - if(i > 0) printf("%s", ctx->mwe_tokens_separator); - printf("%s", dico_int2string(d_mwe_tokens, symbols_array[i])); - } - else{ - if(i==0) printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - else printf("%s\t0\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - } - } - if(ctx->paste) - if(accept_state_index != -1) printf("\n"); - /* all tokens in path s.t. 
accept_state_index < token_index < path_index do not form an mwe */ - for(i = accept_state_index + 1; i < path_index; i++){ - if(ctx->paste) - printf("%s\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - else - printf("%s\t1\n", dico_int2string(d_mwe_tokens, symbols_array[i])); - } - /* do not forget to print the current token */ - if(ctx->paste) - printf("%s\n", buffer); - else - printf("%s\t1\n", buffer); - path_index = 0; - } + + /* look for the next state in the trie */ + next_state = trie_destination_state(mwe_trie, (path_index == 0) ? 0: states_array[path_index - 1], form_code); + + if(next_state != 0){ + /* the path is growing */ + symbols_array[path_index] = form_code; + states_array[path_index] = next_state; + path_index++; /* NOTE(review): states_array and symbols_array are declared with 100 elements in main and no bound on path_index is checked here; confirm that no path in the trie can reach 100 tokens */ + continue; } - /* not in state 0 of trie we are processing tokens of a potential mwe */ - else{ + /* print the potential mwe discovered so far */ + print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + + if(path_index != 0) + /* if there was a path that aborted, see if there is a valid transition from state 0 with form */ + next_state = trie_destination_state(mwe_trie, 0, form_code); /* when path_index is 0, next_state was already computed from state 0 and is 0 at this point, because a non-zero value would have taken the continue above */ + + path_index = 0; + if(next_state){ + /* such a transition exists */ + symbols_array[path_index] = form_code; + states_array[path_index] = next_state; path_index++; + continue; } + /* such a transition does not exist, just print the form */ + if(ctx->paste) + printf("%s\n", buffer); + else + printf("%s\t1\n", buffer); + } + + if(path_index != 0){ + /* there is something in states array */ + /* print the potential mwe discovered so far */ + print_states_array(buffer, ctx, mwe_trie, d_mwe_tokens, states_array, symbols_array, path_index); + path_index = 0; } + return 0; } + diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c index ec84300011c7bbd72c51601c9cf1774455f8cfb4..fdb525e33391bb91f27236bcfe8b80c36ac69bf4 100644 --- a/maca_trans_parser/src/feat_fct.c +++ 
b/maca_trans_parser/src/feat_fct.c @@ -435,6 +435,26 @@ int ldep_s0r(void *input){ return -1; } +int ldep_s0p(void *input){ /* Takes the word returned by stack_s0() as governor and scans the word buffer backwards from the index just before it while i > 0 (index 0 is never examined). Returns word_get_pos() of the first word whose word_get_gov() equals its distance to the governor; returns -1 when the governor is NULL, when a word with word_get_sent_seg() equal to 1 is met first, or when no such word is found. */ + config *c = input; + word *gov = stack_s0(config_get_stack((config *) c)); + int i; + word *dep; + int dist; + + if(gov){ + for(i=word_get_index(gov) - 1; i > 0 ; i--){ + dep = word_buffer_get_word_n(config_get_buffer((config *) c), i); + if(word_get_sent_seg(dep) == 1) return -1; + dist = word_get_index(gov) - i; + if(word_get_gov(dep) == dist){ + return word_get_pos(dep); + } + } + } + return -1; +} + int ldep_s1r(void *input){ config *c = input; word *gov = stack_s1(config_get_stack((config *) c)); @@ -455,6 +475,26 @@ int ldep_s1r(void *input){ return -1; } +int ldep_s1p(void *input){ /* Takes the word returned by stack_s1() as governor and scans the word buffer backwards from the index just before it while i > 0. Returns word_get_pos() of the first word whose word_get_gov() equals its distance to the governor; returns -1 when the governor is NULL, when a word with word_get_sent_seg() equal to 1 is met first, or when no such word is found. */ + config *c = input; + word *gov = stack_s1(config_get_stack((config *) c)); + int i; + word *dep; + int dist; + + if(gov){ + for(i=word_get_index(gov) - 1; i > 0 ; i--){ + dep = word_buffer_get_word_n(config_get_buffer((config *) c), i); + if(word_get_sent_seg(dep) == 1) return -1; + dist = word_get_index(gov) - i; + if(word_get_gov(dep) == dist){ + return word_get_pos(dep); + } + } + } + return -1; +} + int ldep_b0r(void *input){ config *c = input; word *gov = word_buffer_b0(config_get_buffer((config *) c)); @@ -475,6 +515,26 @@ int ldep_b0r(void *input){ return -1; } +int ldep_b0p(void *input){ /* Takes the word returned by word_buffer_b0() as governor and scans the word buffer backwards from the index just before it while i > 0. Returns word_get_pos() of the first word whose word_get_gov() equals its distance to the governor; returns -1 when the governor is NULL, when a word with word_get_sent_seg() equal to 1 is met first, or when no such word is found. */ + config *c = input; + word *gov = word_buffer_b0(config_get_buffer((config *) c)); + int i; + word *dep; + int dist; + + if(gov){ + for(i=word_get_index(gov) - 1; i > 0 ; i--){ + dep = word_buffer_get_word_n(config_get_buffer((config *) c), i); + if(word_get_sent_seg(dep) == 1) return -1; + dist = word_get_index(gov) - i; + if(word_get_gov(dep) == dist){ + return word_get_pos(dep); + } + } + } + return -1; +} + int rdep_s0r(void *input){ config *c = input; word *gov = stack_s0(config_get_stack((config *) c)); @@ -495,6 +555,46 @@ int rdep_s0r(void *input){ return -1; } +int rdep_s0p(void *input){ /* Takes the word returned by stack_s0() as governor and scans the word buffer forward from the index just after it, below word_buffer_get_nbelem(). Returns word_get_pos() of the first word whose word_get_gov() equals minus its distance to the governor; returns -1 when the governor is NULL, as soon as the scanned index reaches the index of the word returned by word_buffer_b0(), or when the end of the buffer is reached. NOTE(review): the result of word_buffer_b0() is passed to word_get_index() without a NULL test; confirm it cannot be NULL here. */ + config *c = input; + word *gov = 
stack_s0(config_get_stack((config *) c)); + int i; + word *dep; + int dist; + + if(gov){ + for(i=word_get_index(gov) + 1; i < word_buffer_get_nbelem(config_get_buffer((config *) c)) ; i++){ + dep = word_buffer_get_word_n(config_get_buffer((config *) c), i); + if(i >= word_get_index(word_buffer_b0(config_get_buffer((config *) c)))) return -1; + dist = i - word_get_index(gov); + if(word_get_gov(dep) == - dist){ + return word_get_pos(dep); + } + } + } + return -1; +} + +int rdep_s1p(void *input){ /* Takes the word returned by stack_s1() as governor and scans the word buffer forward from the index just after it, below word_buffer_get_nbelem(). Returns word_get_pos() of the first word whose word_get_gov() equals minus its distance to the governor; returns -1 when the governor is NULL, as soon as the scanned index reaches the index of the word returned by word_buffer_b0(), or when the end of the buffer is reached. NOTE(review): the result of word_buffer_b0() is passed to word_get_index() without a NULL test; confirm it cannot be NULL here. */ + config *c = input; + word *gov = stack_s1(config_get_stack((config *) c)); + int i; + word *dep; + int dist; + + if(gov){ + for(i=word_get_index(gov) + 1; i < word_buffer_get_nbelem(config_get_buffer((config *) c)) ; i++){ + dep = word_buffer_get_word_n(config_get_buffer((config *) c), i); + if(i >= word_get_index(word_buffer_b0(config_get_buffer((config *) c)))) return -1; + dist = i - word_get_index(gov); + if(word_get_gov(dep) == - dist){ + return word_get_pos(dep); + } + } + } + return -1; +} + int rdep_s1r(void *input){ config *c = input; word *gov = stack_s1(config_get_stack((config *) c)); @@ -535,6 +635,26 @@ int rdep_b0r(void *input){ return -1; } +int rdep_b0p(void *input){ /* Takes the word returned by word_buffer_b0() as governor and runs the same forward scan as the other rdep functions, returning word_get_pos() of a matching word or -1. NOTE(review): the governor is itself the word returned by word_buffer_b0(), and the scan starts at its index plus one, so the i >= index-of-b0 test looks true on the first iteration and the function appears to return -1 in every case; confirm this is intended. */ + config *c = input; + word *gov = word_buffer_b0(config_get_buffer((config *) c)); + int i; + word *dep; + int dist; + + if(gov){ + for(i=word_get_index(gov) + 1; i < word_buffer_get_nbelem(config_get_buffer((config *) c)) ; i++){ + dep = word_buffer_get_word_n(config_get_buffer((config *) c), i); + if(i >= word_get_index(word_buffer_b0(config_get_buffer((config *) c)))) return -1; + dist = i - word_get_index(gov); + if(word_get_gov(dep) == - dist){ + return word_get_pos(dep); + } + } + } + return -1; +} + int ndep_b0(void *input){ config *c = input; @@ -1118,10 +1238,16 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s0r", ldep_s0r); feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s0r", rdep_s0r); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s0p", 
ldep_s0p); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s0p", rdep_s0p); /* NOTE(review): ldep_s0p and rdep_s0p (like the s1 and b0 variants registered below) return word_get_pos() but are registered with FEAT_TYPE_LABEL, the type used for the ldep_s0r and rdep_s0r entries; confirm this is the intended feature type */ feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s1r", ldep_s1r); feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s1r", rdep_s1r); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_s1p", ldep_s1p); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_s1p", rdep_s1p); feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_b0r", ldep_b0r); feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_b0r", rdep_b0r); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_b0p", ldep_b0p); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_b0p", rdep_b0p); feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_b0", ndep_b0); feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_s0", ndep_s0); diff --git a/maca_trans_parser/src/feat_fct.h b/maca_trans_parser/src/feat_fct.h index 8650e6bf303f20529c7ebd0efb1d91720dc3cdea..35820ec8dee1b696493efeb76ec91de5dc47646d 100644 --- a/maca_trans_parser/src/feat_fct.h +++ b/maca_trans_parser/src/feat_fct.h @@ -423,12 +423,18 @@ int gs0p(void *input); int ldep_s0r(void *input); int rdep_s0r(void *input); +int ldep_s0p(void *input); +int rdep_s0p(void *input); int ldep_s1r(void *input); int rdep_s1r(void *input); +int ldep_s1p(void *input); +int rdep_s1p(void *input); int ndep_b0(void *input); int ndep_s0(void *input); int ldep_b0r(void *input); int rdep_b0r(void *input); +int ldep_b0p(void *input); +int rdep_b0p(void *input); /* distance features */ diff --git a/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c index ef78339a7f5b55edf273e2332e11d0a0d1205eed..11a3bb5879b72c9fd53b5cb124732a2f03b02a34 100644 --- a/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c @@ -152,6 +152,7 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_trans_parser_mcf2cff_check_options(ctx); + ctx->features_model = 
feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); /* the feature model is read here from ctx->features_model_filename, before the TRAIN_MODE test that follows */ if(ctx->mode == TRAIN_MODE){