diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 1fd242cb23c8dee976a0e1fcbe29022e0e9f6b28..5c6747eba43a0cff072a031c0e412ace373d26bb 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -10,7 +10,7 @@ #define MCD_WF_NB 36 -#define MCD_WF_INDEX 0 +#define MCD_WF_ID 0 #define MCD_WF_FORM 1 #define MCD_WF_LEMMA 2 #define MCD_WF_CPOS 3 @@ -53,7 +53,7 @@ #define mcd_get_dico_label(m) (m)->dico_array[MCD_WF_LABEL] -#define mcd_get_index_col(m) (m)->wf2col[MCD_WF_INDEX] +#define mcd_get_index_col(m) (m)->wf2col[MCD_WF_ID] #define mcd_get_form_col(m) (m)->wf2col[MCD_WF_FORM] #define mcd_get_lemma_col(m) (m)->wf2col[MCD_WF_LEMMA] #define mcd_get_cpos_col(m) (m)->wf2col[MCD_WF_CPOS] diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 7606d4cf31bd94d38a7d087a8def69db4650fb0d..14d3c05a0461313c47f163a434f58a6c6f0fe28c 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -3,13 +3,13 @@ #include "mcd.h" -#define word_get_index(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_INDEX] +#define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID]) #define word_get_form(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM] #define word_get_lemma(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LEMMA] #define word_get_cpos(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_CPOS] #define word_get_pos(w) ((w) == NULL) ? -1 : ((w)->wf_array[MCD_WF_POS]) #define word_get_feats(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FEATS] -#define word_get_gov(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_GOV] +#define word_get_gov(w) (((w) == NULL) ? 0 : (w)->wf_array[MCD_WF_GOV]) #define word_get_label(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LABEL] #define word_get_stag(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_STAG] #define word_get_sent_seg(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_SENT_SEG] @@ -39,11 +39,12 @@ #define word_get_X(w) ((w) == NULL) ? 
-1 : (w)->wf_array[MCD_WF_X] #define word_get_Y(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_Y] #define word_get_Z(w) ((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_Z] +#define word_get_input(w) (((w) == NULL) ? NULL : (w)->input) #define word_get_signature(w) ((w) == NULL) ? -1 : (w)->signature #define word_get_U1(w) ((w) == NULL) ? -1 : (w)->U1 -#define word_get_relative_index(w) ((w) == NULL) ? -1 : (w)->relative_index +#define word_get_index(w) (((w) == NULL) ? -1 : (w)->index) -#define word_set_index(w, val) (w)->wf_array[MCD_WF_INDEX] = (val) +#define word_set_id(w, val) (w)->wf_array[MCD_WF_ID] = (val) #define word_set_form(w, val) (w)->wf_array[MCD_WF_FORM] = (val) #define word_set_lemma(w, val) (w)->wf_array[MCD_WF_LEMMA] = (val) #define word_set_cpos(w, val) (w)->wf_array[MCD_WF_CPOS] = (val) @@ -80,7 +81,7 @@ #define word_set_Y(w, val) (w)->wf_array[MCD_WF_Y] = (val) #define word_set_Z(w, val) (w)->wf_array[MCD_WF_Z] = (val) #define word_set_signature(w, val) (w)->signature = (val) -#define word_set_relative_index(w, val) (w)->relative_index = (val) +#define word_set_index(w, val) (w)->index = (val) typedef struct _word { int wf_array[MCD_WF_NB]; /* array containing the codes corresponding to the different word features */ @@ -89,7 +90,7 @@ typedef struct _word { int signature; /* pos tags that this form can have (represented as a boolean string) */ int label; char *form; - int relative_index; + int index; } word; word *word_new(char *input); @@ -104,6 +105,6 @@ word *word_read(FILE *f, mcd *mcd_struct); word *word_parse_buffer(char *buffer, mcd *mcd_struct); int word_is_eos(word *w, mcd *mcd_struct); -int word_get_gov_relative_index(word *w); +int word_get_gov_index(word *w); #endif diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c index ab86d127727f7197b2c6ea93469f74bd4003fb88..b2efb19294a1af2416128925693e2b6bc2e01a86 100644 --- a/maca_common/src/form2pos.c +++ b/maca_common/src/form2pos.c @@ -65,9 +65,9 @@ form2pos *form2pos_read(char 
*filename) int form2pos_get_signature(form2pos *f2p, char *form) { - /* if(form == NULL) + if(form == NULL) return -1; - else*/ + else return hash_get_val(f2p->h_form2signature, form); } diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index 64be8859569b5af09667538c9a46239ad8d7511c..9a2dc6d94d2afefe9b1b70389c6bf71bb844479d 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -231,11 +231,11 @@ mcd *mcd_read(char *mcd_filename, int verbose) mcd *mcd_build_conll07(void) { mcd *m = mcd_new(8); - m->wf[0]=MCD_WF_INDEX; + m->wf[0]=MCD_WF_ID; m->wf_str[0]=strdup("INDEX"); m->representation[0]= MCD_REPRESENTATION_INT; m->filename[0] = strdup("_"); - m->wf2col[MCD_WF_INDEX] = 0; + m->wf2col[MCD_WF_ID] = 0; m->wf[1]=MCD_WF_FORM; m->wf_str[1]=strdup("FORM"); @@ -332,11 +332,11 @@ mcd *mcd_build_ifpls(void) { mcd *m = mcd_new(6); - m->wf[0]=MCD_WF_INDEX; + m->wf[0]=MCD_WF_ID; m->wf_str[0]=strdup("INDEX"); m->representation[0]= MCD_REPRESENTATION_INT; m->filename[0] = strdup("_"); - m->wf2col[MCD_WF_INDEX] = 0; + m->wf2col[MCD_WF_ID] = 0; m->wf[1]=MCD_WF_FORM; m->wf_str[1]=strdup("FORM"); @@ -389,7 +389,7 @@ dico_vec *mcd_build_dico_vec(mcd *mcd_struct) int mcd_wf_code(char *wf) { - if(!strcmp(wf, "INDEX")) return MCD_WF_INDEX; + if(!strcmp(wf, "INDEX")) return MCD_WF_ID; if(!strcmp(wf, "FORM")) return MCD_WF_FORM; if(!strcmp(wf, "LEMMA")) return MCD_WF_LEMMA; if(!strcmp(wf, "CPOS")) return MCD_WF_CPOS; diff --git a/maca_common/src/sentence.c b/maca_common/src/sentence.c index aaa5e1b587d77cbc97e9f65b23e6c75fe25e85af..a97486b02c565d33b0b87073e005d3ca0834c71e 100644 --- a/maca_common/src/sentence.c +++ b/maca_common/src/sentence.c @@ -48,7 +48,7 @@ void sentence_add_word(sentence *s, word *w) s->length++; s->words = (word **)realloc(s->words, s->length * sizeof(word *)); s->words[s->length -1] = w; - word_set_relative_index(w, s->length -1); + word_set_index(w, s->length -1); } void sentence_free(sentence *s) diff --git a/maca_common/src/word.c 
b/maca_common/src/word.c index be7d2f8d26f9763fb82cc89df9fd9412a79d89f6..1a668e4421a96b40af11657254c764c5c894447d 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -20,7 +20,7 @@ word *word_new(char *input) w->wf_array[MCD_WF_GOV] = 0; w->form = NULL; - w->relative_index = -1; + w->index = -1; w->signature = -1; return w; } @@ -102,8 +102,8 @@ word *word_create_dummy(mcd *mcd_struct) word *w = word_new(NULL); /* int type; */ - w->wf_array[MCD_WF_INDEX] = 0; - w->relative_index = 0; + w->wf_array[MCD_WF_ID] = 0; + w->index = 0; /* for(type = 1; type < MCD_WF_NB; type++) w->wf_array[type] = -1;*/ /* if(mcd_struct->wf2col[type] != -1) @@ -120,8 +120,8 @@ void word_print2(FILE *f, word *w) printf("form = %d\t", word_get_form(w)); printf("lemma = %d\t", word_get_lemma(w)); printf("pos = %d\t", word_get_pos(w)); - printf("index = %d\t", word_get_index(w)); - printf("rel index = %d\n", word_get_relative_index(w)); + printf("index = %d\t", word_get_id(w)); + printf("rel index = %d\n", word_get_index(w)); } void word_print(FILE *f, word *w) @@ -140,16 +140,10 @@ int word_is_eos(word *w, mcd *mcd_struct) return word_get_sent_seg(w); } -int word_get_gov_relative_index(word *w) +int word_get_gov_index(word *w) { int index; if(word_get_gov(w) == 0) return -1; - - /* printf("in word_get_gov_rel_index(%d)\n", word_get_relative_index(w)); - printf("gov = %d\n", word_get_gov(w)); */ - - index = (word_get_relative_index(w)) + (word_get_gov(w)); - /* printf("index = %d\n", index); */ - return index; - /* return word_get_relative_index(w) + word_get_gov(w); */ + index = (word_get_index(w)) + (word_get_gov(w)); + return index; } diff --git a/maca_common/src/word_buffer.c b/maca_common/src/word_buffer.c index aabe1876898902cf68621439b6b14c3ef0173a31..a22ae9edcee304521f76dce0ef04ac52e7785a54 100644 --- a/maca_common/src/word_buffer.c +++ b/maca_common/src/word_buffer.c @@ -49,19 +49,19 @@ void word_buffer_print_compact(FILE *f, word_buffer *wb) { word *w; w = 
word_buffer_bm3(wb); - if(w){ fprintf(f, "%d:%s ", word_get_relative_index(w), w->form);} + if(w){ fprintf(f, "%d:%s ", word_get_index(w), w->form);} w = word_buffer_bm2(wb); - if(w){ fprintf(f, "%d:%s ", word_get_relative_index(w), w->form);} + if(w){ fprintf(f, "%d:%s ", word_get_index(w), w->form);} w = word_buffer_bm1(wb); - if(w){ fprintf(f, "%d:%s ", word_get_relative_index(w), w->form);} + if(w){ fprintf(f, "%d:%s ", word_get_index(w), w->form);} w = word_buffer_b0(wb); - if(w){ fprintf(f, "[%d:%s] ", word_get_relative_index(w), w->form);} + if(w){ fprintf(f, "[%d:%s] ", word_get_index(w), w->form);} w = word_buffer_b1(wb); - if(w){ fprintf(f, "%d:%s ", word_get_relative_index(w), w->form);} + if(w){ fprintf(f, "%d:%s ", word_get_index(w), w->form);} w = word_buffer_b2(wb); - if(w){ fprintf(f, "%d:%s ", word_get_relative_index(w), w->form);} + if(w){ fprintf(f, "%d:%s ", word_get_index(w), w->form);} w = word_buffer_b3(wb); - if(w){ fprintf(f, "%d:%s ", word_get_relative_index(w), w->form);} + if(w){ fprintf(f, "%d:%s ", word_get_index(w), w->form);} fprintf(f, "\n"); } @@ -83,7 +83,7 @@ int word_buffer_add(word_buffer *wb, word *w) wb->array = (word **)realloc(wb->array, wb->size * sizeof(word *)); } wb->array[wb->nbelem] = w; - word_set_relative_index(w, wb->nbelem); + word_set_index(w, wb->nbelem); wb->nbelem++; return wb->nbelem - 1; @@ -116,6 +116,7 @@ int word_buffer_read_next_word(word_buffer *wb) word *w = NULL; w = word_read(wb->input_file, wb->mcd_struct); + /* if((w) && (w->input)) printf("## %s\n", w->input); */ if(w == NULL) return -1; word_buffer_add(wb, w); return wb->nbelem - 1; @@ -141,6 +142,8 @@ int word_buffer_move_left(word_buffer *wb) int word_buffer_end(word_buffer *wb) { + + /* printf("in word_buffer_end current index = %d nb elem = %d\n", wb->current_index, wb->nbelem); */ return (wb->current_index >= wb->nbelem)? 
1 : 0; } @@ -165,7 +168,7 @@ int word_buffer_read_sentence(word_buffer *wb) /* fprintf(stderr, "%s", buffer); */ if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) break; /* end of the sentence */ w = word_parse_buffer(buffer, word_buffer_get_mcd(wb)); - word_set_relative_index(w, index); + word_set_index(w, index); index++; word_buffer_add(wb, w); if(word_is_eos(w, word_buffer_get_mcd(wb))) break; diff --git a/maca_trans_parser/src/config.c b/maca_trans_parser/src/config.c index 8e809a8b0b6b6ac1d85a232f97b6abb8a6245b42..68e15dd464e277074117c0489c45ca56c34b379c 100644 --- a/maca_trans_parser/src/config.c +++ b/maca_trans_parser/src/config.c @@ -30,7 +30,7 @@ config *config_new(FILE *f, mcd *mcd_struct, int lookahead) w = word_read(c->f, c->mcd_struct); if(w == NULL) return NULL; - word_set_relative_index(w, c->current_index); + word_set_index(w, c->current_index); c->current_index++; queue_add(c->bf, w); return w; @@ -48,7 +48,7 @@ void config_free(config *c) int config_is_terminal(config *c) { - return word_buffer_is_last(c->bf); + return word_buffer_end(c->bf); } config *config_initial(FILE *f, mcd *mcd_struct, int lookahead) @@ -102,12 +102,12 @@ void config_add_mvt(config *c, int mvt) void config_print(FILE *f, config *c) { - word *b0 = NULL; - word *s0 = NULL; + /* word *b0 = NULL; */ + /* word *s0 = NULL; */ if(c){ - if(!stack_is_empty(c->st)) - s0 = stack_elt_n(c->st, 0); - b0 = word_buffer_b0(c->bf); + /* if(!stack_is_empty(c->st)) */ + /* s0 = stack_elt_n(c->st, 0); */ + /* b0 = word_buffer_b0(c->bf); */ /* if(s0) { printf("s0 = "); word_print2(stdout, s0);} */ /* if(b0) { printf("b0 = "); word_print2(stdout, b0);} */ diff --git a/maca_trans_parser/src/depset.c b/maca_trans_parser/src/depset.c index 3ce6b4776925c1e2320b651adff6193ff7361b06..3298f7e419bbd05f78377d1608263db3a4f499dd 100644 --- a/maca_trans_parser/src/depset.c +++ b/maca_trans_parser/src/depset.c @@ -44,10 +44,10 @@ void depset_add(depset *d, word *gov, int label, word *dep) int new_length; if(gov == NULL || 
dep == NULL) return; - word *max = (word_get_relative_index(gov) > word_get_relative_index(dep)) ? gov : dep; + word *max = (word_get_index(gov) > word_get_index(dep)) ? gov : dep; - if(word_get_relative_index(max) >= d->length){ - new_length = word_get_relative_index(max) + 1; + if(word_get_index(max) >= d->length){ + new_length = word_get_index(max) + 1; d->array = (dependency *)realloc(d->array, new_length * sizeof(dependency)); for(i=d->length; i < new_length; i++){ d->array[i].gov = NULL; @@ -56,9 +56,9 @@ void depset_add(depset *d, word *gov, int label, word *dep) } d->length = new_length; } - d->array[word_get_relative_index(dep)].gov = gov; - d->array[word_get_relative_index(dep)].dep = dep; - d->array[word_get_relative_index(dep)].label = label; + d->array[word_get_index(dep)].gov = gov; + d->array[word_get_index(dep)].dep = dep; + d->array[word_get_index(dep)].label = label; } void depset_print(FILE *f, depset *d) @@ -66,7 +66,7 @@ void depset_print(FILE *f, depset *d) int i; for(i=0; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)) - fprintf(f, "(%d, %d, %d) ", word_get_relative_index(d->array[i].dep), d->array[i].label, word_get_relative_index(d->array[i].gov)); + fprintf(f, "(%d, %d, %d) ", word_get_index(d->array[i].dep), d->array[i].label, word_get_index(d->array[i].gov)); } fprintf(f, "\n"); } @@ -78,7 +78,7 @@ void depset_print2(FILE *f, depset *d, dico *dico_labels) char *label; for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - distance = word_get_relative_index(d->array[i].gov) - word_get_relative_index(d->array[i].dep); + distance = word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep); /* fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, distance, dico_int2string(dico_labels, d->array[i].label)); */ label = dico_int2string(dico_labels, d->array[i].label); @@ -103,10 +103,10 @@ void depset_print3(FILE *f, depset *d, dico *dico_labels) for(i=1; i < d->length; i++){ if((d->array[i].gov) && 
(d->array[i].dep)){ if(d->array[i].label == root_code) - fprintf(f, "%d\t%s\t%d\t%s\n", word_get_relative_index(d->array[i].dep), d->array[i].dep->input, 0, dico_int2string(dico_labels, d->array[i].label)); + fprintf(f, "%d\t%s\t%d\t%s\n", word_get_index(d->array[i].dep), d->array[i].dep->input, 0, dico_int2string(dico_labels, d->array[i].label)); else{ - distance = word_get_relative_index(d->array[i].gov) - word_get_relative_index(d->array[i].dep); - fprintf(f, "%d\t%s\t%d\t%s\n", word_get_relative_index(d->array[i].dep), d->array[i].dep->input, distance, dico_int2string(dico_labels, d->array[i].label)); + distance = word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep); + fprintf(f, "%d\t%s\t%d\t%s\n", word_get_index(d->array[i].dep), d->array[i].dep->input, distance, dico_int2string(dico_labels, d->array[i].label)); } } } @@ -129,9 +129,9 @@ void depset_print_new_index(FILE *f, depset *d, dico *dico_labels) for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - /* fprintf(f, "%d\t", word_get_relative_index(d->array[i].dep)); */ - fprintf(f, "%d\t", word_get_relative_index(d->array[i].dep)); - fprintf(f, "%s\t%d\t%s\n", skip_index(d->array[i].dep->input), word_get_relative_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); + /* fprintf(f, "%d\t", word_get_index(d->array[i].dep)); */ + fprintf(f, "%d\t", word_get_index(d->array[i].dep)); + fprintf(f, "%s\t%d\t%s\n", skip_index(d->array[i].dep->input), word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); } } fprintf(f, "\n"); @@ -148,8 +148,8 @@ int depset_compare(depset *d1, depset *d2) if(d1->length != d2->length){ fprintf(stdout, "fail\n"); return 0;} for(i=0; i < d1->length; i++){ for(j=0; j < d2->length; j++){ - if((word_get_relative_index(d1->array[i].gov) == word_get_relative_index(d2->array[j].gov)) - && (word_get_relative_index(d1->array[i].dep) == word_get_relative_index(d2->array[j].dep)) + 
if((word_get_index(d1->array[i].gov) == word_get_index(d2->array[j].gov)) + && (word_get_index(d1->array[i].dep) == word_get_index(d2->array[j].dep)) && (d1->array[i].label == d2->array[j].label)) break; } if(j == d2->length){ diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c index c37a01b0f43e84679ecbcee31c49a041f0f089d0..5740caadc35a99d1b09714354b3179343d3d4fc6 100644 --- a/maca_trans_parser/src/feat_fct.c +++ b/maca_trans_parser/src/feat_fct.c @@ -8,6 +8,9 @@ /* word features */ /* words in the stack */ +int s0g(config *c) {return word_get_gov(stack_s0(config_get_stack(c)));} +int s0sf(config *c) {return word_get_label(stack_s0(config_get_stack(c)));} + int s0f(config *c) {return word_get_form(stack_s0(config_get_stack(c)));} int s0l(config *c) {return word_get_lemma(stack_s0(config_get_stack(c)));} int s0c(config *c) {return word_get_cpos(stack_s0(config_get_stack(c)));} @@ -44,6 +47,9 @@ int s0Z(config *c) {return word_get_Z(stack_s0(config_get_stack(c)));} int s0U1(config *c) {return word_get_U1(stack_s0(config_get_stack(c)));} int s0sgn(config *c) {return word_get_signature(stack_s0(config_get_stack(c)));} +int s1g(config *c) {return word_get_gov(stack_s1(config_get_stack(c)));} +int s1sf(config *c) {return word_get_label(stack_s1(config_get_stack(c)));} + int s1f(config *c) {return word_get_form(stack_s1(config_get_stack(c)));} int s1l(config *c) {return word_get_lemma(stack_s1(config_get_stack(c)));} int s1c(config *c) {return word_get_cpos(stack_s1(config_get_stack(c)));} @@ -400,8 +406,8 @@ int ldep_s0r(config *c){ int i; if(top){ - if(word_get_relative_index(top) >= c->ds->length) return -1; - for(i=word_get_relative_index(top); i > 0; i--) + if(word_get_index(top) >= c->ds->length) return -1; + for(i=word_get_index(top); i > 0; i--) if(c->ds->array[i].gov == top) return i; } @@ -422,7 +428,7 @@ int rdep_s0r(config *c){ int i; if(top) - for(i=word_get_relative_index(top); i < c->ds->length; i++) + 
for(i=word_get_index(top); i < c->ds->length; i++) if(c->ds->array[i].gov == top) return i; return -1; @@ -442,8 +448,8 @@ int ldep_b0r(config *c){ int i; if(top){ - if(word_get_relative_index(top) >= c->ds->length) return -1; - for(i=word_get_relative_index(top); i > 0; i--) + if(word_get_index(top) >= c->ds->length) return -1; + for(i=word_get_index(top); i > 0; i--) if(c->ds->array[i].gov == top) return i; } @@ -468,7 +474,7 @@ int rdep_b0r(config *c){ int i; if(top) - for(i=word_get_relative_index(top); i < c->ds->length; i++) + for(i=word_get_index(top); i < c->ds->length; i++) if(c->ds->array[i].gov == top) return i; return -1; @@ -545,7 +551,7 @@ int dist_s0_b0(config *c){ if(stack_is_empty(c->st) || word_buffer_is_empty(c->bf)) return 0; - dist = word_get_relative_index(word_buffer_b0(c->bf)) - word_get_relative_index(stack_top(c->st)); + dist = word_get_index(word_buffer_b0(c->bf)) - word_get_index(stack_top(c->st)); return (abs(dist) > 6)? 6 : dist; } diff --git a/maca_trans_parser/src/feat_fct.h b/maca_trans_parser/src/feat_fct.h index 84f8b2adacab1ec71540ef6fde58490b794373e3..6f91a3afc8d1e91aae837918ec2fee505f3dbc80 100644 --- a/maca_trans_parser/src/feat_fct.h +++ b/maca_trans_parser/src/feat_fct.h @@ -6,6 +6,9 @@ typedef int (*feat_fct) (config *c); /* word features */ +int s0g(config *c); +int s0sf(config *c); + int s0f(config *c); int s0l(config *c); int s0c(config *c); @@ -44,6 +47,10 @@ int s0sgn(config *c); int s0r(config *c); +int s1g(config *c); +int s1sf(config *c); + + int s1f(config *c); int s1l(config *c); int s1c(config *c); diff --git a/maca_trans_parser/src/feat_lib.c b/maca_trans_parser/src/feat_lib.c index 1567bd72904f9042a2d129df18f396109926e8c7..6a2293f5327804028f34d66605af322ffaadeb14 100644 --- a/maca_trans_parser/src/feat_lib.c +++ b/maca_trans_parser/src/feat_lib.c @@ -33,6 +33,9 @@ feat_lib *feat_lib_build(void) { feat_lib *fl = feat_lib_new(); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"s0sf", s0sf); + feat_lib_add(fl, 
FEAT_TYPE_INT , (char *)"s0g", s0g); + feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"s0f", s0f); feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"s0l", s0l); feat_lib_add(fl, FEAT_TYPE_CPOS, (char *)"s0c", s0c); @@ -69,6 +72,9 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"s0U1", s0U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s0sgn", s0sgn); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s1g", s1g); + feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"s1sf", s1sf); + feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"s1f", s1f); feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"s1l", s1l); feat_lib_add(fl, FEAT_TYPE_CPOS, (char *)"s1c", s1c); diff --git a/maca_trans_parser/src/maca_trans_parser_mcf2cff.c b/maca_trans_parser/src/maca_trans_parser_mcf2cff.c index 645307c5596c17747d6d2bd7574f1cdd9279a4a4..77f2ab64664ecae7eabdfd722fc30f3b4773a1d4 100644 --- a/maca_trans_parser/src/maca_trans_parser_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_parser_mcf2cff.c @@ -54,38 +54,39 @@ void generate_training_file_stream(FILE *output_file, context *ctx) int eos_label = dico_string2int(ctx->dico_labels, "eos"); word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct); FILE *mcf_file = myfopen(ctx->input_filename, "r"); - + int start_sentence_index = 1; + /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */ + /* the idea is to ignore syntax in the mcf file that will be read */ + /* it is ugly !!! 
*/ + mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct); mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV); mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL); c = config_initial(mcf_file, mcd_struct_hyp, 5); - + while(!word_buffer_end(ref)){ - /* printf("************ REF ************\n"); + /*printf("************ REF ************\n"); word_buffer_print(stdout, ref); - printf("*****************************\n");*/ + printf("*****************************\n");*/ - printf("*****************************\n"); - config_print(stdout,c); + /* printf("*****************************\n"); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); /* feat_vec_print(stdout, fv); */ - mvt_code = oracle_parser_arc_eager(c, ref); + mvt_code = oracle_parser_arc_eager(c, ref, start_sentence_index); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); - - /* printf("mvt code = %d\n", mvt_code); - printf("mvt type = %d\n", mvt_type); */ - movement_print(stdout, mvt_code, ctx->dico_labels); + + /* config_print(stdout,c); */ + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ fprintf(output_file, "%d", mvt_code); feat_vec_print(output_file, fv); - if(mvt_type == MVT_LEFT){ movement_left_arc(c, mvt_label, 0); continue; @@ -94,35 +95,21 @@ void generate_training_file_stream(FILE *output_file, context *ctx) if(mvt_type == MVT_RIGHT){ movement_right_arc(c, mvt_label, 0); word_buffer_move_right(ref); - if((mvt_label == eos_label)){ /* sentence is complete */ - printf("sentence complete\n"); - - while(movement_reduce(c,0)){ - printf("reduce\n"); - printf("*****************************\n"); - config_print(stdout,c); - - } - - /* pop eos from stack */ - /* stack_pop(config_get_stack(c)); */ - /* pop root from stack */ - /* stack_pop(config_get_stack(c)); */ - - config_print(stdout,c); + sentence_nb++; + start_sentence_index = word_get_index(word_buffer_b0(config_get_buffer(c))) - 1; + /* printf("%d\n", start_sentence_index); */ - 
printf("ref current index = %d\n", word_buffer_get_current_index(ref)); + /* printf("*****************************\n"); */ + /* config_print(stdout,c); */ if(word_buffer_is_last(ref)){ - printf("it is the end\n"); + /* printf("it is the end\n"); */ break; } - - /* change index of dummy word */ - word_set_relative_index(stack_top(config_get_stack(c)), word_get_relative_index(word_buffer_b0(config_get_buffer(c))) - 1); - } + } continue; } + if(mvt_type == MVT_REDUCE){ movement_reduce(c, 0); continue; @@ -133,61 +120,7 @@ void generate_training_file_stream(FILE *output_file, context *ctx) continue; } } -} - - -#if 0 -void generate_training_file_buffer(FILE *output_file, context *ctx) -{ - config *c; - int mvt_code; - char mvt_type; - int mvt_label; - feat_vec *fv = feat_vec_new(feature_types_nb); - sentence *ref = NULL; - int sentence_nb = 0; - FILE *conll_file = myfopen(ctx->input_filename, "r"); - FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); - - c = config_initial(conll_file, ctx->mcd_struct, 0); - - while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ - /* sentence_print(stdout, ref, NULL); */ - word_buffer_read_sentence(c->bf); - while(!config_is_terminal(c)){ - /* config_print(stdout,c); */ - - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - - mvt_code = oracle_parser(c, ref); - - mvt_type = movement_type(mvt_code); - mvt_label = movement_label(mvt_code); - - /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ - - fprintf(output_file, "%d", mvt_code); - feat_vec_print(output_file, fv); - - if(mvt_type == MVT_LEFT){ - movement_left_arc(c, mvt_label, 0); - continue; - } - if(mvt_type == MVT_RIGHT){ - movement_right_arc(c, mvt_label, 0); - continue; - } - if(mvt_type == MVT_SHIFT){ - movement_shift(c, 0, 0); - continue; - } - } - config_free(c); - c = config_initial(conll_file, ctx->mcd_struct, 0); - sentence_nb++; - } } -#endif int main(int argc, char 
*argv[]) { @@ -238,16 +171,9 @@ int main(int argc, char *argv[]) output_file = myfopen(ctx->cff_filename, "w"); else output_file = stdout; - - - generate_training_file_stream(output_file, ctx); - - /* - if(ctx->stream_mode) - generate_training_file_stream(output_file, ctx); - else - generate_training_file_buffer(output_file, ctx); - */ + + generate_training_file_stream(output_file, ctx); + if(ctx->mode == TRAIN_MODE){ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ dico_vec_print(ctx->vocabs_filename, ctx->vocabs); diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c index 3dadf7bbb07deb0a9d498b45b18bebf959b40552..6936c6399c713da664876b45ec181e39ee043bb2 100644 --- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c @@ -87,7 +87,8 @@ void generate_training_file_stream(FILE *output_file, context *ctx) postag = oracle_tagger(c, NULL); fprintf(output_file, "%d", postag); feat_vec_print(output_file, fv); - movement_tagger(c, postag, 0, 1); + int res = movement_tagger(c, postag, 0, 1); + if(res == 0) break; } } diff --git a/maca_trans_parser/src/movement_parser.c b/maca_trans_parser/src/movement_parser.c index 254cab39d47f915545df5cb42b183986191b2d7a..d53556822e141778d4a7908aa5e4b40e2c0b95ab 100644 --- a/maca_trans_parser/src/movement_parser.c +++ b/maca_trans_parser/src/movement_parser.c @@ -37,10 +37,10 @@ int movement_left_arc(config *c, int label, float score) { if(stack_is_empty(c->st)) return 0; if(word_buffer_is_empty(c->bf)) return 0; - if(word_get_relative_index(stack_top(c->st)) == 0) return 0; /* the dummy word cannot be a dependent */ + if(word_get_index(stack_top(c->st)) == 0) return 0; /* the dummy word cannot be a dependent */ /* create a new dependency */ - word_set_gov(stack_top(c->st), word_get_relative_index(word_buffer_b0(c->bf))); + word_set_gov(stack_top(c->st), 
word_get_index(word_buffer_b0(c->bf))); word_set_label(stack_top(c->st), label); /* depset_add(c->ds, word_buffer_b0(c->bf), label, stack_top(c->st)); */ @@ -57,7 +57,7 @@ int movement_right_arc(config *c, int label, float score) /* create a new dependency */ - word_set_gov(word_buffer_b0(c->bf), word_get_relative_index(stack_top(c->st))); + word_set_gov(word_buffer_b0(c->bf), word_get_index(stack_top(c->st))); word_set_label(word_buffer_b0(c->bf), label); /* depset_add(c->ds, stack_top(c->st), label, word_buffer_b0(c->bf)); */ @@ -96,7 +96,7 @@ config *movement_left_arc_dup(config *c, int label, float score, feat_vec *fv) config *copy = NULL; if(stack_is_empty(c->st)) return NULL; if(word_buffer_is_empty(c->bf)) return NULL; - if(word_get_relative_index(stack_top(c->st)) == 0) return NULL; + if(word_get_index(stack_top(c->st)) == 0) return NULL; copy = config_copy(c); depset_add(copy->ds, word_buffer_b0(copy->bf), label, stack_top(copy->st)); diff --git a/maca_trans_parser/src/movement_parser_arc_eager.c b/maca_trans_parser/src/movement_parser_arc_eager.c index 56beba1d06b79bd3b91f19a8cbfa3125b630e9b1..0481cb0296e7ba7ef39d400d1d5e22a6eabe41e6 100644 --- a/maca_trans_parser/src/movement_parser_arc_eager.c +++ b/maca_trans_parser/src/movement_parser_arc_eager.c @@ -35,29 +35,22 @@ int movement_label(int mvt) int movement_left_arc(config *c, int label, float score) { - - if(stack_height(c->st) < 2) return 0; /* the dummy word cannot be a dependent */ - if(word_buffer_is_empty(c->bf)) return 0; - - /* word on top of the stack should not have a governor */ - /* printf("index word top of stack = %d\n", word_get_relative_index(stack_top(c->st))); */ - /* printf("word_get_gov_relative_index(stack_top(c->st)) = %d\n", word_get_gov_relative_index(stack_top(c->st))); */ - - if(word_get_gov_relative_index(stack_top(c->st)) != -1) return 0; - - word *gov = word_buffer_b0(c->bf); - word *dep = stack_top(c->st); - int dist = (word_get_relative_index(gov)) - 
(word_get_relative_index(dep)); + if(stack_height(config_get_stack(c)) < 2) return 0; /* the dummy word cannot be a dependent */ + if(word_buffer_is_empty(config_get_buffer(c))) return 0; + /* word on top of the stack should not have a governor */ + if(word_get_gov(stack_top(config_get_stack(c))) != 0) return 0; - printf("create left arc %d <- %d dist = %d\n", word_get_relative_index(dep), word_get_relative_index(gov), dist); + word *gov = word_buffer_b0(config_get_buffer(c)); + word *dep = stack_top(config_get_stack(c)); + int dist = (word_get_index(gov)) - (word_get_index(dep)); + /* printf("create left arc %d <- %d dist = %d\n", word_get_index(dep), word_get_index(gov), dist); */ /* create a new dependency */ word_set_gov(dep, dist); word_set_label(dep, label); - /* depset_add(c->ds, word_buffer_b0(c->bf), label, stack_top(c->st)); */ - stack_pop(c->st); + stack_pop(config_get_stack(c)); config_add_mvt(c, movement_left_code(label)); return 1; } @@ -65,45 +58,43 @@ int movement_left_arc(config *c, int label, float score) int movement_right_arc(config *c, int label, float score) { /* printf("RA "); */ - if(stack_is_empty(c->st)) return 0; - if(word_buffer_is_empty(c->bf)) return 0; + if(stack_is_empty(config_get_stack(c))) return 0; + if(word_buffer_is_empty(config_get_buffer(c))) return 0; - word *gov = stack_top(c->st); - word *dep = word_buffer_b0(c->bf); - int dist = (word_get_relative_index(gov)) - (word_get_relative_index(dep)); + word *gov = stack_top(config_get_stack(c)); + word *dep = word_buffer_b0(config_get_buffer(c)); + int dist = (word_get_index(gov)) - (word_get_index(dep)); - printf("create right arc %d -> %d dist = %d\n", word_get_relative_index(gov), word_get_relative_index(dep), dist); + /* printf("create right arc %d -> %d dist = %d\n", word_get_index(gov), word_get_index(dep), dist); */ /* create a new dependency */ word_set_gov(dep, dist); word_set_label(dep, label); - stack_push(c->st, word_buffer_b0(c->bf)); - word_buffer_move_right(c->bf); 
+ stack_push(config_get_stack(c), word_buffer_b0(config_get_buffer(c))); + word_buffer_move_right(config_get_buffer(c)); config_add_mvt(c, movement_right_code(label)); return 1; } - int movement_shift(config *c, int stream, float score) { - if(word_buffer_is_empty(c->bf)) return 0; - /* printf("SH\n"); */ - - stack_push(c->st, word_buffer_b0(c->bf)); - word_buffer_move_right(c->bf); - + if(word_buffer_is_empty(config_get_buffer(c))) return 0; + stack_push(config_get_stack(c), word_buffer_b0(config_get_buffer(c))); + word_buffer_move_right(config_get_buffer(c)); config_add_mvt(c, MVT_SHIFT); return 1; } int movement_reduce(config *c, float score) { - if(stack_is_empty(c->st)) return 0; - if(word_get_gov(stack_top(c->st)) == 0) return 0; /* word on top of stack does not have a governor */ - stack_pop(c->st); + if(stack_is_empty(config_get_stack(c))) return 0; + /* word on top of stack must have a governor */ + if(word_get_gov(stack_top(config_get_stack(c))) == 0) return 0; + stack_pop(config_get_stack(c)); + config_add_mvt(c, MVT_REDUCE); return 1; } diff --git a/maca_trans_parser/src/movement_tagger.c b/maca_trans_parser/src/movement_tagger.c index 3ac695a1e63c5336b9745560c04e985fb5001f5d..5d33788a4b0fe791cdda53e07d30cd22c71d1fc6 100644 --- a/maca_trans_parser/src/movement_tagger.c +++ b/maca_trans_parser/src/movement_tagger.c @@ -6,7 +6,7 @@ int movement_tagger(config *c, int postag, float score, int stream) { - if(word_buffer_is_empty(c->bf)) return 0; + if(word_buffer_is_last(c->bf)) return 0; word_set_pos(word_buffer_b0(c->bf), postag); word_buffer_move_right(c->bf); diff --git a/maca_trans_parser/src/oracle_parser.c b/maca_trans_parser/src/oracle_parser.c index 418f422bb4500ab6bf642300fe9b6c4d809f1180..6f422d9352474b073beae348254e801dd3654a56 100644 --- a/maca_trans_parser/src/oracle_parser.c +++ b/maca_trans_parser/src/oracle_parser.c @@ -9,11 +9,11 @@ int check_all_dependents_of_word_in_ref_are_in_hyp(config *c, sentence *ref, int int dep; for(dep=0; dep < 
ref->length; dep++){ - if(word_get_gov_relative_index(ref->words[dep]) == word_index){ /* found a dependent of word in ref */ + if(word_get_gov_index(ref->words[dep]) == word_index){ /* found a dependent of word in ref */ /* look for a dependency in hyp such that its dependent is dep */ if((dep >= c->ds->length) || (c->ds->array[dep].gov == NULL) - || (word_get_relative_index(c->ds->array[dep].gov) != word_index) + || (word_get_index(c->ds->array[dep].gov) != word_index) || (c->ds->array[dep].label != word_get_label(ref->words[dep]))) return 0; } @@ -29,22 +29,22 @@ int oracle_parser(config *c, sentence *ref) if(!stack_is_empty(c->st) && !word_buffer_is_empty(c->bf)){ s0 = stack_top(c->st); - s0_index = word_get_relative_index(s0); + s0_index = word_get_index(s0); b0 = word_buffer_b0(c->bf); - b0_index = word_get_relative_index(b0); + b0_index = word_get_index(b0); /* printf("s0 = %d b0 = %d\n", s0_index, b0_index); */ - /*printf("dans ref gov de %d = %d\n", s0_index, word_get_gov_relative_index(ref->words[s0_index])); - printf("dans ref gov de %d = %d\n", b0_index, word_get_gov_relative_index(ref->words[b0_index])); */ + /*printf("dans ref gov de %d = %d\n", s0_index, word_get_gov_index(ref->words[s0_index])); + printf("dans ref gov de %d = %d\n", b0_index, word_get_gov_index(ref->words[b0_index])); */ /* LEFT ARC b0 is the governor and s0 the dependent */ - if(word_get_gov_relative_index(ref->words[s0_index]) == b0_index) + if(word_get_gov_index(ref->words[s0_index]) == b0_index) return movement_left_code(word_get_label(ref->words[s0_index])); /* RIGHT ARC s0 is the governor and b0 the dependent */ - if((word_get_gov_relative_index(ref->words[b0_index]) == s0_index) + if((word_get_gov_index(ref->words[b0_index]) == s0_index) && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, b0_index)){ return movement_right_code(word_get_label(ref->words[b0_index])); } diff --git a/maca_trans_parser/src/oracle_parser_arc_eager.c 
b/maca_trans_parser/src/oracle_parser_arc_eager.c index a24442bf116d7df7ed426a1f45ae4a091ae565d6..082130ad311cbdda948874d56d89545170689b8e 100644 --- a/maca_trans_parser/src/oracle_parser_arc_eager.c +++ b/maca_trans_parser/src/oracle_parser_arc_eager.c @@ -4,25 +4,25 @@ #include"word_buffer.h" #include"movement_parser_arc_eager.h" -int check_all_dependents_of_word_in_ref_are_in_hyp(config *c, word_buffer *ref, int word_index) +int check_all_dependents_of_word_in_ref_are_in_hyp(config *c, word_buffer *ref, int word_index, int start_sentence_index) { int dep; int gov_ref; int gov_hyp; - - for(dep=1; dep < ref->nbelem; dep++){ - gov_ref = word_get_gov_relative_index(word_buffer_get_word_n(ref, dep)); + int max = ((start_sentence_index + 500) > ref->nbelem)? ref->nbelem : (start_sentence_index + 500); + for(dep=start_sentence_index; dep < max; dep++){ + gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); if(gov_ref == word_index){ /* found a dependent of word in ref */ /* look for a dependency in hyp such that its dependent is dep */ - printf("found a dep of word %d in ref, it is %d\n", word_index, dep); - gov_hyp = word_get_gov_relative_index(word_buffer_get_word_n(config_get_buffer(c), dep)); + /* printf("found a dep of word %d in ref, it is %d\n", word_index, dep); */ + gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); - printf("gov of %d in hyp is %d\n", dep,gov_hyp); + /* printf("gov of %d in hyp is %d\n", dep,gov_hyp); */ if(gov_hyp != gov_ref) return 0; /* if((dep >= c->ds->length) || (c->ds->array[dep].gov == NULL) - || (word_get_relative_index(c->ds->array[dep].gov) != word_index) + || (word_get_index(c->ds->array[dep].gov) != word_index) || (c->ds->array[dep].label != word_get_label(ref->words[dep]))) return 0;*/ } @@ -30,21 +30,21 @@ int check_all_dependents_of_word_in_ref_are_in_hyp(config *c, word_buffer *ref, return 1; } -int oracle_parser_arc_eager(config *c, word_buffer *ref) +int 
oracle_parser_arc_eager(config *c, word_buffer *ref, int start_sentence_index) { word *s0; /* word on top of stack */ word *b0; /* next word in the bufer */ int s0_index, b0_index; int s0_gov_index, b0_gov_index; - if(!stack_is_empty(c->st) && !word_buffer_is_empty(c->bf)){ - s0 = stack_top(c->st); - s0_index = word_get_relative_index(s0); - s0_gov_index = word_get_gov_relative_index(word_buffer_get_word_n(ref, s0_index)); + if(!stack_is_empty(config_get_stack(c)) && !word_buffer_is_empty(config_get_buffer(c))){ + s0 = stack_top(config_get_stack(c)); + s0_index = word_get_index(s0); + s0_gov_index = word_get_gov_index(word_buffer_get_word_n(ref, s0_index)); - b0 = word_buffer_b0(c->bf); - b0_index = word_get_relative_index(b0); - b0_gov_index = word_get_gov_relative_index(word_buffer_get_word_n(ref, b0_index)); + b0 = word_buffer_b0(config_get_buffer(c)); + b0_index = word_get_index(b0); + b0_gov_index = word_get_gov_index(word_buffer_get_word_n(ref, b0_index)); /* printf("s0_index = %d b0_index = %d\n", s0_index, b0_index); printf("dans ref gov de s0 (%d) = %d\n", s0_index, s0_gov_index); @@ -52,22 +52,17 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref) /* LEFT ARC b0 is the governor and s0 the dependent */ if(s0_gov_index == b0_index){ - - /* printf("oracle says left\n"); */ return movement_left_code(word_get_label(word_buffer_get_word_n(ref, s0_index))); } /* RIGHT ARC s0 is the governor and b0 the dependent */ if(b0_gov_index == s0_index){ - /* printf("oracle says right\n"); */ return movement_right_code(word_get_label(word_buffer_get_word_n(ref, b0_index))); } /* REDUCE */ - - printf("all dep in ref are in hyp = %d\n", check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index)); - if((stack_height(c->st) > 2) - && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) - && (word_get_gov(stack_top(c->st)) != 0)) /* word on top of the stack has a goveror */ + if((stack_height(config_get_stack(c)) > 2) + && 
check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index, start_sentence_index) + && (word_get_gov(stack_top(config_get_stack(c))) != 0)) /* word on top of the stack has a goveror */ { return MVT_REDUCE; } @@ -75,4 +70,5 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref) /* SHIFT */ return MVT_SHIFT; } + return -1; } diff --git a/maca_trans_parser/src/oracle_parser_arc_eager.h b/maca_trans_parser/src/oracle_parser_arc_eager.h index 1aad136a6348a0f5aae413f5e2142a6454628a45..4f7568293d6636c10cbbc417adf6c1ed12b90042 100644 --- a/maca_trans_parser/src/oracle_parser_arc_eager.h +++ b/maca_trans_parser/src/oracle_parser_arc_eager.h @@ -6,6 +6,6 @@ #include"word_buffer.h" -int oracle_parser_arc_eager(config *c, word_buffer *ref); +int oracle_parser_arc_eager(config *c, word_buffer *ref, int start_sentence_index); #endif diff --git a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c index 599739a6f1fe4132289e1a479c8b593dfc042e08..4e487004e5f2e21caa9725a04cd1c5630442fd20 100644 --- a/maca_trans_parser/src/queue.c +++ b/maca_trans_parser/src/queue.c @@ -7,7 +7,7 @@ int queue_renumber_words(queue *bf) int i; int index = 1; for(i=0; i < bf->nbelem; i++){ - word_set_relative_index(queue_elt_n(bf, i), index++); + word_set_index(queue_elt_n(bf, i), index++); } return index; } @@ -23,7 +23,7 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct) /* fprintf(stderr, "%s", buffer); */ if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); - word_set_relative_index(w, index); + word_set_index(w, index); index++; queue_add(bf, w); if(word_is_eos(w, mcd_struct)) break; @@ -43,12 +43,12 @@ void queue_print(FILE *f, queue *q) fprintf(f, "("); if(q->tail >= q->head) for(i=q->head; i < q->tail; i++) - fprintf(f, "%d ", word_get_relative_index(q->array[i])); + fprintf(f, "%d ", word_get_index(q->array[i])); else{ for(i=q->head; i < q->size; i++) - fprintf(f, "%d ", 
word_get_relative_index(q->array[i])); + fprintf(f, "%d ", word_get_index(q->array[i])); for(i=0; i < q->tail; i++) - fprintf(f, "%d ", word_get_relative_index(q->array[i])); + fprintf(f, "%d ", word_get_index(q->array[i])); } fprintf(f, ")\n"); } diff --git a/maca_trans_parser/src/simple_decoder_parser.c b/maca_trans_parser/src/simple_decoder_parser.c index b71f307ea6afb0851b0d8b2980d06abeb9984ff5..81aea380973f88da091b72888d9a52c6a9b250e5 100644 --- a/maca_trans_parser/src/simple_decoder_parser.c +++ b/maca_trans_parser/src/simple_decoder_parser.c @@ -10,50 +10,6 @@ #include"feature_table.h" #include"dico.h" -#if 0 -void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_label) -{ - int mvt_code; - int mvt_type; - int mvt_label; - float max; - feat_vec *fv = feat_vec_new(feature_types_nb); - config *c = config_initial(f, ctx->mcd_struct, 0); - - /* read a sentence and put it in the buffer */ - while(word_buffer_read_sentence(c->bf) > 1){ - while(!config_is_terminal(c)){ - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - mvt_code = feature_table_argmax(fv, ft, &max); - mvt_type = movement_type(mvt_code); - mvt_label = movement_label(mvt_code); - - if(mvt_type == MVT_LEFT) - if(movement_left_arc(c, mvt_label, max)) - continue; - - if(mvt_type == MVT_RIGHT) - if(movement_right_arc(c, mvt_label, max)) - continue; - - movement_shift(c, 0, max); - } - - /* config_print(stdout, c); */ - - config_connect_subtrees(c, root_label); - depset_print2(stdout, c->ds, ctx->dico_labels); - - - /* config_free(c); */ - c = config_initial(f, ctx->mcd_struct, 0); - } - - feat_vec_free(fv); -} - -#endif - void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_label) { int mvt_code; @@ -62,74 +18,64 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la float max; feat_vec *fv = feat_vec_new(feature_types_nb); config *c = NULL; - + word *dep; + c = config_initial(f, 
ctx->mcd_struct, 5); - while(!config_is_terminal(c)){ - /* config_print(stdout, c); */ + while(1){ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - /* feat_vec_print(stdout, fv); */ mvt_code = feature_table_argmax(fv, ft, &max); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); - - /* printf("code predicted = %d\n", mvt_code); */ - /* movement_print(stdout, mvt_code, ctx->dico_labels); */ - /* sentence is complete */ - if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ - /* if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ */ - /* if(mvt_label == root_label){ */ - /* printf("sentence complete\n"); */ - /*config_print(stdout, c); */ - - /* create the root arc */ - movement_right_arc(c, mvt_label, 0); - - /* shift dummy word in stack */ - stack_pop(config_get_stack(c)); - - /* config_print(stdout, c); */ - - /* config_connect_subtrees(c, root_label); */ - /* depset_print_new_index(stdout, c->ds, ctx->dico_labels); */ - - depset_print2(stdout, c->ds, ctx->dico_labels); - - /* pop the dummy word */ - stack_pop(c->st); - /* remplace it with a fresh one */ - stack_push(c->st, word_create_dummy(ctx->mcd_struct)); - - /* empty depset */ - depset_free(c->ds); - c->ds = depset_new(); - - /* renumber the words that are left in the buffer */ - /* c->current_index = queue_renumber_words(c->bf); */ - continue; - } - - if(mvt_type == MVT_LEFT) - if(movement_left_arc(c, mvt_label, max)) + /* config_print(stdout, c); */ + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ + + if(mvt_type == MVT_LEFT){ + dep = stack_s0(config_get_stack(c)); + if(movement_left_arc(c, mvt_label, max)){ + /* printf("%d\t", word_get_index(dep)); + printf("%s\t", word_get_input(dep)); + printf("%d\t", word_get_gov(dep)); + printf("%s\n", dico_int2string(ctx->dico_labels, word_get_label(dep)));*/ continue; + } + } - if(mvt_type == MVT_RIGHT) - if(movement_right_arc(c, mvt_label, max)) + 
if(mvt_type == MVT_RIGHT){ + dep = word_buffer_b0(config_get_buffer(c)); + if(movement_right_arc(c, mvt_label, max)){ + /* printf("%d\t", word_get_index(dep)); + printf("%s\t", word_get_input(dep)); + printf("%d\t", word_get_gov(dep)); + printf("%s\n", dico_int2string(ctx->dico_labels, word_get_label(dep)));*/ continue; + } + } if(mvt_type == MVT_REDUCE) if(movement_reduce(c, max)) continue; movement_shift(c, 1, max); + + if(word_buffer_is_last(config_get_buffer(c))) break; } - - /* config_print(stdout, c); */ - - /* config_connect_subtrees(c, root_label); */ - - depset_print2(stdout, c->ds, ctx->dico_labels); + for(int i=1; i < config_get_buffer(c)->nbelem; i++){ + dep = word_buffer_get_word_n(config_get_buffer(c), i); + printf("%s\t", word_get_input(dep)); + printf("%d\t", word_get_gov(dep)); + /* printf("label = %d\n", word_get_label(dep)); */ + char *label = (word_get_label(dep) == -1)? NULL : dico_int2string(ctx->dico_labels, word_get_label(dep)); + if(label != NULL) + printf("%s\t", label) ; + else + printf("_\t"); + if((label != NULL) && !strcmp(label, "eos")) + printf("1\n"); + else + printf("0\n"); + } /* config_free(c); */ feat_vec_free(fv); @@ -145,16 +91,10 @@ void simple_decoder(context *ctx) root_label = dico_string2int(ctx->dico_labels, ctx->root_label); if(root_label == -1) root_label = 0; - simple_decoder_stream(ctx, f, ft, root_label); - /* - if(ctx->stream_mode) - simple_decoder_stream(ctx, f, ft, root_label); - else - simple_decoder_buffer(ctx, f, ft, root_label); - */ + simple_decoder_stream(ctx, f, ft, root_label); + feature_table_free(ft); if(ctx->input_filename) fclose(f); } - diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c index a145fee8ef889546f71fb55375ecf88bf6d265d8..9e3341c362a39892648f55cbf32b665856cdff48 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -85,9 +85,10 @@ void simple_decoder_stream(context *ctx) 
float max; word *w; dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); - - c = config_initial_no_dummy_word(f, ctx->mcd_struct, 5); - while(!config_is_terminal(c)){ + int res; + c = config_initial_no_dummy_word(f, ctx->mcd_struct, 5); + + while(1){ if(ctx->f2p) add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); /* config_print(stdout, c); */ @@ -100,10 +101,11 @@ void simple_decoder_stream(context *ctx) w = word_buffer_b0(c->bf); printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag)); - if(postag != -1) - movement_tagger(c, postag, max, 1); - - + res = movement_tagger(c, postag, max, 1); + + /* printf(" current index = %d nb elem = %d\n", c->bf->current_index, c->bf->nbelem); */ + + if(res == 0) break; } /* config_print(stdout, c); */ diff --git a/maca_trans_parser/src/stack.c b/maca_trans_parser/src/stack.c index 24ed623226c1272f39cdfb60efb623bd351b0b0c..cbd419e9c496e6e6cc6c89b3616d0f21ea6a161d 100644 --- a/maca_trans_parser/src/stack.c +++ b/maca_trans_parser/src/stack.c @@ -93,7 +93,7 @@ void stack_print(FILE *buffer, stack *s) if(s){ fprintf(buffer, "["); for(i=0; i < stack_height(s); i++) - fprintf(buffer, " %d", word_get_relative_index(s->array[i])); + fprintf(buffer, " %d", word_get_index(s->array[i])); fprintf(buffer, "]"); } }