diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt index 394e71f18c2252e2cb296a2e73c3804d2e528769..4e79f08cafb365ece442ac6006b250e730bd393d 100644 --- a/maca_common/CMakeLists.txt +++ b/maca_common/CMakeLists.txt @@ -4,7 +4,6 @@ set(SOURCES src/util.c src/word_emb.c src/mcd.c src/dico_vec.c - src/feat_types.c src/form2pos.c src/word.c src/sentence.c diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 1186de181b049137a32d620412d79ef2ddf92e90..ba4b6fb14d0fe34b564b908a148fd92c4593d554 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -6,19 +6,91 @@ #define MCD_REPRESENTATION_VOCAB 2 #define MCD_REPRESENTATION_INT 3 - #define MCD_INVALID_VALUE -1 +#define MCD_WF_NB 36 + +#define MCD_WF_INDEX 0 +#define MCD_WF_FORM 1 +#define MCD_WF_LEMMA 2 +#define MCD_WF_CPOS 3 +#define MCD_WF_POS 4 +#define MCD_WF_FEATS 5 +#define MCD_WF_GOV 6 +#define MCD_WF_LABEL 7 +#define MCD_WF_STAG 8 +#define MCD_WF_SENT_SEG 9 +#define MCD_WF_A 10 +#define MCD_WF_B 11 +#define MCD_WF_C 12 +#define MCD_WF_D 13 +#define MCD_WF_E 14 +#define MCD_WF_F 15 +#define MCD_WF_G 16 +#define MCD_WF_H 17 +#define MCD_WF_I 18 +#define MCD_WF_J 19 +#define MCD_WF_K 20 +#define MCD_WF_L 21 +#define MCD_WF_M 22 +#define MCD_WF_N 23 +#define MCD_WF_O 24 +#define MCD_WF_P 25 +#define MCD_WF_Q 26 +#define MCD_WF_R 27 +#define MCD_WF_S 28 +#define MCD_WF_T 29 +#define MCD_WF_U 30 +#define MCD_WF_V 31 +#define MCD_WF_W 32 +#define MCD_WF_X 33 +#define MCD_WF_Y 34 +#define MCD_WF_Z 35 #include "dico.h" -#include "feat_types.h" #include "word_emb.h" #include "dico_vec.h" -#define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL] +#define mcd_get_dico_label(m) (m)->dico_array[MCD_WF_LABEL] + +#define mcd_get_index_col(m) (m)->wf2col[MCD_WF_INDEX] +#define mcd_get_form_col(m) (m)->wf2col[MCD_WF_FORM] +#define mcd_get_lemma_col(m) (m)->wf2col[MCD_WF_LEMMA] +#define mcd_get_cpos_col(m) (m)->wf2col[MCD_WF_CPOS] +#define mcd_get_pos_col(m) 
(m)->wf2col[MCD_WF_POS] +#define mcd_get_feats_col(m) (m)->wf2col[MCD_WF_FEATS] +#define mcd_get_gov_col(m) (m)->wf2col[MCD_WF_GOV] +#define mcd_get_label_col(m) (m)->wf2col[MCD_WF_LABEL] +#define mcd_get_stag_col(m) (m)->wf2col[MCD_WF_STAG] +#define mcd_get_sent_seg_col(m) (m)->wf2col[MCD_WF_SENT_SEG] +#define mcd_get_a_col(m) (m)->wf2col[MCD_WF_A] +#define mcd_get_b_col(m) (m)->wf2col[MCD_WF_B] +#define mcd_get_c_col(m) (m)->wf2col[MCD_WF_C] +#define mcd_get_d_col(m) (m)->wf2col[MCD_WF_D] +#define mcd_get_e_col(m) (m)->wf2col[MCD_WF_E] +#define mcd_get_f_col(m) (m)->wf2col[MCD_WF_F] +#define mcd_get_g_col(m) (m)->wf2col[MCD_WF_G] +#define mcd_get_h_col(m) (m)->wf2col[MCD_WF_H] +#define mcd_get_i_col(m) (m)->wf2col[MCD_WF_I] +#define mcd_get_j_col(m) (m)->wf2col[MCD_WF_J] +#define mcd_get_k_col(m) (m)->wf2col[MCD_WF_K] +#define mcd_get_l_col(m) (m)->wf2col[MCD_WF_L] +#define mcd_get_m_col(m) (m)->wf2col[MCD_WF_M] +#define mcd_get_n_col(m) (m)->wf2col[MCD_WF_N] +#define mcd_get_o_col(m) (m)->wf2col[MCD_WF_O] +#define mcd_get_p_col(m) (m)->wf2col[MCD_WF_P] +#define mcd_get_q_col(m) (m)->wf2col[MCD_WF_Q] +#define mcd_get_r_col(m) (m)->wf2col[MCD_WF_R] +#define mcd_get_s_col(m) (m)->wf2col[MCD_WF_S] +#define mcd_get_t_col(m) (m)->wf2col[MCD_WF_T] +#define mcd_get_u_col(m) (m)->wf2col[MCD_WF_U] +#define mcd_get_v_col(m) (m)->wf2col[MCD_WF_V] +#define mcd_get_w_col(m) (m)->wf2col[MCD_WF_W] +#define mcd_get_x_col(m) (m)->wf2col[MCD_WF_X] +#define mcd_get_y_col(m) (m)->wf2col[MCD_WF_Y] +#define mcd_get_z_col(m) (m)->wf2col[MCD_WF_Z] -#define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM] -#define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v) +#define mcd_set_form_col(m, v) (m)->wf2col[MCD_WF_FORM] = (v) /* writes the same wf2col slot that mcd_get_form_col reads */ /* mcd (multi column description) files describe the format of corpus files */ /* every line of an mcd file describes the content of a column of the corpus file */ @@ -35,10 +107,9 @@ typedef struct { int nb_col; /* number of columns in the mcd file */ - int 
type2col[FEAT_TYPE_NB]; /* in which column is represented is the form (FEAT_TYPE_FORM) lemma ... represented */ - /* int *col2type; */ - int *type; /* array containing the type of every column */ - char **type_str; /* a string version of array type */ + int wf2col[MCD_WF_NB]; /* in which column are the word features (MCD_WF_FORM, MCD_WF_LEMMA ...) represented */ + int *wf; /* array containing the word feature that correspond to each column */ + char **wf_str; /* a string version of array word feature */ int *representation; /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */ char **filename; /* array containing the file in which the different values for a columnn is represented */ dico **dico_array; /* array containing the dico corresponding to each column (NULL if no file) */ @@ -54,5 +125,6 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename); void mcd_free(mcd *m); int mcd_get_code(mcd *m, char *str, int col); dico_vec *mcd_build_dico_vec(mcd *mcd_struct); +int mcd_wf_code(char *wf); #endif diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 2ecd333d6ca52762184aceb58d7079d0c67e4b9f..ba567bef034a027f458a821837d2744befec905d 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -3,88 +3,96 @@ #include "mcd.h" -#define word_get_index(w) (w)->feat_array[FEAT_TYPE_INDEX] -#define word_get_form(w) (w)->feat_array[FEAT_TYPE_FORM] -#define word_get_lemma(w) (w)->feat_array[FEAT_TYPE_LEMMA] -#define word_get_cpos(w) (w)->feat_array[FEAT_TYPE_CPOS] -#define word_get_pos(w) (w)->feat_array[FEAT_TYPE_POS] -#define word_get_feats(w) (w)->feat_array[FEAT_TYPE_FEATS] -#define word_get_gov(w) (w)->feat_array[FEAT_TYPE_GOV] -#define word_get_label(w) (w)->feat_array[FEAT_TYPE_LABEL] -#define word_get_stag(w) (w)->feat_array[FEAT_TYPE_STAG] -#define word_get_A(w) (w)->feat_array[FEAT_TYPE_A] -#define word_get_B(w) (w)->feat_array[FEAT_TYPE_B] -#define word_get_C(w) 
(w)->feat_array[FEAT_TYPE_C] -#define word_get_D(w) (w)->feat_array[FEAT_TYPE_D] -#define word_get_E(w) (w)->feat_array[FEAT_TYPE_E] -#define word_get_F(w) (w)->feat_array[FEAT_TYPE_F] -#define word_get_G(w) (w)->feat_array[FEAT_TYPE_G] -#define word_get_H(w) (w)->feat_array[FEAT_TYPE_H] -#define word_get_I(w) (w)->feat_array[FEAT_TYPE_I] -#define word_get_J(w) (w)->feat_array[FEAT_TYPE_J] -#define word_get_K(w) (w)->feat_array[FEAT_TYPE_K] -#define word_get_L(w) (w)->feat_array[FEAT_TYPE_L] -#define word_get_M(w) (w)->feat_array[FEAT_TYPE_M] -#define word_get_N(w) (w)->feat_array[FEAT_TYPE_N] -#define word_get_O(w) (w)->feat_array[FEAT_TYPE_O] -#define word_get_P(w) (w)->feat_array[FEAT_TYPE_P] -#define word_get_Q(w) (w)->feat_array[FEAT_TYPE_Q] -#define word_get_R(w) (w)->feat_array[FEAT_TYPE_R] -#define word_get_S(w) (w)->feat_array[FEAT_TYPE_S] -#define word_get_T(w) (w)->feat_array[FEAT_TYPE_T] -#define word_get_U(w) (w)->feat_array[FEAT_TYPE_U] -#define word_get_V(w) (w)->feat_array[FEAT_TYPE_V] -#define word_get_W(w) (w)->feat_array[FEAT_TYPE_W] -#define word_get_X(w) (w)->feat_array[FEAT_TYPE_X] -#define word_get_Y(w) (w)->feat_array[FEAT_TYPE_Y] -#define word_get_Z(w) (w)->feat_array[FEAT_TYPE_Z] +#define word_get_index(w) (w)->wf_array[MCD_WF_INDEX] +#define word_get_form(w) (w)->wf_array[MCD_WF_FORM] +#define word_get_lemma(w) (w)->wf_array[MCD_WF_LEMMA] +#define word_get_cpos(w) (w)->wf_array[MCD_WF_CPOS] +#define word_get_pos(w) (w)->wf_array[MCD_WF_POS] +#define word_get_feats(w) (w)->wf_array[MCD_WF_FEATS] +#define word_get_gov(w) (w)->wf_array[MCD_WF_GOV] +#define word_get_label(w) (w)->wf_array[MCD_WF_LABEL] +#define word_get_stag(w) (w)->wf_array[MCD_WF_STAG] +#define word_get_sent_seg(w) (w)->wf_array[MCD_WF_SENT_SEG] +#define word_get_A(w) (w)->wf_array[MCD_WF_A] +#define word_get_B(w) (w)->wf_array[MCD_WF_B] +#define word_get_C(w) (w)->wf_array[MCD_WF_C] +#define word_get_D(w) (w)->wf_array[MCD_WF_D] +#define word_get_E(w) 
(w)->wf_array[MCD_WF_E] +#define word_get_F(w) (w)->wf_array[MCD_WF_F] +#define word_get_G(w) (w)->wf_array[MCD_WF_G] +#define word_get_H(w) (w)->wf_array[MCD_WF_H] +#define word_get_I(w) (w)->wf_array[MCD_WF_I] +#define word_get_J(w) (w)->wf_array[MCD_WF_J] +#define word_get_K(w) (w)->wf_array[MCD_WF_K] +#define word_get_L(w) (w)->wf_array[MCD_WF_L] +#define word_get_M(w) (w)->wf_array[MCD_WF_M] +#define word_get_N(w) (w)->wf_array[MCD_WF_N] +#define word_get_O(w) (w)->wf_array[MCD_WF_O] +#define word_get_P(w) (w)->wf_array[MCD_WF_P] +#define word_get_Q(w) (w)->wf_array[MCD_WF_Q] +#define word_get_R(w) (w)->wf_array[MCD_WF_R] +#define word_get_S(w) (w)->wf_array[MCD_WF_S] +#define word_get_T(w) (w)->wf_array[MCD_WF_T] +#define word_get_U(w) (w)->wf_array[MCD_WF_U] +#define word_get_V(w) (w)->wf_array[MCD_WF_V] +#define word_get_W(w) (w)->wf_array[MCD_WF_W] +#define word_get_X(w) (w)->wf_array[MCD_WF_X] +#define word_get_Y(w) (w)->wf_array[MCD_WF_Y] +#define word_get_Z(w) (w)->wf_array[MCD_WF_Z] #define word_get_signature(w) (w)->signature -#define word_set_index(w, val) (w)->feat_array[FEAT_TYPE_INDEX] = (val) -#define word_set_form(w, val) (w)->feat_array[FEAT_TYPE_FORM] = (val) -#define word_set_lemma(w, val) (w)->feat_array[FEAT_TYPE_LEMMA] = (val) -#define word_set_cpos(w, val) (w)->feat_array[FEAT_TYPE_CPOS] = (val) -#define word_set_pos(w, val) (w)->feat_array[FEAT_TYPE_POS] = (val) -#define word_set_feats(w, val) (w)->feat_array[FEAT_TYPE_FEATS] = (val) -#define word_set_gov(w, val) (w)->feat_array[FEAT_TYPE_GOV] = (val) -#define word_set_label(w, val) (w)->feat_array[FEAT_TYPE_LABEL] = (val) -#define word_set_stag(w, val) (w)->feat_array[FEAT_TYPE_STAG] = (val) -#define word_set_A(w, val) (w)->feat_array[FEAT_TYPE_A] = (val) -#define word_set_B(w, val) (w)->feat_array[FEAT_TYPE_B] = (val) -#define word_set_C(w, val) (w)->feat_array[FEAT_TYPE_C] = (val) -#define word_set_D(w, val) (w)->feat_array[FEAT_TYPE_D] = (val) -#define word_set_E(w, val) 
(w)->feat_array[FEAT_TYPE_E] = (val) -#define word_set_F(w, val) (w)->feat_array[FEAT_TYPE_F] = (val) -#define word_set_G(w, val) (w)->feat_array[FEAT_TYPE_G] = (val) -#define word_set_H(w, val) (w)->feat_array[FEAT_TYPE_H] = (val) -#define word_set_I(w, val) (w)->feat_array[FEAT_TYPE_I] = (val) -#define word_set_J(w, val) (w)->feat_array[FEAT_TYPE_J] = (val) -#define word_set_K(w, val) (w)->feat_array[FEAT_TYPE_K] = (val) -#define word_set_L(w, val) (w)->feat_array[FEAT_TYPE_L] = (val) -#define word_set_M(w, val) (w)->feat_array[FEAT_TYPE_M] = (val) -#define word_set_N(w, val) (w)->feat_array[FEAT_TYPE_N] = (val) -#define word_set_O(w, val) (w)->feat_array[FEAT_TYPE_O] = (val) -#define word_set_P(w, val) (w)->feat_array[FEAT_TYPE_P] = (val) -#define word_set_Q(w, val) (w)->feat_array[FEAT_TYPE_Q] = (val) -#define word_set_R(w, val) (w)->feat_array[FEAT_TYPE_R] = (val) -#define word_set_S(w, val) (w)->feat_array[FEAT_TYPE_S] = (val) -#define word_set_T(w, val) (w)->feat_array[FEAT_TYPE_T] = (val) -#define word_set_U(w, val) (w)->feat_array[FEAT_TYPE_U] = (val) -#define word_set_V(w, val) (w)->feat_array[FEAT_TYPE_V] = (val) -#define word_set_W(w, val) (w)->feat_array[FEAT_TYPE_W] = (val) -#define word_set_X(w, val) (w)->feat_array[FEAT_TYPE_X] = (val) +#define word_set_index(w, val) (w)->wf_array[MCD_WF_INDEX] = (val) +#define word_set_form(w, val) (w)->wf_array[MCD_WF_FORM] = (val) +#define word_set_lemma(w, val) (w)->wf_array[MCD_WF_LEMMA] = (val) +#define word_set_cpos(w, val) (w)->wf_array[MCD_WF_CPOS] = (val) +#define word_set_pos(w, val) (w)->wf_array[MCD_WF_POS] = (val) +#define word_set_feats(w, val) (w)->wf_array[MCD_WF_FEATS] = (val) +#define word_set_gov(w, val) (w)->wf_array[MCD_WF_GOV] = (val) +#define word_set_label(w, val) (w)->wf_array[MCD_WF_LABEL] = (val) +#define word_set_stag(w, val) (w)->wf_array[MCD_WF_STAG] = (val) +#define word_set_sent_seg(w, val) (w)->wf_array[MCD_WF_SENT_SEG] = (val) /* setter counterpart of word_get_sent_seg */ +#define word_set_A(w, val) (w)->wf_array[MCD_WF_A] = (val) 
+#define word_set_B(w, val) (w)->wf_array[MCD_WF_B] = (val) +#define word_set_C(w, val) (w)->wf_array[MCD_WF_C] = (val) +#define word_set_D(w, val) (w)->wf_array[MCD_WF_D] = (val) +#define word_set_E(w, val) (w)->wf_array[MCD_WF_E] = (val) +#define word_set_F(w, val) (w)->wf_array[MCD_WF_F] = (val) +#define word_set_G(w, val) (w)->wf_array[MCD_WF_G] = (val) +#define word_set_H(w, val) (w)->wf_array[MCD_WF_H] = (val) +#define word_set_I(w, val) (w)->wf_array[MCD_WF_I] = (val) +#define word_set_J(w, val) (w)->wf_array[MCD_WF_J] = (val) +#define word_set_K(w, val) (w)->wf_array[MCD_WF_K] = (val) +#define word_set_L(w, val) (w)->wf_array[MCD_WF_L] = (val) +#define word_set_M(w, val) (w)->wf_array[MCD_WF_M] = (val) +#define word_set_N(w, val) (w)->wf_array[MCD_WF_N] = (val) +#define word_set_O(w, val) (w)->wf_array[MCD_WF_O] = (val) +#define word_set_P(w, val) (w)->wf_array[MCD_WF_P] = (val) +#define word_set_Q(w, val) (w)->wf_array[MCD_WF_Q] = (val) +#define word_set_R(w, val) (w)->wf_array[MCD_WF_R] = (val) +#define word_set_S(w, val) (w)->wf_array[MCD_WF_S] = (val) +#define word_set_T(w, val) (w)->wf_array[MCD_WF_T] = (val) +#define word_set_U(w, val) (w)->wf_array[MCD_WF_U] = (val) +#define word_set_V(w, val) (w)->wf_array[MCD_WF_V] = (val) +#define word_set_W(w, val) (w)->wf_array[MCD_WF_W] = (val) +#define word_set_X(w, val) (w)->wf_array[MCD_WF_X] = (val) -#define word_set_Y(w, val) (w)->feat_array[FEAT_TYPE_Y] = (val) -#define word_set_Z(w, val) (w)->feat_array[FEAT_TYPE_Z] = (val) +#define word_set_Y(w, val) (w)->wf_array[MCD_WF_Y] = (val) +#define word_set_Z(w, val) (w)->wf_array[MCD_WF_Z] = (val) #define word_set_signature(w, val) (w)->signature = (val) +#define word_set_relative_index(w, val) (w)->relative_index = (val) +#define word_get_relative_index(w) (w)->relative_index + + + typedef struct _word { - int feat_array[FEAT_TYPE_NB]; /* array containing the codes corresponding to the different word features */ + int wf_array[MCD_WF_NB]; /* array containing 
the codes corresponding to the different word features */ char *input; /* the string corresponding to the actual line in the corpus file */ int U1; /* does the form begin with an uppercase character */ int signature; /* pos tags that this form can have (represented as a boolean string) */ int label; char *form; + int relative_index; } word; word *word_new(char *input); @@ -92,11 +100,13 @@ word *word_create_dummy(mcd *mcd_struct); word *word_copy(word *w); void word_free(word *w); +void word_print2(FILE *f, word *w); void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels); word *word_read(FILE *f, mcd *mcd_struct); word *word_parse_buffer(char *buffer, mcd *mcd_struct); - +int word_is_eos(word *w, mcd *mcd_struct); +int word_get_gov_relative_index(word *w); #endif diff --git a/maca_common/src/dico.c b/maca_common/src/dico.c index d47d030a65650066edf48652854764c64ce6bc27..04701e7e87fcc5ef0b01e67295f9d497dd522bc0 100644 --- a/maca_common/src/dico.c +++ b/maca_common/src/dico.c @@ -89,9 +89,8 @@ void dico_print_fh(FILE *f, dico *d) void dico_print(char *filename, dico *d) { FILE *f; - if(filename == NULL){ + if(filename == NULL) f = stdout; - } else{ f= fopen(filename, "w"); if(f == NULL){ @@ -100,7 +99,9 @@ void dico_print(char *filename, dico *d) } } dico_print_fh(f, d); - fclose(f); + + if(filename != NULL) + fclose(f); } int dico_add(dico *d, char *key) @@ -136,7 +137,9 @@ char *dico_int2string(dico *d, int val) int dico_string2int(dico *d, char *string) { - cell *c = hash_lookup(d->htable, string); + cell *c; + + c= hash_lookup(d->htable, string); if(c) return c->val; else @@ -162,6 +165,7 @@ dico *dico_extract_from_corpus(char *filename, int column, char *dico_name) column_nb = 0; do{ if(column_nb == column){ + /* printf("token = %s\n", token); */ dico_add(d, token); } column_nb++; diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index a6385d011f25ad46156e20ce415876c89cedd3ba..4065f23381c5d6ce3ee1bd00fc70c72bc0cb20cd 100644 --- 
a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -1,3 +1,4 @@ + #include<stdio.h> #include<stdlib.h> #include<string.h> @@ -7,27 +8,26 @@ #include "dico.h" #include "word_emb.h" - mcd *mcd_new(int nb_col) { mcd *m = (mcd *)memalloc(sizeof(mcd)); int i; m->nb_col = nb_col; - for(i=0; i < FEAT_TYPE_NB; i++) - m->type2col[i] = -1; + for(i=0; i < MCD_WF_NB; i++) + m->wf2col[i] = -1; m->representation = (int *) memalloc(nb_col * sizeof(int)); - m->type = (int *) memalloc(nb_col * sizeof(int)); - m->type_str = (char **) memalloc(nb_col * sizeof(char *)); + m->wf = (int *) memalloc(nb_col * sizeof(int)); + m->wf_str = (char **) memalloc(nb_col * sizeof(char *)); m->filename = (char **) memalloc(nb_col * sizeof(char *)); m->dico_array = (dico **) memalloc(nb_col * sizeof(dico *)); m->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *)); for(i=0; i < nb_col; i++){ m->representation[i] = MCD_REPRESENTATION_NULL; - m->type[i] = -1; - m->type_str[i] = NULL; + m->wf[i] = -1; + m->wf_str[i] = NULL; m->filename[i] = NULL; m->dico_array[i] = NULL; m->word_emb_array[i] = NULL;; @@ -41,14 +41,14 @@ void mcd_free(mcd *m) for(i=0; i < m->nb_col; i++){ if(m->dico_array[i]) dico_free(m->dico_array[i]); if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]); - if(m->type_str[i]) free(m->type_str[i]); + if(m->wf_str[i]) free(m->wf_str[i]); } free(m->representation); free(m->filename); free(m->dico_array); free(m->word_emb_array); - free(m->type_str); - free(m->type); + free(m->wf_str); + free(m->wf); free(m); } @@ -58,7 +58,7 @@ void mcd_free(mcd *m) int mcd_get_code(mcd *m, char *str, int col){ if(m->representation[col] == MCD_REPRESENTATION_VOCAB) - return dico_string2int(m->dico_array[col], str); + return (m->dico_array[col])? 
dico_string2int(m->dico_array[col], str) : -1; if(m->representation[col] == MCD_REPRESENTATION_EMB) return word_emb_get_code(m->word_emb_array[col], str); if(m->representation[col] == MCD_REPRESENTATION_INT) @@ -74,7 +74,7 @@ int mcd_max_column_index_in_file(char *mcd_filename) FILE *f = myfopen(mcd_filename, "r"); char buffer[1000]; /* ugly */ int column; - char type[100]; + char wf[100]; char representation[100]; char filename[500]; /* ugly */ int fields_number; @@ -84,7 +84,7 @@ int mcd_max_column_index_in_file(char *mcd_filename) line_number++; if(feof(f)) break; if((buffer[0] == '\n') || (buffer[0] == '#')) continue; - fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); + fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename); if(fields_number != 4){ fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); continue; @@ -106,8 +106,8 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename) if((m->representation[column] == MCD_REPRESENTATION_VOCAB) /* && (strcmp(m->filename[column], "_")) */ && (m->dico_array[column] == NULL)){ - m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, m->type_str[column]); - fprintf(stderr, "extracting dico %s from corpus\n", m->type_str[column]); + m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, m->wf_str[column]); + fprintf(stderr, "extracting dico %s \tfrom corpus\n", m->wf_str[column]); } } } @@ -123,8 +123,8 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose) if((m->representation[column] == MCD_REPRESENTATION_VOCAB) && (!strcmp(m->filename[column], "_")) && (m->dico_array[column] == NULL)){ - m->dico_array[column] = dico_vec_get_dico(vocabs, m->type_str[column]); - if(verbose) fprintf(stderr, "linking to dico %s\n", m->type_str[column]); + m->dico_array[column] = dico_vec_get_dico(vocabs, m->wf_str[column]); + if(verbose) fprintf(stderr, "linking to 
dico %s\n", m->wf_str[column]); } } } @@ -134,7 +134,7 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose) mcd *mcd_read(char *mcd_filename, int verbose) { int column; - char type[100]; + char wf[100]; char representation[100]; char filename[500]; /* ugly */ int fields_number; @@ -149,19 +149,20 @@ mcd *mcd_read(char *mcd_filename, int verbose) line_number++; if(feof(f)) break; if((buffer[0] == '\n') || (buffer[0] == '#')) continue; - fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); + fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename); if(fields_number != 4){ /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ continue; } - if(verbose) fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); - m->type[column] = feat_type_string2int(type); - m->type_str[column] = strdup(type); - if(m->type[column] == -1){ - fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename); + if(verbose) fprintf(stderr, "column = %d\tword feature = %s\trepresentation = %s\tfilename = %s\n", column, wf, representation, filename); + m->wf[column] = mcd_wf_code(wf); + m->wf_str[column] = strdup(wf); + if(m->wf[column] == -1){ + fprintf(stderr, "in line %d of mcd file %s invalid wf, I'm skipping it\n", line_number, mcd_filename); continue; } - m->type2col[m->type[column]] = column; + + m->wf2col[m->wf[column]] = column; if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL; else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB; @@ -185,6 +186,7 @@ mcd *mcd_read(char *mcd_filename, int verbose) } } } + fclose(f); return m; } @@ -194,53 +196,53 @@ mcd *mcd_read(char *mcd_filename, int verbose) mcd *mcd_build_conll07(void) { mcd *m = mcd_new(8); - m->type[0]=FEAT_TYPE_INDEX; - 
m->type_str[0]=strdup("INDEX"); + m->wf[0]=MCD_WF_INDEX; + m->wf_str[0]=strdup("INDEX"); m->representation[0]= MCD_REPRESENTATION_INT; m->filename[0] = strdup("_"); - m->type2col[FEAT_TYPE_INDEX] = 0; + m->wf2col[MCD_WF_INDEX] = 0; - m->type[1]=FEAT_TYPE_FORM; - m->type_str[1]=strdup("FORM"); + m->wf[1]=MCD_WF_FORM; + m->wf_str[1]=strdup("FORM"); m->representation[1]= MCD_REPRESENTATION_VOCAB; m->filename[1] = strdup("_"); - m->type2col[FEAT_TYPE_FORM] = 1; + m->wf2col[MCD_WF_FORM] = 1; - m->type[2]=FEAT_TYPE_LEMMA; - m->type_str[2]=strdup("LEMMA"); + m->wf[2]=MCD_WF_LEMMA; + m->wf_str[2]=strdup("LEMMA"); m->representation[2]= MCD_REPRESENTATION_VOCAB; m->filename[2] = strdup("_"); - m->type2col[FEAT_TYPE_LEMMA] = 2; + m->wf2col[MCD_WF_LEMMA] = 2; - m->type[3]=FEAT_TYPE_CPOS; - m->type_str[3]=strdup("CPOS"); + m->wf[3]=MCD_WF_CPOS; + m->wf_str[3]=strdup("CPOS"); m->representation[3]= MCD_REPRESENTATION_VOCAB; m->filename[3] = strdup("_"); - m->type2col[FEAT_TYPE_CPOS] = 3; + m->wf2col[MCD_WF_CPOS] = 3; - m->type[4]=FEAT_TYPE_POS; - m->type_str[4]=strdup("POS"); + m->wf[4]=MCD_WF_POS; + m->wf_str[4]=strdup("POS"); m->representation[4]= MCD_REPRESENTATION_VOCAB; m->filename[4] = strdup("_"); - m->type2col[FEAT_TYPE_POS] = 4; + m->wf2col[MCD_WF_POS] = 4; - m->type[5]=FEAT_TYPE_FEATS; - m->type_str[5]=strdup("FEATS"); + m->wf[5]=MCD_WF_FEATS; + m->wf_str[5]=strdup("FEATS"); m->representation[5]= MCD_REPRESENTATION_VOCAB; m->filename[5] = strdup("_"); - m->type2col[FEAT_TYPE_FEATS] = 5; + m->wf2col[MCD_WF_FEATS] = 5; - m->type[6]=FEAT_TYPE_GOV; - m->type_str[6]=strdup("GOV"); + m->wf[6]=MCD_WF_GOV; + m->wf_str[6]=strdup("GOV"); m->representation[6]= MCD_REPRESENTATION_INT; m->filename[6] = strdup("_"); - m->type2col[FEAT_TYPE_GOV] = 6; + m->wf2col[MCD_WF_GOV] = 6; - m->type[7]=FEAT_TYPE_LABEL; - m->type_str[7]=strdup("LABEL"); + m->wf[7]=MCD_WF_LABEL; + m->wf_str[7]=strdup("LABEL"); m->representation[7]= MCD_REPRESENTATION_VOCAB; m->filename[7] = strdup("_"); - 
m->type2col[FEAT_TYPE_LABEL] = 7; + m->wf2col[MCD_WF_LABEL] = 7; return m; } @@ -251,114 +253,45 @@ mcd *mcd_build_ifpls(void) { mcd *m = mcd_new(6); - m->type[0]=FEAT_TYPE_INDEX; - m->type_str[0]=strdup("INDEX"); + m->wf[0]=MCD_WF_INDEX; + m->wf_str[0]=strdup("INDEX"); m->representation[0]= MCD_REPRESENTATION_INT; m->filename[0] = strdup("_"); - m->type2col[FEAT_TYPE_INDEX] = 0; + m->wf2col[MCD_WF_INDEX] = 0; - m->type[1]=FEAT_TYPE_FORM; - m->type_str[1]=strdup("FORM"); + m->wf[1]=MCD_WF_FORM; + m->wf_str[1]=strdup("FORM"); m->representation[1]= MCD_REPRESENTATION_VOCAB; m->filename[1] = strdup("_"); - m->type2col[FEAT_TYPE_FORM] = 1; + m->wf2col[MCD_WF_FORM] = 1; - m->type[2]=FEAT_TYPE_POS; - m->type_str[2]=strdup("POS"); + m->wf[2]=MCD_WF_POS; + m->wf_str[2]=strdup("POS"); m->representation[2]= MCD_REPRESENTATION_VOCAB; m->filename[2] = strdup("_"); - m->type2col[FEAT_TYPE_POS] = 2; + m->wf2col[MCD_WF_POS] = 2; - m->type[3]=FEAT_TYPE_LEMMA; - m->type_str[3]=strdup("LEMMA"); + m->wf[3]=MCD_WF_LEMMA; + m->wf_str[3]=strdup("LEMMA"); m->representation[3]= MCD_REPRESENTATION_VOCAB; m->filename[3] = strdup("_"); - m->type2col[FEAT_TYPE_LEMMA] = 3; + m->wf2col[MCD_WF_LEMMA] = 3; - m->type[4]=FEAT_TYPE_GOV; - m->type_str[4]=strdup("GOV"); + m->wf[4]=MCD_WF_GOV; + m->wf_str[4]=strdup("GOV"); m->representation[4]= MCD_REPRESENTATION_INT; m->filename[4] = strdup("_"); - m->type2col[FEAT_TYPE_GOV] = 4; + m->wf2col[MCD_WF_GOV] = 4; - m->type[5]=FEAT_TYPE_LABEL; - m->type_str[5]=strdup("LABEL"); + m->wf[5]=MCD_WF_LABEL; + m->wf_str[5]=strdup("LABEL"); m->representation[5]= MCD_REPRESENTATION_VOCAB; m->filename[5] = strdup("_"); - m->type2col[FEAT_TYPE_LABEL] = 5; + m->wf2col[MCD_WF_LABEL] = 5; return m; } -mcd *mcd_read_old(char *mcd_filename, char *corpus_filename, dico_vec *vocabs) -{ - int column; - char type[100]; - char representation[100]; - char filename[500]; /* ugly */ - int fields_number; - int line_number = 0; - char buffer[1000]; /* ugly */ - int nb_col = 
mcd_max_column_index_in_file(mcd_filename); - mcd *m = mcd_new(nb_col + 1); - FILE *f = myfopen(mcd_filename, "r"); - /* int first = 1; */ - - while(fgets(buffer, 1000, f)){ - line_number++; - if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == '#')) continue; - fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); - if(fields_number != 4){ - /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ - continue; - } - fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); - m->type[column] = feat_type_string2int(type); - if(m->type[column] == -1){ - fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename); - continue; - } - m->type2col[m->type[column]] = column; - /* m->col2type[column] = m->type[column]; */ - if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL; - else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB; - else if(!strcmp(representation, "VOCAB")) m->representation[column] = MCD_REPRESENTATION_VOCAB; - else if(!strcmp(representation, "INT")) m->representation[column] = MCD_REPRESENTATION_INT; - else{ - fprintf(stderr, "in line %d of mcd file %s invalid mode of representation, I'm skipping it\n", line_number, mcd_filename); - m->representation[column] = MCD_REPRESENTATION_NULL; - } - if(m->representation[column] != MCD_REPRESENTATION_NULL){ - m->filename[column] = strdup(filename); - if(m->representation[column] == MCD_REPRESENTATION_EMB){ - fprintf(stderr, "loading word embedding %s\n", m->filename[column]); - m->word_emb_array[column] = word_emb_load(m->filename[column]); - } - else if(m->representation[column] == MCD_REPRESENTATION_VOCAB){ - if(!strcmp(m->filename[column], "_")){ - if(corpus_filename){ - fprintf(stderr, "extracting dico %s from corpus\n", type); - 
m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, type); - } - else if(vocabs){ - fprintf(stderr, "linking to dico %s\n", type); - m->dico_array[column] = dico_vec_get_dico(vocabs, type); - } - if(m->dico_array[column] == NULL) - fprintf(stderr, "cannot find dico %s\n", type); - } - else{ - fprintf(stderr, "loading dico %s\n", m->filename[column]); - m->dico_array[column] = dico_read(m->filename[column], 0.5); - } - } - } - } - fclose(f); - return m; -} - /* returns a dico_vec containing the different dictionnaries found in an mcd structure */ dico_vec *mcd_build_dico_vec(mcd *mcd_struct) @@ -366,9 +299,53 @@ dico_vec *mcd_build_dico_vec(mcd *mcd_struct) dico_vec *dv = dico_vec_new(); int i; for(i=0; i < mcd_struct->nb_col; i++){ + /* printf("in mcd_build_dico_vec i = %d\n", i); */ if(mcd_struct->dico_array[i]){ + /* printf("dico name = %s\n", mcd_struct->dico_array[i]->name); */ dico_vec_add(dv, mcd_struct->dico_array[i]); } } return dv; } + +int mcd_wf_code(char *wf) +{ + if(!strcmp(wf, "INDEX")) return MCD_WF_INDEX; + if(!strcmp(wf, "FORM")) return MCD_WF_FORM; + if(!strcmp(wf, "LEMMA")) return MCD_WF_LEMMA; + if(!strcmp(wf, "CPOS")) return MCD_WF_CPOS; + if(!strcmp(wf, "POS")) return MCD_WF_POS; + if(!strcmp(wf, "FEATS")) return MCD_WF_FEATS; + if(!strcmp(wf, "LABEL")) return MCD_WF_LABEL; + if(!strcmp(wf, "STAG")) return MCD_WF_STAG; + /* if(!strcmp(wf, "INT")) return MCD_WF_INT; */ + if(!strcmp(wf, "GOV")) return MCD_WF_GOV; + if(!strcmp(wf, "SENT_SEG")) return MCD_WF_SENT_SEG; + if(!strcmp(wf, "A")) return MCD_WF_A; + if(!strcmp(wf, "B")) return MCD_WF_B; + if(!strcmp(wf, "C")) return MCD_WF_C; + if(!strcmp(wf, "D")) return MCD_WF_D; + if(!strcmp(wf, "E")) return MCD_WF_E; + if(!strcmp(wf, "F")) return MCD_WF_F; + if(!strcmp(wf, "G")) return MCD_WF_G; + if(!strcmp(wf, "H")) return MCD_WF_H; + if(!strcmp(wf, "I")) return MCD_WF_I; + if(!strcmp(wf, "J")) return MCD_WF_J; + if(!strcmp(wf, "K")) return MCD_WF_K; + if(!strcmp(wf, "L")) 
return MCD_WF_L; + if(!strcmp(wf, "M")) return MCD_WF_M; + if(!strcmp(wf, "N")) return MCD_WF_N; + if(!strcmp(wf, "O")) return MCD_WF_O; + if(!strcmp(wf, "P")) return MCD_WF_P; + if(!strcmp(wf, "Q")) return MCD_WF_Q; + if(!strcmp(wf, "R")) return MCD_WF_R; + if(!strcmp(wf, "S")) return MCD_WF_S; + if(!strcmp(wf, "T")) return MCD_WF_T; + if(!strcmp(wf, "U")) return MCD_WF_U; + if(!strcmp(wf, "V")) return MCD_WF_V; + if(!strcmp(wf, "W")) return MCD_WF_W; + if(!strcmp(wf, "X")) return MCD_WF_X; + if(!strcmp(wf, "Y")) return MCD_WF_Y; + if(!strcmp(wf, "Z")) return MCD_WF_Z; + return -1; +} diff --git a/maca_common/src/sentence.c b/maca_common/src/sentence.c index 750aaac7c71468e6105f4a33d6ef332ac5b719fe..431fa52c05d851fcc7f1aec30967aa5ec932d861 100644 --- a/maca_common/src/sentence.c +++ b/maca_common/src/sentence.c @@ -2,9 +2,7 @@ #include<stdlib.h> #include<string.h> #include"sentence.h" -#include"config.h" #include"dico.h" -#include"feat_types.h" sentence *sentence_new(mcd *m, FILE *f) { @@ -35,7 +33,10 @@ sentence *sentence_copy(sentence *s) void sentence_print(FILE *f, sentence *s, dico *dico_labels) { int i; + + for(i=1; i < s->length; i++){ + fprintf(f, "%d\t", i); word_print(f, s->words[i], s->mcd_struct, dico_labels); fprintf(f, "\n"); } @@ -47,6 +48,7 @@ void sentence_add_word(sentence *s, word *w) s->length++; s->words = (word **)realloc(s->words, s->length * sizeof(word *)); s->words[s->length -1] = w; + word_set_relative_index(w, s->length -1); } void sentence_free(sentence *s) @@ -69,11 +71,14 @@ sentence *sentence_read(FILE *f, mcd *mcd_struct) word *w = NULL; while(fgets(buffer, 1000, f)){ + /* printf("buffer = %s\n", buffer); */ if(feof(f)) break; - if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence */ + if((buffer[0] == '\n') || (buffer[0] == ' ')) break; /* end of the sentence indicated by empty line */ w = word_parse_buffer(buffer, mcd_struct); - sentence_add_word(s, w); + if(w) sentence_add_word(s, w); + if(word_is_eos(w, 
mcd_struct)) break; } + if(s->length == 1){ sentence_free(s); diff --git a/maca_common/src/word.c b/maca_common/src/word.c index 9d46d258e0c96d1501f5d0a3cbc1623f64d3c9ed..8cc623cdcd6eb57168311c6009fb1fd33856d400 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -4,7 +4,7 @@ #include<ctype.h> #include"word.h" #include"util.h" -#include"feat_types.h" + word *word_new(char *input) { @@ -15,8 +15,10 @@ word *word_new(char *input) else w->input = strdup(input); - for(i=0; i < FEAT_TYPE_NB; i++) w->feat_array[i] = -1; + for(i=0; i < MCD_WF_NB; i++) w->wf_array[i] = -1; w->form = NULL; + + w->relative_index = -1; return w; } @@ -39,13 +41,13 @@ word *word_read(FILE *f, mcd *mcd_struct) } /* parse string buffer to extract the different word features */ -/* codes of the word features are stored in feat_array */ +/* codes of the word features are stored in wf_array */ word *word_parse_buffer(char *buffer, mcd *mcd_struct) { char *token; word *w = NULL; - int column_nb = 0; + int col = 0; /* remove newline from buffer */ if(buffer[strlen(buffer)-1] == '\n') buffer[strlen(buffer)-1] = '\0'; @@ -53,14 +55,14 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct) w = word_new(buffer); token = strtok(buffer, "\t"); do{ - if((column_nb < mcd_struct->nb_col) && (mcd_struct->type[column_nb] != -1)){ - w->feat_array[mcd_struct->type[column_nb]] = mcd_get_code(mcd_struct, token, column_nb); + if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1)){ + w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col); } - if(mcd_struct->type[column_nb] == FEAT_TYPE_FORM){ + if(mcd_struct->wf[col] == MCD_WF_FORM){ w->form = strdup(token); w->U1 = isupper(token[0]) ? 
1 : 0; } - column_nb++; + col++; } while((token = strtok(NULL , "\t"))); return w; @@ -72,9 +74,13 @@ word *word_copy(word *w) word *copy = word_new(w->input); int i; - for(i=0; i < FEAT_TYPE_NB; i++) - copy->feat_array[i] = w->feat_array[i]; + for(i=0; i < MCD_WF_NB; i++) + copy->wf_array[i] = w->wf_array[i]; + copy->U1 = w->U1; + copy->signature = w->signature; + copy->label = w->label; + copy->form = (w->form)? strdup(w->form): NULL; return copy; } @@ -91,26 +97,58 @@ word *word_create_dummy(mcd *mcd_struct) word *w = word_new(NULL); /* int type; */ - w->feat_array[FEAT_TYPE_INDEX] = 0; - - /* for(type = 1; type < FEAT_TYPE_NB; type++) - w->feat_array[type] = -1;*/ - /* if(mcd_struct->type2col[type] != -1) - w->feat_array[type] = mcd_get_code(mcd_struct, (char *) "ROOT", mcd_struct->type2col[type]);*/ + w->wf_array[MCD_WF_INDEX] = 0; + w->relative_index = 0; + /* for(type = 1; type < MCD_WF_NB; type++) + w->wf_array[type] = -1;*/ + /* if(mcd_struct->wf2col[type] != -1) + w->wf_array[type] = mcd_get_code(mcd_struct, (char *) "ROOT", mcd_struct->wf2col[type]);*/ return w; } -void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels) +void word_print2(FILE *f, word *w) { + int i; if(w == NULL) return; - fprintf(f, "%s", w->input); - fprintf(f, "\t%d", word_get_gov(w)); + + + if(w->input) fprintf(f, "%s\t", w->input); + printf("form = %d\t", word_get_form(w)); + printf("lemma = %d\t", word_get_lemma(w)); + printf("pos = %d\t", word_get_pos(w)); + printf("index = %d\t", word_get_index(w)); + printf("rel index = %d\n", word_get_relative_index(w)); + + /* + if(dico_labels) fprintf(f, "\t%s", dico_int2string(dico_labels, w->label)); else - fprintf(f, "\t%d", word_get_label(w)); + fprintf(f, "\t%d", word_get_label(w));*/ } +void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels) +{ + int i; + if(w == NULL) return; + + + + fprintf(f, "%s", w->input); + +} + +int word_is_eos(word *w, mcd *mcd_struct) +{ + if(w == NULL) return 0; + 
if(mcd_get_sent_seg_col(mcd_struct) == -1) return 0; + return word_get_sent_seg(w); +} +int word_get_gov_relative_index(word *w) +{ + if(word_get_gov(w) == 0) return 0; + return word_get_relative_index(w) + word_get_gov(w); +} diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c index 7e22f08b15877319984f95907e35026cb471736f..f82aaa5ac8d8189378665d6cec9b64427a70f206 100644 --- a/maca_lemmatizer/src/maca_lemmatizer.c +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -107,13 +107,13 @@ int main(int argc, char *argv[]) if(ctx->pos_column != -1) pos_column = ctx->pos_column; else - pos_column = ctx->mcd_struct->type2col[FEAT_TYPE_POS]; + pos_column = ctx->mcd_struct->wf2col[MCD_WF_POS]; if(ctx->form_column != -1) form_column = ctx->form_column; else - form_column = ctx->mcd_struct->type2col[FEAT_TYPE_FORM]; + form_column = ctx->mcd_struct->wf2col[MCD_WF_FORM]; if(ctx->conll_filename == NULL) f = stdin; diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 7d07e332bbfcf44b087dd9a1285d1ec3611c37b9..0b09d1606aad7623b6139ad0988bac71b7362f6e 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -21,6 +21,7 @@ set(SOURCES src/context.c src/config.c src/queue.c src/beam.c + src/feat_types.c ) #compiling library @@ -75,6 +76,13 @@ target_link_libraries(maca_trans_parser_cff_cutoff transparse) target_link_libraries(maca_trans_parser_cff_cutoff maca_common) install (TARGETS maca_trans_parser_cff_cutoff DESTINATION bin) +add_executable(eval_classifier ./src/eval_classifier.c) +target_link_libraries(eval_classifier transparse) +target_link_libraries(eval_classifier maca_common) +install (TARGETS eval_classifier DESTINATION bin) + + + #add_executable(test_w2v ./src/test_w2v.c) #target_link_libraries(test_w2v transparse) #install (TARGETS test_w2v DESTINATION bin) diff --git a/maca_trans_parser/src/config.c b/maca_trans_parser/src/config.c index 
84fea5ab35953dfc67a67c10b91755c6c42513a4..7f08cd7d53af6c8a7c4a4c41562e4c80b900ad13 100644 --- a/maca_trans_parser/src/config.c +++ b/maca_trans_parser/src/config.c @@ -29,10 +29,13 @@ word *config_add_next_word_to_buffer(config *c) w = word_read(c->f, c->mcd_struct); if(w == NULL) return NULL; - if(word_get_index(w) == -1){ - w->feat_array[FEAT_TYPE_INDEX] = c->current_index++; - /* printf("current index = %d\n", c->current_index); */ - } + + word_set_relative_index(w, c->current_index); + + /* if(word_get_index(w) == -1){ + word_set_index(w, c->current_index); + }*/ + c->current_index++; queue_add(c->bf, w); return w; } @@ -109,7 +112,15 @@ void config_add_mvt(config *c, int mvt) void config_print(FILE *f, config *c) { + word *b0 = NULL; + word *s0 = NULL; if(c){ + if(!stack_is_empty(c->st)) + s0 = stack_elt_n(c->st, 0); + b0 = queue_elt_n(c->bf, 0); + if(s0) { printf("s0 = "); word_print2(stdout, s0);} + if(b0) { printf("b0 = "); word_print2(stdout, b0);} + stack_print(f, c->st); fprintf(f, " "); queue_print(f, c->bf); diff --git a/maca_trans_parser/src/depset.c b/maca_trans_parser/src/depset.c index d949351638f937473b430c9fca0eb343ba5d979c..c7f5cfe8f4b5989f75a60386323acb09a8da6e4e 100644 --- a/maca_trans_parser/src/depset.c +++ b/maca_trans_parser/src/depset.c @@ -44,10 +44,10 @@ void depset_add(depset *d, word *gov, int label, word *dep) int new_length; if(gov == NULL || dep == NULL) return; - word *max = (word_get_index(gov) > word_get_index(dep)) ? gov : dep; + word *max = (word_get_relative_index(gov) > word_get_relative_index(dep)) ? 
gov : dep; - if(word_get_index(max) >= d->length){ - new_length = word_get_index(max) + 1; + if(word_get_relative_index(max) >= d->length){ + new_length = word_get_relative_index(max) + 1; d->array = (dependency *)realloc(d->array, new_length * sizeof(dependency)); for(i=d->length; i < new_length; i++){ d->array[i].gov = NULL; @@ -56,9 +56,9 @@ void depset_add(depset *d, word *gov, int label, word *dep) } d->length = new_length; } - d->array[word_get_index(dep)].gov = gov; - d->array[word_get_index(dep)].dep = dep; - d->array[word_get_index(dep)].label = label; + d->array[word_get_relative_index(dep)].gov = gov; + d->array[word_get_relative_index(dep)].dep = dep; + d->array[word_get_relative_index(dep)].label = label; } void depset_print(FILE *f, depset *d) @@ -66,7 +66,7 @@ void depset_print(FILE *f, depset *d) int i; for(i=0; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)) - fprintf(f, "(%d, %d, %d) ", word_get_index(d->array[i].dep), d->array[i].label, word_get_index(d->array[i].gov)); + fprintf(f, "(%d, %d, %d) ", word_get_relative_index(d->array[i].dep), d->array[i].label, word_get_relative_index(d->array[i].gov)); } fprintf(f, "\n"); } @@ -74,27 +74,37 @@ void depset_print(FILE *f, depset *d) void depset_print2(FILE *f, depset *d, dico *dico_labels) { int i; - + int root_code = dico_string2int(dico_labels, "root"); + int distance; + for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - /* fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label ));*/ - fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); - } - } - /* fprintf(f, "\n"); */ + /* if(d->array[i].label == root_code) */ + /* fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, 0, dico_int2string(dico_labels, d->array[i].label)); */ + /* else{ */ + distance = 
word_get_relative_index(d->array[i].gov) - word_get_relative_index(d->array[i].dep); + fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, distance, dico_int2string(dico_labels, d->array[i].label)); + /* } */ + } + } } void depset_print3(FILE *f, depset *d, dico *dico_labels) { int i; + int root_code = dico_string2int(dico_labels, "root"); + int distance; for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - /* fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov) - word_get_index(d->array[i].dep), dico_int2string(dico_labels, d->array[i].label ));*/ - fprintf(f, "%d\t%s\t%d\t%s\n", word_get_index(d->array[i].dep), d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); + if(d->array[i].label == root_code) + fprintf(f, "%d\t%s\t%d\t%s\n", word_get_relative_index(d->array[i].dep), d->array[i].dep->input, 0, dico_int2string(dico_labels, d->array[i].label)); + else{ + distance = word_get_relative_index(d->array[i].gov) - word_get_relative_index(d->array[i].dep); + fprintf(f, "%d\t%s\t%d\t%s\n", word_get_relative_index(d->array[i].dep), d->array[i].dep->input, distance, dico_int2string(dico_labels, d->array[i].label)); + } } } - /* fprintf(f, "\n"); */ } char *skip_index(char *buffer) @@ -113,9 +123,9 @@ void depset_print_new_index(FILE *f, depset *d, dico *dico_labels) for(i=1; i < d->length; i++){ if((d->array[i].gov) && (d->array[i].dep)){ - /* fprintf(f, "%d\t", word_get_index(d->array[i].dep)); */ - fprintf(f, "%d\t", word_get_index(d->array[i].dep)); - fprintf(f, "%s\t%d\t%s\n", skip_index(d->array[i].dep->input), word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label)); + /* fprintf(f, "%d\t", word_get_relative_index(d->array[i].dep)); */ + fprintf(f, "%d\t", word_get_relative_index(d->array[i].dep)); + fprintf(f, "%s\t%d\t%s\n", skip_index(d->array[i].dep->input), word_get_relative_index(d->array[i].gov), dico_int2string(dico_labels, 
d->array[i].label)); } } fprintf(f, "\n"); @@ -132,8 +142,8 @@ int depset_compare(depset *d1, depset *d2) if(d1->length != d2->length){ fprintf(stdout, "fail\n"); return 0;} for(i=0; i < d1->length; i++){ for(j=0; j < d2->length; j++){ - if((word_get_index(d1->array[i].gov) == word_get_index(d2->array[j].gov)) - && (word_get_index(d1->array[i].dep) == word_get_index(d2->array[j].dep)) + if((word_get_relative_index(d1->array[i].gov) == word_get_relative_index(d2->array[j].gov)) + && (word_get_relative_index(d1->array[i].dep) == word_get_relative_index(d2->array[j].dep)) && (d1->array[i].label == d2->array[j].label)) break; } if(j == d2->length){ diff --git a/maca_trans_parser/src/eval_classifier.c b/maca_trans_parser/src/eval_classifier.c index df6b787d7934bf1adf91ca46d953e1390f49c8ce..c5fd0a81e3da92987ab0a702fdb3eba27c10aada 100644 --- a/maca_trans_parser/src/eval_classifier.c +++ b/maca_trans_parser/src/eval_classifier.c @@ -3,14 +3,9 @@ #include<string.h> #include<unistd.h> #include<getopt.h> -#include"movement.h" -#include"oracle.h" -#include"feat_fct.h" #include"feature_table.h" -#include"dico.h" #include"perceptron.h" #include"context.h" -#include"config2feat_vec.h" void eval_classifier_help_message(context *ctx) { @@ -39,7 +34,7 @@ int main(int argc, char *argv[]) FILE *f; int i = 0; int err_nb = 0; - feat_vec *fv = feat_vec_new(feature_types_nb); + feat_vec *fv = feat_vec_new(2); char *token; float accuracy; context *ctx; @@ -48,7 +43,7 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); eval_classifier_check_options(ctx); - ft = feature_table_load(ctx->perc_model_filename); + ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); f = myfopen(ctx->cff_filename, "r"); i = 0; while(fgets(buffer, 10000, f)){ @@ -63,7 +58,7 @@ int main(int argc, char *argv[]) class_hyp = feature_table_argmax(fv, ft, &score); if(class_hyp != class_ref){ - /* printf("classe de reference = %d classe prédite = %d\n", class_ref, class_hyp); */ + 
printf("REF = %d HYP = %d\n", class_ref, class_hyp); err_nb++; } else{ diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c index f8b45feaa8ddee247aeadb5e188ae71483886eb7..a20da58595a799642f62863f6010acca8ad41402 100644 --- a/maca_trans_parser/src/feat_fct.c +++ b/maca_trans_parser/src/feat_fct.c @@ -296,8 +296,8 @@ int ldep_s0r(config *c){ int i; if(top){ - if(word_get_index(top) >= c->ds->length) return -1; - for(i=word_get_index(top); i > 0; i--) + if(word_get_relative_index(top) >= c->ds->length) return -1; + for(i=word_get_relative_index(top); i > 0; i--) if(c->ds->array[i].gov == top) return i; } @@ -318,7 +318,7 @@ int rdep_s0r(config *c){ int i; if(top) - for(i=word_get_index(top); i < c->ds->length; i++) + for(i=word_get_relative_index(top); i < c->ds->length; i++) if(c->ds->array[i].gov == top) return i; return -1; @@ -338,8 +338,8 @@ int ldep_b0r(config *c){ int i; if(top){ - if(word_get_index(top) >= c->ds->length) return -1; - for(i=word_get_index(top); i > 0; i--) + if(word_get_relative_index(top) >= c->ds->length) return -1; + for(i=word_get_relative_index(top); i > 0; i--) if(c->ds->array[i].gov == top) return i; } @@ -364,7 +364,7 @@ int rdep_b0r(config *c){ int i; if(top) - for(i=word_get_index(top); i < c->ds->length; i++) + for(i=word_get_relative_index(top); i < c->ds->length; i++) if(c->ds->array[i].gov == top) return i; return -1; @@ -439,9 +439,9 @@ int dist_s0_b0(config *c){ int dist; if(stack_is_empty(c->st) || queue_is_empty(c->bf)) - return -1; + return 0; - dist = word_get_index(queue_elt_n(c->bf,0)) - word_get_index(stack_top(c->st)); + dist = word_get_relative_index(queue_elt_n(c->bf,0)) - word_get_relative_index(stack_top(c->st)); return (abs(dist) > 6)? 
6 : dist; } diff --git a/maca_trans_parser/src/feat_model.c b/maca_trans_parser/src/feat_model.c index 6b6ad227d61582e2e5aade9e14c5ab3a4ffd567e..93b8bf0b230c380221643140501ea437aace440e 100644 --- a/maca_trans_parser/src/feat_model.c +++ b/maca_trans_parser/src/feat_model.c @@ -72,7 +72,7 @@ void feat_model_compute_ranges(feat_model *fm, mcd *m, int mvt_nb) sfd->range = mvt_nb; } else{ - column = m->type2col[sfd->type]; + column = m->wf2col[sfd->type]; mcd_representation = m->representation[column]; if(mcd_representation == MCD_REPRESENTATION_VOCAB) sfd->range = m->dico_array[column]->nbelem; diff --git a/maca_common/src/feat_types.c b/maca_trans_parser/src/feat_types.c similarity index 100% rename from maca_common/src/feat_types.c rename to maca_trans_parser/src/feat_types.c diff --git a/maca_common/include/feat_types.h b/maca_trans_parser/src/feat_types.h similarity index 97% rename from maca_common/include/feat_types.h rename to maca_trans_parser/src/feat_types.h index a4c3a82e60239d53a74db2d58fbc902b20e7675c..2a6518a7040fcdce6728bfddf543f30e865a0dc2 100644 --- a/maca_common/include/feat_types.h +++ b/maca_trans_parser/src/feat_types.h @@ -40,6 +40,7 @@ #define FEAT_TYPE_Z 34 #define FEAT_TYPE_TRANS 35 +/* #define FEAT_TYPE_SENT_SEG 36 */ #define FEAT_TYPE_INT 36 #define FEAT_TYPE_INT_0 37 diff --git a/maca_trans_parser/src/feat_vec.c b/maca_trans_parser/src/feat_vec.c index a8170567bb17042291d29fcedc8c7c3dc12f283b..f63b30c5d3d45776dfcb3cfe1b74843e4a59361b 100644 --- a/maca_trans_parser/src/feat_vec.c +++ b/maca_trans_parser/src/feat_vec.c @@ -101,7 +101,7 @@ void feat_vec_print_dnn(FILE *f, feat_vec *fv, feat_model *fm, mcd *m) } } else{ - column = m->type2col[sfd->type]; + column = m->wf2col[sfd->type]; mcd_representation = m->representation[column]; if(mcd_representation == MCD_REPRESENTATION_EMB){ /* if(i != 0) fprintf(f, " "); */ @@ -147,7 +147,7 @@ void feat_vec_fill_input_array_dnn(float *input_array, feat_vec *fv, feat_model } } else{ - column = 
m->type2col[sfd->type]; + column = m->wf2col[sfd->type]; mcd_representation = m->representation[column]; if(mcd_representation == MCD_REPRESENTATION_EMB){ current_index = word_emb_fill_input_array_dnn(input_array, m->word_emb_array[column], fv->t[i], current_index); diff --git a/maca_trans_parser/src/feature_table.c b/maca_trans_parser/src/feature_table.c index 3db2da4bb1059e9e4bb32c62bf1f0e3f2cfd5bf7..2a7dd03819317d144e7a21a778c3400d231b4243 100644 --- a/maca_trans_parser/src/feature_table.c +++ b/maca_trans_parser/src/feature_table.c @@ -160,8 +160,11 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max) argmax = 0; *max = classes_score[0]; + /* printf("max = %f argmax = %d\n", *max, argmax); */ for(cla=1; cla < classes_nb; cla++){ + /* printf("candidat %d = %f\n", cla, classes_score[cla]); */ if(classes_score[cla] > *max){ + /* printf("max = %f argmax = %d\n", *max, argmax); */ *max = classes_score[cla]; argmax = cla; } diff --git a/maca_trans_parser/src/maca_trans_parser.c b/maca_trans_parser/src/maca_trans_parser.c index 51a7c7e3f70ecc747e175e7af51380961c800c7a..ce7673d2ee4da49ab4d4ba2abac92b623faa77a5 100644 --- a/maca_trans_parser/src/maca_trans_parser.c +++ b/maca_trans_parser/src/maca_trans_parser.c @@ -100,6 +100,8 @@ int main(int argc, char *argv[]) set_linguistic_resources_filenames_parser(ctx); ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + + /* dico_vec_print(NULL, ctx->vocabs); */ mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c index 95399445ad5ae618e3c2751f21046a7be88678cb..773d985935240478efeeda08fe19d775f95914f5 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2cff.c +++ 
b/maca_trans_parser/src/maca_trans_parser_conll2cff.c @@ -43,7 +43,7 @@ void maca_trans_parser_conll2cff_check_options(context *ctx) } void generate_training_file_stream(FILE *output_file, context *ctx) -{ +{ config *c; int mvt_code; char mvt_type; @@ -51,24 +51,28 @@ void generate_training_file_stream(FILE *output_file, context *ctx) feat_vec *fv = feat_vec_new(feature_types_nb); sentence *ref = NULL; int sentence_nb = 0; - int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), ctx->root_label); + /* int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), (char *) ctx->root_label); */ + int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label); FILE *conll_file = myfopen(ctx->input_filename, "r"); FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); c = config_initial(conll_file, ctx->mcd_struct, 5); while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ - /* sentence_print(stdout, ref, mcd_get_dico_label(ctx->mcd_struct)); */ + /* sentence_print(stdout, ref, ctx->dico_labels); */ while(1){ - /* config_print(stdout,c); */ + /* config_print(stdout,c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - mvt_code = oracle(c, ref); + /* feat_vec_print(stdout, fv); */ + + mvt_code = oracle_parser(c, ref); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); - /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ + /* printf("mvt code = %d\n", mvt_code); */ + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ fprintf(output_file, "%d", mvt_code); feat_vec_print(output_file, fv); @@ -83,14 +87,17 @@ void generate_training_file_stream(FILE *output_file, context *ctx) /* shift dummy word in stack */ movement_shift(c, 1, 0); - /* printf("sentence complete config : "); - config_print(stdout,c); */ + /* printf("sentence complete config : "); + config_print(stdout,c); */ /* empty depset */ 
depset_free(c->ds); c->ds = depset_new(); sentence_free(ref); sentence_nb++; + + c->current_index = queue_renumber_words(c->bf); + break; } @@ -132,7 +139,7 @@ void generate_training_file_buffer(FILE *output_file, context *ctx) config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - mvt_code = oracle(c, ref); + mvt_code = oracle_parser(c, ref); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); @@ -169,9 +176,7 @@ int main(int argc, char *argv[]) ctx = context_read_options(argc, argv); maca_trans_parser_conll2cff_check_options(ctx); - ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); - if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); @@ -181,8 +186,12 @@ int main(int argc, char *argv[]) ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); } + + /* dico_vec_print(NULL, ctx->vocabs); */ ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + + if(ctx->dico_labels == NULL){ fprintf(stderr, "cannot find label names\n"); return 1; @@ -209,6 +218,10 @@ int main(int argc, char *argv[]) else output_file = stdout; + + + + if(ctx->stream_mode) generate_training_file_stream(output_file, ctx); else diff --git a/maca_trans_parser/src/maca_trans_parser_conll2fann.c b/maca_trans_parser/src/maca_trans_parser_conll2fann.c index 4c44bfefafd7c071c10e2134c4735fefc9c16b74..aa3eb8a0db816b483da9dad83f0a09e45d67bd96 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2fann.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2fann.c @@ -86,7 +86,7 @@ int generate_training_file_buffer(FILE *output_file, context *ctx) while(!config_is_terminal(c)){ /* config_print(stdout,c); */ config2feat_vec_fann(ctx->features_model, c, fv, ctx->mode); - mvt_code = oracle(c, ref); + mvt_code = oracle_parser(c, ref); nb_trans++; feat_vec_print_dnn(output_file, fv, 
ctx->features_model, ctx->mcd_struct); diff --git a/maca_trans_parser/src/maca_trans_tagger_conll2cff.c b/maca_trans_parser/src/maca_trans_tagger_conll2cff.c index a3acdbda342d3b47cd16ec4a4e2745592e8ede6e..8ca8e53d3f40f5fecf48ec24ea881b4b06bd00bd 100644 --- a/maca_trans_parser/src/maca_trans_tagger_conll2cff.c +++ b/maca_trans_parser/src/maca_trans_tagger_conll2cff.c @@ -32,6 +32,7 @@ void maca_trans_parser_conll2cff_help_message(context *ctx) context_general_help_message(ctx); context_mode_help_message(ctx); context_sent_nb_help_message(ctx); + context_mcd_help_message(ctx); fprintf(stderr, "INPUT\n"); context_conll_help_message(ctx); diff --git a/maca_trans_parser/src/movement_parser.c b/maca_trans_parser/src/movement_parser.c index a2d5af021740903671c46ec4f59877640ca70486..5ec8262ecfc83dedf089c5c80987223e8fb84710 100644 --- a/maca_trans_parser/src/movement_parser.c +++ b/maca_trans_parser/src/movement_parser.c @@ -32,6 +32,20 @@ int movement_cut(config *c) } #endif +void movement_print(FILE *f, int mvt_code, dico *dico_labels){ + int mvt_type = movement_type(mvt_code); + int mvt_label = movement_label(mvt_code); + char *label; + if(mvt_type == MVT_SHIFT) fprintf(f, "SHIFT"); + else{ + if(mvt_type == MVT_RIGHT) fprintf(f, "RIGHT"); + else fprintf(f, "LEFT"); + label = dico_int2string(dico_labels, mvt_label); + fprintf(f, " %s", label); + } + fprintf(f, "\n"); +} + int movement_type(int mvt) { @@ -52,7 +66,7 @@ int movement_left_arc(config *c, int label, float score) { if(stack_is_empty(c->st)) return 0; if(queue_is_empty(c->bf)) return 0; - if(word_get_index(stack_top(c->st)) == 0) return 0; + if(word_get_relative_index(stack_top(c->st)) == 0) return 0; depset_add(c->ds, queue_elt_n(c->bf, 0), label, stack_top(c->st)); stack_pop(c->st); config_add_mvt(c, movement_left_code(label)); @@ -63,7 +77,7 @@ config *movement_left_arc_dup(config *c, int label, float score, feat_vec *fv) config *copy = NULL; if(stack_is_empty(c->st)) return NULL; if(queue_is_empty(c->bf)) 
return NULL; - if(word_get_index(stack_top(c->st)) == 0) return NULL; + if(word_get_relative_index(stack_top(c->st)) == 0) return NULL; copy = config_copy(c); depset_add(copy->ds, queue_elt_n(copy->bf, 0), label, stack_top(copy->st)); diff --git a/maca_trans_parser/src/movement_parser.h b/maca_trans_parser/src/movement_parser.h index ce4841208fb184103ba6c9a7aa1380ab8846a091..4078c482906852ffb05798673e9b1d641215e414 100644 --- a/maca_trans_parser/src/movement_parser.h +++ b/maca_trans_parser/src/movement_parser.h @@ -40,7 +40,8 @@ int movement_shift(config *c, int stream, float score); config *movement_left_arc_dup(config *c, int label, float score, feat_vec *fv); config *movement_right_arc_dup(config *c, int label, float score, feat_vec *fv); config *movement_shift_dup(config *c, int stream, float score, feat_vec *fv); -/* int movement_cut(config *c); */ +void movement_print(FILE *f, int mvt_code, dico *dico_labels); + /* int movement_cut(config *c); */ #endif diff --git a/maca_trans_parser/src/oracle_parser.c b/maca_trans_parser/src/oracle_parser.c index 89cb0a82488af79e0d55e2060cfdd0aa2652f65b..b4145749558a3bc88926460e1b93e07f77177418 100644 --- a/maca_trans_parser/src/oracle_parser.c +++ b/maca_trans_parser/src/oracle_parser.c @@ -9,11 +9,11 @@ int check_all_dependents_of_word_in_ref_are_in_hyp(config *c, sentence *ref, int int dep; for(dep=0; dep < ref->length; dep++){ - if(word_get_gov(ref->words[dep]) == word_index){ /* found a dependent of word in ref */ + if(word_get_gov_relative_index(ref->words[dep]) == word_index){ /* found a dependent of word in ref */ /* look for a dependency in hyp such that its dependent is dep */ if((dep >= c->ds->length) || (c->ds->array[dep].gov == NULL) - || (word_get_index(c->ds->array[dep].gov) != word_index) + || (word_get_relative_index(c->ds->array[dep].gov) != word_index) || (c->ds->array[dep].label != word_get_label(ref->words[dep]))) return 0; } @@ -21,7 +21,7 @@ int check_all_dependents_of_word_in_ref_are_in_hyp(config 
*c, sentence *ref, int return 1; } -int oracle(config *c, sentence *ref) +int oracle_parser(config *c, sentence *ref) { word *s0; /* word on top of stack */ word *b0; /* next word in the bufer */ @@ -29,22 +29,22 @@ int oracle(config *c, sentence *ref) if(!stack_is_empty(c->st) && !queue_is_empty(c->bf)){ s0 = stack_top(c->st); - s0_index = word_get_index(s0); + s0_index = word_get_relative_index(s0); b0 = queue_elt_n(c->bf, 0); - b0_index = word_get_index(b0); + b0_index = word_get_relative_index(b0); - /* printf("s0 = %d b0 = %d\n", s0_index, b0_index); */ - /*printf("dans ref gov de %d = %d\n", s0_index, word_get_gov(ref->words[s0_index])); - printf("dans ref gov de %d = %d\n", b0_index, word_get_gov(ref->words[b0_index])); */ + /* printf("s0 = %d b0 = %d\n", s0_index, b0_index); */ + /*printf("dans ref gov de %d = %d\n", s0_index, word_get_gov_relative_index(ref->words[s0_index])); + printf("dans ref gov de %d = %d\n", b0_index, word_get_gov_relative_index(ref->words[b0_index])); */ /* LEFT ARC b0 is the governor and s0 the dependent */ - - if(word_get_gov(ref->words[s0_index]) == b0_index) + + if(word_get_gov_relative_index(ref->words[s0_index]) == b0_index) return movement_left_code(word_get_label(ref->words[s0_index])); /* RIGHT ARC s0 is the governor and b0 the dependent */ - if((word_get_gov(ref->words[b0_index]) == s0_index) + if((word_get_gov_relative_index(ref->words[b0_index]) == s0_index) && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, b0_index)){ return movement_right_code(word_get_label(ref->words[b0_index])); } diff --git a/maca_trans_parser/src/oracle_parser.h b/maca_trans_parser/src/oracle_parser.h index 2d454228f2fa32b0adccea2d190ae59050df497e..24d73a89adc933d7d34f67423b8300d1ec9df7d5 100644 --- a/maca_trans_parser/src/oracle_parser.h +++ b/maca_trans_parser/src/oracle_parser.h @@ -8,6 +8,6 @@ #include"sentence.h" -int oracle(config *c, sentence *ref); +int oracle_parser(config *c, sentence *ref); #endif diff --git 
a/maca_trans_parser/src/queue.c b/maca_trans_parser/src/queue.c index 53985be98cd6c7f7b8b825ffc4d31c234ecd78d1..599739a6f1fe4132289e1a479c8b593dfc042e08 100644 --- a/maca_trans_parser/src/queue.c +++ b/maca_trans_parser/src/queue.c @@ -7,7 +7,7 @@ int queue_renumber_words(queue *bf) int i; int index = 1; for(i=0; i < bf->nbelem; i++){ - word_set_index(queue_elt_n(bf, i), index++); + word_set_relative_index(queue_elt_n(bf, i), index++); } return index; } @@ -20,20 +20,18 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct) while(fgets(buffer, 10000, f)){ if(feof(f)) break; - /* fprintf(stderr, "%s", buffer); */ + /* fprintf(stderr, "%s", buffer); */ if((buffer[0] == '\n') || (buffer[0] == ' ') || (buffer[0] == '\t')) break; /* end of the sentence */ w = word_parse_buffer(buffer, mcd_struct); - if(word_get_index(w) == -1){ - w->feat_array[FEAT_TYPE_INDEX] = index++; - } + word_set_relative_index(w, index); + index++; queue_add(bf, w); + if(word_is_eos(w, mcd_struct)) break; } /* return bf->nbelem - 1; */ /* because of the dummy word */ return bf->nbelem ; } - - word *queue_elt_n(queue *q, int n) { return (n >= q->nbelem) ? 
NULL : q->array[(q->head + n) % q->size]; @@ -45,12 +43,12 @@ void queue_print(FILE *f, queue *q) fprintf(f, "("); if(q->tail >= q->head) for(i=q->head; i < q->tail; i++) - fprintf(f, "%d ", word_get_index(q->array[i])); + fprintf(f, "%d ", word_get_relative_index(q->array[i])); else{ for(i=q->head; i < q->size; i++) - fprintf(f, "%d ", word_get_index(q->array[i])); + fprintf(f, "%d ", word_get_relative_index(q->array[i])); for(i=0; i < q->tail; i++) - fprintf(f, "%d ", word_get_index(q->array[i])); + fprintf(f, "%d ", word_get_relative_index(q->array[i])); } fprintf(f, ")\n"); } diff --git a/maca_trans_parser/src/simple_decoder_parser.c b/maca_trans_parser/src/simple_decoder_parser.c index 0459968cb14af10ec5f13e12f7c0b813da28e8d3..947844110733187602ffb58612515b5fa4e9184d 100644 --- a/maca_trans_parser/src/simple_decoder_parser.c +++ b/maca_trans_parser/src/simple_decoder_parser.c @@ -43,7 +43,6 @@ void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_la config_connect_subtrees(c, root_label); depset_print2(stdout, c->ds, ctx->dico_labels); - fprintf(stdout, "\n"); /* config_free(c); */ @@ -63,24 +62,24 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la feat_vec *fv = feat_vec_new(feature_types_nb); config *c = NULL; - /* when in stream mode, force to renumber the tokens (ugly !) 
*/ - /* ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1; */ - c = config_initial(f, ctx->mcd_struct, 5); while(!config_is_terminal(c)){ - /* config_print(stdout, c); */ + /* config_print(stdout, c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - /* feat_vec_print_string(fv, ctx->d_perceptron_features); */ + /* feat_vec_print(stdout, fv); */ mvt_code = feature_table_argmax(fv, ft, &max); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); /* printf("code predicted = %d\n", mvt_code); */ - + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ + /* sentence is complete */ - if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ - /* if(mvt_label == root_label){ */ - /* config_print(stdout, c); */ + if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ + /* if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ */ + /* if(mvt_label == root_label){ */ + /* printf("sentence complete\n"); */ + /*config_print(stdout, c); */ /* create the root arc */ movement_right_arc(c, mvt_label, 0); @@ -88,12 +87,12 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la /* shift dummy word in stack */ movement_shift(c, 1, 0); - /* config_print(stdout, c); */ + /* config_print(stdout, c); */ - config_connect_subtrees(c, root_label); + /* config_connect_subtrees(c, root_label); */ /* depset_print_new_index(stdout, c->ds, ctx->dico_labels); */ - if(ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] == -1) + if(ctx->mcd_struct->wf2col[MCD_WF_INDEX] == -1) depset_print3(stdout, c->ds, ctx->dico_labels); else depset_print2(stdout, c->ds, ctx->dico_labels); @@ -106,7 +105,9 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la /* empty depset */ depset_free(c->ds); c->ds = depset_new(); - /* c->current_index = queue_renumber_words(c->bf); */ + + /* renumber 
the words that are left in the buffer */ + c->current_index = queue_renumber_words(c->bf); continue; } @@ -123,8 +124,17 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la /* config_print(stdout, c); */ - config_connect_subtrees(c, root_label); - depset_print_new_index(stdout, c->ds, ctx->dico_labels); + /* config_connect_subtrees(c, root_label); */ + + + if(ctx->mcd_struct->wf2col[MCD_WF_INDEX] == -1) + depset_print3(stdout, c->ds, ctx->dico_labels); + else + depset_print2(stdout, c->ds, ctx->dico_labels); + + + + /* depset_print_new_index(stdout, c->ds, ctx->dico_labels); */ /* config_free(c); */ diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c index 882a2aa5bc4c79a5194a07090f1e6a8865456c5b..579cc64406644c14657c9b40dd6977edd0bc8610 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -17,6 +17,7 @@ void add_signature_to_words_in_queue(queue *bf, form2pos *f2p) for(i=0; i < queue_nbelem(bf); i++){ w = queue_elt_n(bf, i); + w->signature = form2pos_get_signature(f2p, w->form); } } @@ -52,7 +53,7 @@ void simple_decoder_buffer(context *ctx) w = stack_elt_n(c->st, i); printf("%s\t%s\n", w->input, dico_int2string(dico_pos, word_get_pos(w))); } - printf("\n"); + /* printf("\n"); */ /* config_free(c); */ c = config_initial_no_dummy_word(f, ctx->mcd_struct, 0); diff --git a/maca_trans_parser/src/stack.c b/maca_trans_parser/src/stack.c index cbd419e9c496e6e6cc6c89b3616d0f21ea6a161d..24ed623226c1272f39cdfb60efb623bd351b0b0c 100644 --- a/maca_trans_parser/src/stack.c +++ b/maca_trans_parser/src/stack.c @@ -93,7 +93,7 @@ void stack_print(FILE *buffer, stack *s) if(s){ fprintf(buffer, "["); for(i=0; i < stack_height(s); i++) - fprintf(buffer, " %d", word_get_index(s->array[i])); + fprintf(buffer, " %d", word_get_relative_index(s->array[i])); fprintf(buffer, "]"); } } diff --git a/maca_trans_parser/src/train_perceptron.c 
b/maca_trans_parser/src/train_perceptron.c index 52fb1bcbb23576c445be206fb0d7f40e9f1943f7..bf1505cbfc51adfebf77cf73a1c1c21a6f6d4ba9 100644 --- a/maca_trans_parser/src/train_perceptron.c +++ b/maca_trans_parser/src/train_perceptron.c @@ -121,7 +121,7 @@ feature_table *train_perceptron(context *ctx) config2feat_vec_cff(ctx->features_model, config_oracle, ctx->d_perceptron_features, fv, ctx->mode); - mvt_oracle_code = oracle(config_oracle, ref); + mvt_oracle_code = oracle_parser(config_oracle, ref); mvt_oracle_type = movement_type(mvt_oracle_code); mvt_oracle_label = movement_label(mvt_oracle_code); @@ -297,7 +297,7 @@ feature_table *train_perceptron_early_update(context *ctx) /*---------------------------------------*/ /* compute new oracle transition */ - mvt_oracle_code = oracle(config_oracle, ref); + mvt_oracle_code = oracle_parser(config_oracle, ref); mvt_oracle_type = movement_type(mvt_oracle_code); mvt_oracle_label = movement_label(mvt_oracle_code);