Commit 1c1b0a6c authored by Alexis Nasr
Browse files

modified input file format for maca_trans_tagger and maca_trans_parser

parent fc0b36e8
...@@ -4,7 +4,6 @@ set(SOURCES src/util.c ...@@ -4,7 +4,6 @@ set(SOURCES src/util.c
src/word_emb.c src/word_emb.c
src/mcd.c src/mcd.c
src/dico_vec.c src/dico_vec.c
src/feat_types.c
src/form2pos.c src/form2pos.c
src/word.c src/word.c
src/sentence.c src/sentence.c
......
...@@ -6,19 +6,91 @@ ...@@ -6,19 +6,91 @@
#define MCD_REPRESENTATION_VOCAB 2 #define MCD_REPRESENTATION_VOCAB 2
#define MCD_REPRESENTATION_INT 3 #define MCD_REPRESENTATION_INT 3
#define MCD_INVALID_VALUE -1 #define MCD_INVALID_VALUE -1
/* Word feature (wf) codes: every MCD_WF_* constant below identifies one word feature. */
/* MCD_WF_NB is the number of codes (0 .. MCD_WF_NB - 1); it is used as the length of */
/* the wf2col array of an mcd and of the wf_array array of a word. */
#define MCD_WF_NB 36
/* named word features (codes 0 to 9) */
#define MCD_WF_INDEX 0
#define MCD_WF_FORM 1
#define MCD_WF_LEMMA 2
#define MCD_WF_CPOS 3
#define MCD_WF_POS 4
#define MCD_WF_FEATS 5
#define MCD_WF_GOV 6
#define MCD_WF_LABEL 7
#define MCD_WF_STAG 8
#define MCD_WF_SENT_SEG 9
/* single letter word features A .. Z (codes 10 to 35) */
#define MCD_WF_A 10
#define MCD_WF_B 11
#define MCD_WF_C 12
#define MCD_WF_D 13
#define MCD_WF_E 14
#define MCD_WF_F 15
#define MCD_WF_G 16
#define MCD_WF_H 17
#define MCD_WF_I 18
#define MCD_WF_J 19
#define MCD_WF_K 20
#define MCD_WF_L 21
#define MCD_WF_M 22
#define MCD_WF_N 23
#define MCD_WF_O 24
#define MCD_WF_P 25
#define MCD_WF_Q 26
#define MCD_WF_R 27
#define MCD_WF_S 28
#define MCD_WF_T 29
#define MCD_WF_U 30
#define MCD_WF_V 31
#define MCD_WF_W 32
#define MCD_WF_X 33
#define MCD_WF_Y 34
#define MCD_WF_Z 35
#include "dico.h" #include "dico.h"
#include "feat_types.h"
#include "word_emb.h" #include "word_emb.h"
#include "dico_vec.h" #include "dico_vec.h"
#define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL] #define mcd_get_dico_label(m) (m)->dico_array[MCD_WF_LABEL]
/* Column lookup: mcd_get_<wf>_col(m) designates the entry of m->wf2col that belongs to the */
/* matching MCD_WF_* code. mcd_new() fills wf2col with -1 and mcd_read() stores, for every */
/* word feature declared in the mcd file, the index of the column that holds it. */
/* Every accessor expands to an lvalue, exactly like a direct m->wf2col[...] access. */
#define mcd_col_of_wf(m, wf)    ((m)->wf2col[(wf)])

/* named word features */
#define mcd_get_index_col(m)    mcd_col_of_wf(m, MCD_WF_INDEX)
#define mcd_get_form_col(m)     mcd_col_of_wf(m, MCD_WF_FORM)
#define mcd_get_lemma_col(m)    mcd_col_of_wf(m, MCD_WF_LEMMA)
#define mcd_get_cpos_col(m)     mcd_col_of_wf(m, MCD_WF_CPOS)
#define mcd_get_pos_col(m)      mcd_col_of_wf(m, MCD_WF_POS)
#define mcd_get_feats_col(m)    mcd_col_of_wf(m, MCD_WF_FEATS)
#define mcd_get_gov_col(m)      mcd_col_of_wf(m, MCD_WF_GOV)
#define mcd_get_label_col(m)    mcd_col_of_wf(m, MCD_WF_LABEL)
#define mcd_get_stag_col(m)     mcd_col_of_wf(m, MCD_WF_STAG)
#define mcd_get_sent_seg_col(m) mcd_col_of_wf(m, MCD_WF_SENT_SEG)

/* single letter word features a .. z */
#define mcd_get_a_col(m)        mcd_col_of_wf(m, MCD_WF_A)
#define mcd_get_b_col(m)        mcd_col_of_wf(m, MCD_WF_B)
#define mcd_get_c_col(m)        mcd_col_of_wf(m, MCD_WF_C)
#define mcd_get_d_col(m)        mcd_col_of_wf(m, MCD_WF_D)
#define mcd_get_e_col(m)        mcd_col_of_wf(m, MCD_WF_E)
#define mcd_get_f_col(m)        mcd_col_of_wf(m, MCD_WF_F)
#define mcd_get_g_col(m)        mcd_col_of_wf(m, MCD_WF_G)
#define mcd_get_h_col(m)        mcd_col_of_wf(m, MCD_WF_H)
#define mcd_get_i_col(m)        mcd_col_of_wf(m, MCD_WF_I)
#define mcd_get_j_col(m)        mcd_col_of_wf(m, MCD_WF_J)
#define mcd_get_k_col(m)        mcd_col_of_wf(m, MCD_WF_K)
#define mcd_get_l_col(m)        mcd_col_of_wf(m, MCD_WF_L)
#define mcd_get_m_col(m)        mcd_col_of_wf(m, MCD_WF_M)
#define mcd_get_n_col(m)        mcd_col_of_wf(m, MCD_WF_N)
#define mcd_get_o_col(m)        mcd_col_of_wf(m, MCD_WF_O)
#define mcd_get_p_col(m)        mcd_col_of_wf(m, MCD_WF_P)
#define mcd_get_q_col(m)        mcd_col_of_wf(m, MCD_WF_Q)
#define mcd_get_r_col(m)        mcd_col_of_wf(m, MCD_WF_R)
#define mcd_get_s_col(m)        mcd_col_of_wf(m, MCD_WF_S)
#define mcd_get_t_col(m)        mcd_col_of_wf(m, MCD_WF_T)
#define mcd_get_u_col(m)        mcd_col_of_wf(m, MCD_WF_U)
#define mcd_get_v_col(m)        mcd_col_of_wf(m, MCD_WF_V)
#define mcd_get_w_col(m)        mcd_col_of_wf(m, MCD_WF_W)
#define mcd_get_x_col(m)        mcd_col_of_wf(m, MCD_WF_X)
#define mcd_get_y_col(m)        mcd_col_of_wf(m, MCD_WF_Y)
#define mcd_get_z_col(m)        mcd_col_of_wf(m, MCD_WF_Z)
#define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM] #define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v)
#define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v)
/* mcd (multi column description) files describe the format of corpus files */ /* mcd (multi column description) files describe the format of corpus files */
/* every line of an mcd file describes the content of a column of the corpus file */ /* every line of an mcd file describes the content of a column of the corpus file */
...@@ -35,10 +107,9 @@ ...@@ -35,10 +107,9 @@
typedef struct { typedef struct {
int nb_col; /* number of columns in the mcd file */ int nb_col; /* number of columns in the mcd file */
int type2col[FEAT_TYPE_NB]; /* in which column is represented is the form (FEAT_TYPE_FORM) lemma ... represented */ int wf2col[MCD_WF_NB]; /* in which column are the word features (MCD_WF_FORM, MCD_WF_LEMMA ...) represented */
/* int *col2type; */ int *wf; /* array containing the word feature that correspond to each column */
int *type; /* array containing the type of every column */ char **wf_str; /* a string version of array word feature */
char **type_str; /* a string version of array type */
int *representation; /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */ int *representation; /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */
char **filename; /* array containing the file in which the different values for a columnn is represented */ char **filename; /* array containing the file in which the different values for a columnn is represented */
dico **dico_array; /* array containing the dico corresponding to each column (NULL if no file) */ dico **dico_array; /* array containing the dico corresponding to each column (NULL if no file) */
...@@ -54,5 +125,6 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename); ...@@ -54,5 +125,6 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename);
void mcd_free(mcd *m); void mcd_free(mcd *m);
int mcd_get_code(mcd *m, char *str, int col); int mcd_get_code(mcd *m, char *str, int col);
dico_vec *mcd_build_dico_vec(mcd *mcd_struct); dico_vec *mcd_build_dico_vec(mcd *mcd_struct);
int mcd_wf_code(char *wf);
#endif #endif
...@@ -3,88 +3,96 @@ ...@@ -3,88 +3,96 @@
#include "mcd.h" #include "mcd.h"
#define word_get_index(w) (w)->feat_array[FEAT_TYPE_INDEX] #define word_get_index(w) (w)->wf_array[MCD_WF_INDEX]
#define word_get_form(w) (w)->feat_array[FEAT_TYPE_FORM] #define word_get_form(w) (w)->wf_array[MCD_WF_FORM]
#define word_get_lemma(w) (w)->feat_array[FEAT_TYPE_LEMMA] #define word_get_lemma(w) (w)->wf_array[MCD_WF_LEMMA]
#define word_get_cpos(w) (w)->feat_array[FEAT_TYPE_CPOS] #define word_get_cpos(w) (w)->wf_array[MCD_WF_CPOS]
#define word_get_pos(w) (w)->feat_array[FEAT_TYPE_POS] #define word_get_pos(w) (w)->wf_array[MCD_WF_POS]
#define word_get_feats(w) (w)->feat_array[FEAT_TYPE_FEATS] #define word_get_feats(w) (w)->wf_array[MCD_WF_FEATS]
#define word_get_gov(w) (w)->feat_array[FEAT_TYPE_GOV] #define word_get_gov(w) (w)->wf_array[MCD_WF_GOV]
#define word_get_label(w) (w)->feat_array[FEAT_TYPE_LABEL] #define word_get_label(w) (w)->wf_array[MCD_WF_LABEL]
#define word_get_stag(w) (w)->feat_array[FEAT_TYPE_STAG] #define word_get_stag(w) (w)->wf_array[MCD_WF_STAG]
#define word_get_A(w) (w)->feat_array[FEAT_TYPE_A] #define word_get_sent_seg(w) (w)->wf_array[MCD_WF_SENT_SEG]
#define word_get_B(w) (w)->feat_array[FEAT_TYPE_B] #define word_get_A(w) (w)->wf_array[MCD_WF_A]
#define word_get_C(w) (w)->feat_array[FEAT_TYPE_C] #define word_get_B(w) (w)->wf_array[MCD_WF_B]
#define word_get_D(w) (w)->feat_array[FEAT_TYPE_D] #define word_get_C(w) (w)->wf_array[MCD_WF_C]
#define word_get_E(w) (w)->feat_array[FEAT_TYPE_E] #define word_get_D(w) (w)->wf_array[MCD_WF_D]
#define word_get_F(w) (w)->feat_array[FEAT_TYPE_F] #define word_get_E(w) (w)->wf_array[MCD_WF_E]
#define word_get_G(w) (w)->feat_array[FEAT_TYPE_G] #define word_get_F(w) (w)->wf_array[MCD_WF_F]
#define word_get_H(w) (w)->feat_array[FEAT_TYPE_H] #define word_get_G(w) (w)->wf_array[MCD_WF_G]
#define word_get_I(w) (w)->feat_array[FEAT_TYPE_I] #define word_get_H(w) (w)->wf_array[MCD_WF_H]
#define word_get_J(w) (w)->feat_array[FEAT_TYPE_J] #define word_get_I(w) (w)->wf_array[MCD_WF_I]
#define word_get_K(w) (w)->feat_array[FEAT_TYPE_K] #define word_get_J(w) (w)->wf_array[MCD_WF_J]
#define word_get_L(w) (w)->feat_array[FEAT_TYPE_L] #define word_get_K(w) (w)->wf_array[MCD_WF_K]
#define word_get_M(w) (w)->feat_array[FEAT_TYPE_M] #define word_get_L(w) (w)->wf_array[MCD_WF_L]
#define word_get_N(w) (w)->feat_array[FEAT_TYPE_N] #define word_get_M(w) (w)->wf_array[MCD_WF_M]
#define word_get_O(w) (w)->feat_array[FEAT_TYPE_O] #define word_get_N(w) (w)->wf_array[MCD_WF_N]
#define word_get_P(w) (w)->feat_array[FEAT_TYPE_P] #define word_get_O(w) (w)->wf_array[MCD_WF_O]
#define word_get_Q(w) (w)->feat_array[FEAT_TYPE_Q] #define word_get_P(w) (w)->wf_array[MCD_WF_P]
#define word_get_R(w) (w)->feat_array[FEAT_TYPE_R] #define word_get_Q(w) (w)->wf_array[MCD_WF_Q]
#define word_get_S(w) (w)->feat_array[FEAT_TYPE_S] #define word_get_R(w) (w)->wf_array[MCD_WF_R]
#define word_get_T(w) (w)->feat_array[FEAT_TYPE_T] #define word_get_S(w) (w)->wf_array[MCD_WF_S]
#define word_get_U(w) (w)->feat_array[FEAT_TYPE_U] #define word_get_T(w) (w)->wf_array[MCD_WF_T]
#define word_get_V(w) (w)->feat_array[FEAT_TYPE_V] #define word_get_U(w) (w)->wf_array[MCD_WF_U]
#define word_get_W(w) (w)->feat_array[FEAT_TYPE_W] #define word_get_V(w) (w)->wf_array[MCD_WF_V]
#define word_get_X(w) (w)->feat_array[FEAT_TYPE_X] #define word_get_W(w) (w)->wf_array[MCD_WF_W]
#define word_get_Y(w) (w)->feat_array[FEAT_TYPE_Y] #define word_get_X(w) (w)->wf_array[MCD_WF_X]
#define word_get_Z(w) (w)->feat_array[FEAT_TYPE_Z] #define word_get_Y(w) (w)->wf_array[MCD_WF_Y]
#define word_get_Z(w) (w)->wf_array[MCD_WF_Z]
#define word_get_signature(w) (w)->signature #define word_get_signature(w) (w)->signature
#define word_set_index(w, val) (w)->feat_array[FEAT_TYPE_INDEX] = (val) #define word_set_index(w, val) (w)->wf_array[MCD_WF_INDEX] = (val)
#define word_set_form(w, val) (w)->feat_array[FEAT_TYPE_FORM] = (val) #define word_set_form(w, val) (w)->wf_array[MCD_WF_FORM] = (val)
#define word_set_lemma(w, val) (w)->feat_array[FEAT_TYPE_LEMMA] = (val) #define word_set_lemma(w, val) (w)->wf_array[MCD_WF_LEMMA] = (val)
#define word_set_cpos(w, val) (w)->feat_array[FEAT_TYPE_CPOS] = (val) #define word_set_cpos(w, val) (w)->wf_array[MCD_WF_CPOS] = (val)
#define word_set_pos(w, val) (w)->feat_array[FEAT_TYPE_POS] = (val) #define word_set_pos(w, val) (w)->wf_array[MCD_WF_POS] = (val)
#define word_set_feats(w, val) (w)->feat_array[FEAT_TYPE_FEATS] = (val) #define word_set_feats(w, val) (w)->wf_array[MCD_WF_FEATS] = (val)
#define word_set_gov(w, val) (w)->feat_array[FEAT_TYPE_GOV] = (val) #define word_set_gov(w, val) (w)->wf_array[MCD_WF_GOV] = (val)
#define word_set_label(w, val) (w)->feat_array[FEAT_TYPE_LABEL] = (val) #define word_set_label(w, val) (w)->wf_array[MCD_WF_LABEL] = (val)
#define word_set_stag(w, val) (w)->feat_array[FEAT_TYPE_STAG] = (val) #define word_set_stag(w, val) (w)->wf_array[MCD_WF_STAG] = (val)
#define word_set_A(w, val) (w)->feat_array[FEAT_TYPE_A] = (val) #define word_set_word_seg(w) (w)->wf_array[MCD_WF_WORD_SEG] = (val)
#define word_set_B(w, val) (w)->feat_array[FEAT_TYPE_B] = (val) #define word_set_A(w, val) (w)->wf_array[MCD_WF_A] = (val)
#define word_set_C(w, val) (w)->feat_array[FEAT_TYPE_C] = (val) #define word_set_B(w, val) (w)->wf_array[MCD_WF_B] = (val)
#define word_set_D(w, val) (w)->feat_array[FEAT_TYPE_D] = (val) #define word_set_C(w, val) (w)->wf_array[MCD_WF_C] = (val)
#define word_set_E(w, val) (w)->feat_array[FEAT_TYPE_E] = (val) #define word_set_D(w, val) (w)->wf_array[MCD_WF_D] = (val)
#define word_set_F(w, val) (w)->feat_array[FEAT_TYPE_F] = (val) #define word_set_E(w, val) (w)->wf_array[MCD_WF_E] = (val)
#define word_set_G(w, val) (w)->feat_array[FEAT_TYPE_G] = (val) #define word_set_F(w, val) (w)->wf_array[MCD_WF_F] = (val)
#define word_set_H(w, val) (w)->feat_array[FEAT_TYPE_H] = (val) #define word_set_G(w, val) (w)->wf_array[MCD_WF_G] = (val)
#define word_set_I(w, val) (w)->feat_array[FEAT_TYPE_I] = (val) #define word_set_H(w, val) (w)->wf_array[MCD_WF_H] = (val)
#define word_set_J(w, val) (w)->feat_array[FEAT_TYPE_J] = (val) #define word_set_I(w, val) (w)->wf_array[MCD_WF_I] = (val)
#define word_set_K(w, val) (w)->feat_array[FEAT_TYPE_K] = (val) #define word_set_J(w, val) (w)->wf_array[MCD_WF_J] = (val)
#define word_set_L(w, val) (w)->feat_array[FEAT_TYPE_L] = (val) #define word_set_K(w, val) (w)->wf_array[MCD_WF_K] = (val)
#define word_set_M(w, val) (w)->feat_array[FEAT_TYPE_M] = (val) #define word_set_L(w, val) (w)->wf_array[MCD_WF_L] = (val)
#define word_set_N(w, val) (w)->feat_array[FEAT_TYPE_N] = (val) #define word_set_M(w, val) (w)->wf_array[MCD_WF_M] = (val)
#define word_set_O(w, val) (w)->feat_array[FEAT_TYPE_O] = (val) #define word_set_N(w, val) (w)->wf_array[MCD_WF_N] = (val)
#define word_set_P(w, val) (w)->feat_array[FEAT_TYPE_P] = (val) #define word_set_O(w, val) (w)->wf_array[MCD_WF_O] = (val)
#define word_set_Q(w, val) (w)->feat_array[FEAT_TYPE_Q] = (val) #define word_set_P(w, val) (w)->wf_array[MCD_WF_P] = (val)
#define word_set_R(w, val) (w)->feat_array[FEAT_TYPE_R] = (val) #define word_set_Q(w, val) (w)->wf_array[MCD_WF_Q] = (val)
#define word_set_S(w, val) (w)->feat_array[FEAT_TYPE_S] = (val) #define word_set_R(w, val) (w)->wf_array[MCD_WF_R] = (val)
#define word_set_T(w, val) (w)->feat_array[FEAT_TYPE_T] = (val) #define word_set_S(w, val) (w)->wf_array[MCD_WF_S] = (val)
#define word_set_U(w, val) (w)->feat_array[FEAT_TYPE_U] = (val) #define word_set_T(w, val) (w)->wf_array[MCD_WF_T] = (val)
#define word_set_V(w, val) (w)->feat_array[FEAT_TYPE_V] = (val) #define word_set_U(w, val) (w)->wf_array[MCD_WF_U] = (val)
#define word_set_W(w, val) (w)->feat_array[FEAT_TYPE_W] = (val) #define word_set_V(w, val) (w)->wf_array[MCD_WF_V] = (val)
#define word_set_X(w, val) (w)->feat_array[FEAT_TYPE_X] = (val) #define word_set_W(w, val) (w)->wf_array[MCD_WF_W] = (val)
#define word_set_X(w, val) (w)->wf_array[MCD_WF_X] = (val)
#define word_set_Y(w, val) (w)->feat_array[FEAT_TYPE_Y] = (val) #define word_set_Y(w, val) (w)->wf_array[MCD_WF_Y] = (val)
#define word_set_Z(w, val) (w)->feat_array[FEAT_TYPE_Z] = (val) #define word_set_Z(w, val) (w)->wf_array[MCD_WF_Z] = (val)
#define word_set_signature(w, val) (w)->signature = (val) #define word_set_signature(w, val) (w)->signature = (val)
/* accessors for the relative_index field of a word (w is a pointer to a word) */
/* the setter expands to a bare assignment expression, like the other word_set_* macros */
#define word_set_relative_index(w, val) (w)->relative_index = (val)
#define word_get_relative_index(w) (w)->relative_index
typedef struct _word { typedef struct _word {
int feat_array[FEAT_TYPE_NB]; /* array containing the codes corresponding to the different word features */ int wf_array[MCD_WF_NB]; /* array containing the codes corresponding to the different word features */
char *input; /* the string corresponding to the actual line in the corpus file */ char *input; /* the string corresponding to the actual line in the corpus file */
int U1; /* does the form begin with an uppercase character */ int U1; /* does the form begin with an uppercase character */
int signature; /* pos tags that this form can have (represented as a boolean string) */ int signature; /* pos tags that this form can have (represented as a boolean string) */
int label; int label;
char *form; char *form;
int relative_index;
} word; } word;
word *word_new(char *input); word *word_new(char *input);
...@@ -92,11 +100,13 @@ word *word_create_dummy(mcd *mcd_struct); ...@@ -92,11 +100,13 @@ word *word_create_dummy(mcd *mcd_struct);
word *word_copy(word *w); word *word_copy(word *w);
void word_free(word *w); void word_free(word *w);
void word_print2(FILE *f, word *w);
void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels); void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels);
word *word_read(FILE *f, mcd *mcd_struct); word *word_read(FILE *f, mcd *mcd_struct);
word *word_parse_buffer(char *buffer, mcd *mcd_struct); word *word_parse_buffer(char *buffer, mcd *mcd_struct);
int word_is_eos(word *w, mcd *mcd_struct);
int word_get_gov_relative_index(word *w);
#endif #endif
...@@ -89,9 +89,8 @@ void dico_print_fh(FILE *f, dico *d) ...@@ -89,9 +89,8 @@ void dico_print_fh(FILE *f, dico *d)
void dico_print(char *filename, dico *d) void dico_print(char *filename, dico *d)
{ {
FILE *f; FILE *f;
if(filename == NULL){ if(filename == NULL)
f = stdout; f = stdout;
}
else{ else{
f= fopen(filename, "w"); f= fopen(filename, "w");
if(f == NULL){ if(f == NULL){
...@@ -100,7 +99,9 @@ void dico_print(char *filename, dico *d) ...@@ -100,7 +99,9 @@ void dico_print(char *filename, dico *d)
} }
} }
dico_print_fh(f, d); dico_print_fh(f, d);
fclose(f);
if(filename != NULL)
fclose(f);
} }
int dico_add(dico *d, char *key) int dico_add(dico *d, char *key)
...@@ -136,7 +137,9 @@ char *dico_int2string(dico *d, int val) ...@@ -136,7 +137,9 @@ char *dico_int2string(dico *d, int val)
int dico_string2int(dico *d, char *string) int dico_string2int(dico *d, char *string)
{ {
cell *c = hash_lookup(d->htable, string); cell *c;
c= hash_lookup(d->htable, string);
if(c) if(c)
return c->val; return c->val;
else else
...@@ -162,6 +165,7 @@ dico *dico_extract_from_corpus(char *filename, int column, char *dico_name) ...@@ -162,6 +165,7 @@ dico *dico_extract_from_corpus(char *filename, int column, char *dico_name)
column_nb = 0; column_nb = 0;
do{ do{
if(column_nb == column){ if(column_nb == column){
/* printf("token = %s\n", token); */
dico_add(d, token); dico_add(d, token);
} }
column_nb++; column_nb++;
......
#include<stdio.h> #include<stdio.h>
#include<stdlib.h> #include<stdlib.h>
#include<string.h> #include<string.h>
...@@ -7,27 +8,26 @@ ...@@ -7,27 +8,26 @@
#include "dico.h" #include "dico.h"
#include "word_emb.h" #include "word_emb.h"
mcd *mcd_new(int nb_col) mcd *mcd_new(int nb_col)
{ {
mcd *m = (mcd *)memalloc(sizeof(mcd)); mcd *m = (mcd *)memalloc(sizeof(mcd));
int i; int i;
m->nb_col = nb_col; m->nb_col = nb_col;
for(i=0; i < FEAT_TYPE_NB; i++) for(i=0; i < MCD_WF_NB; i++)
m->type2col[i] = -1; m->wf2col[i] = -1;
m->representation = (int *) memalloc(nb_col * sizeof(int)); m->representation = (int *) memalloc(nb_col * sizeof(int));
m->type = (int *) memalloc(nb_col * sizeof(int)); m->wf = (int *) memalloc(nb_col * sizeof(int));
m->type_str = (char **) memalloc(nb_col * sizeof(char *)); m->wf_str = (char **) memalloc(nb_col * sizeof(char *));
m->filename = (char **) memalloc(nb_col * sizeof(char *)); m->filename = (char **) memalloc(nb_col * sizeof(char *));
m->dico_array = (dico **) memalloc(nb_col * sizeof(dico *)); m->dico_array = (dico **) memalloc(nb_col * sizeof(dico *));
m->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *)); m->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *));
for(i=0; i < nb_col; i++){ for(i=0; i < nb_col; i++){
m->representation[i] = MCD_REPRESENTATION_NULL; m->representation[i] = MCD_REPRESENTATION_NULL;
m->type[i] = -1; m->wf[i] = -1;
m->type_str[i] = NULL; m->wf_str[i] = NULL;
m->filename[i] = NULL; m->filename[i] = NULL;
m->dico_array[i] = NULL; m->dico_array[i] = NULL;
m->word_emb_array[i] = NULL;; m->word_emb_array[i] = NULL;;
...@@ -41,14 +41,14 @@ void mcd_free(mcd *m) ...@@ -41,14 +41,14 @@ void mcd_free(mcd *m)
for(i=0; i < m->nb_col; i++){ for(i=0; i < m->nb_col; i++){
if(m->dico_array[i]) dico_free(m->dico_array[i]); if(m->dico_array[i]) dico_free(m->dico_array[i]);
if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]); if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]);
if(m->type_str[i]) free(m->type_str[i]); if(m->wf_str[i]) free(m->wf_str[i]);
} }
free(m->representation); free(m->representation);
free(m->filename); free(m->filename);
free(m->dico_array); free(m->dico_array);
free(m->word_emb_array); free(m->word_emb_array);
free(m->type_str); free(m->wf_str);
free(m->type); free(m->wf);
free(m); free(m);
} }
...@@ -58,7 +58,7 @@ void mcd_free(mcd *m) ...@@ -58,7 +58,7 @@ void mcd_free(mcd *m)
int mcd_get_code(mcd *m, char *str, int col){ int mcd_get_code(mcd *m, char *str, int col){
if(m->representation[col] == MCD_REPRESENTATION_VOCAB) if(m->representation[col] == MCD_REPRESENTATION_VOCAB)
return dico_string2int(m->dico_array[col], str); return (m->dico_array[col])? dico_string2int(m->dico_array[col], str) : -1;
if(m->representation[col] == MCD_REPRESENTATION_EMB) if(m->representation[col] == MCD_REPRESENTATION_EMB)
return word_emb_get_code(m->word_emb_array[col], str); return word_emb_get_code(m->word_emb_array[col], str);
if(m->representation[col] == MCD_REPRESENTATION_INT) if(m->representation[col] == MCD_REPRESENTATION_INT)
...@@ -74,7 +74,7 @@ int mcd_max_column_index_in_file(char *mcd_filename) ...@@ -74,7 +74,7 @@ int mcd_max_column_index_in_file(char *mcd_filename)
FILE *f = myfopen(mcd_filename, "r"); FILE *f = myfopen(mcd_filename, "r");
char buffer[1000]; /* ugly */ char buffer[1000]; /* ugly */
int column; int column;
char type[100]; char wf[100];
char representation[100]; char representation[100];
char filename[500]; /* ugly */ char filename[500]; /* ugly */
int fields_number; int fields_number;
...@@ -84,7 +84,7 @@ int mcd_max_column_index_in_file(char *mcd_filename) ...@@ -84,7 +84,7 @@ int mcd_max_column_index_in_file(char *mcd_filename)
line_number++; line_number++;
if(feof(f)) break; if(feof(f)) break;
if((buffer[0] == '\n') || (buffer[0] == '#')) continue; if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename);
if(fields_number != 4){ if(fields_number != 4){
fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename);
continue; continue;
...@@ -106,8 +106,8 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename) ...@@ -106,8 +106,8 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename)
if((m->representation[column] == MCD_REPRESENTATION_VOCAB) if((m->representation[column] == MCD_REPRESENTATION_VOCAB)
/* && (strcmp(m->filename[column], "_")) */ /* && (strcmp(m->filename[column], "_")) */
&& (m->dico_array[column] == NULL)){ && (m->dico_array[column] == NULL)){
m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, m->type_str[column]); m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, m->wf_str[column]);
fprintf(stderr, "extracting dico %s from corpus\n", m->type_str[column]); fprintf(stderr, "extracting dico %s \tfrom corpus\n", m->wf_str[column]);
} }
} }
} }
...@@ -123,8 +123,8 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose) ...@@ -123,8 +123,8 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
if((m->representation[column] == MCD_REPRESENTATION_VOCAB) if((m->representation[column] == MCD_REPRESENTATION_VOCAB)
&& (!strcmp(m->filename[column], "_")) && (!strcmp(m->filename[column], "_"))
&& (m->dico_array[column] == NULL)){ && (m->dico_array[column] == NULL)){
m->dico_array[column] = dico_vec_get_dico(vocabs, m->type_str[column]); m->dico_array[column] = dico_vec_get_dico(vocabs, m->wf_str[column]);
if(verbose) fprintf(stderr, "linking to dico %s\n", m->type_str[column]); if(verbose) fprintf(stderr, "linking to dico %s\n", m->wf_str[column]);
} }
} }
} }
...@@ -134,7 +134,7 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose) ...@@ -134,7 +134,7 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
mcd *mcd_read(char *mcd_filename, int verbose) mcd *mcd_read(char *mcd_filename, int verbose)
{ {
int column; int column;
char type[100]; char wf[100];
char representation[100]; char representation[100];
char filename[500]; /* ugly */ char filename[500]; /* ugly */
int fields_number; int fields_number;
...@@ -149,19 +149,20 @@ mcd *mcd_read(char *mcd_filename, int verbose) ...@@ -149,19 +149,20 @@ mcd *mcd_read(char *mcd_filename, int verbose)
line_number++; line_number++;
if(feof(f)) break; if(feof(f)) break;
if((buffer[0] == '\n') || (buffer[0] == '#')) continue; if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename); fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename);
if(fields_number != 4){ if(fields_number != 4){
/* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */ /* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */
continue; continue;
} }
if(verbose) fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename); if(verbose) fprintf(stderr, "column = %d\tword feature = %s\trepresentation = %s\tfilename = %s\n", column, wf, representation, filename);
m->type[column] = feat_type_string2int(type); m->wf[column] = mcd_wf_code(wf);
m->type_str[column] = strdup(type); m->wf_str[column] = strdup(wf);
if(m->type[column] == -1){ if(m->wf[column] == -1){
fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename); fprintf(stderr, "in line %d of mcd file %s invalid wf, I'm skipping it\n", line_number, mcd_filename);
continue; continue;
} }
m->type2col[m->type[column]] = column;
m->wf2col[m->wf[column]] = column;
if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL; if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL;
else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB; else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB;
...@@ -185,6 +186,7 @@ mcd *mcd_read(char *mcd_filename, int verbose) ...@@ -185,6 +186,7 @@ mcd *mcd_read(char *mcd_filename, int verbose)
} }
} }
} }
fclose(f); fclose(f);
return m; return m;
} }
...@@ -194,53 +196,53 @@ mcd *mcd_read(char *mcd_filename, int verbose) ...@@ -194,53 +196,53 @@ mcd *mcd_read(char *mcd_filename, int verbose)
mcd *mcd_build_conll07(void)