Commit 1c1b0a6c authored by Alexis Nasr
Browse files

modified input file format for maca_trans_tagger and maca_trans_parser

parent fc0b36e8
......@@ -4,7 +4,6 @@ set(SOURCES src/util.c
src/word_emb.c
src/mcd.c
src/dico_vec.c
src/feat_types.c
src/form2pos.c
src/word.c
src/sentence.c
......
......@@ -6,19 +6,91 @@
#define MCD_REPRESENTATION_VOCAB 2
#define MCD_REPRESENTATION_INT 3
#define MCD_INVALID_VALUE -1
/* word feature (wf) codes */
/* MCD_WF_NB is the number of codes; it sizes the wf2col array of an mcd and the wf_array of a word */
/* codes 0 to 9 are named features, codes 10 to 35 are the features named by a single letter, A to Z */
#define MCD_WF_NB 36
#define MCD_WF_INDEX 0
#define MCD_WF_FORM 1
#define MCD_WF_LEMMA 2
#define MCD_WF_CPOS 3
#define MCD_WF_POS 4
#define MCD_WF_FEATS 5
#define MCD_WF_GOV 6
#define MCD_WF_LABEL 7
#define MCD_WF_STAG 8
#define MCD_WF_SENT_SEG 9
#define MCD_WF_A 10
#define MCD_WF_B 11
#define MCD_WF_C 12
#define MCD_WF_D 13
#define MCD_WF_E 14
#define MCD_WF_F 15
#define MCD_WF_G 16
#define MCD_WF_H 17
#define MCD_WF_I 18
#define MCD_WF_J 19
#define MCD_WF_K 20
#define MCD_WF_L 21
#define MCD_WF_M 22
#define MCD_WF_N 23
#define MCD_WF_O 24
#define MCD_WF_P 25
#define MCD_WF_Q 26
#define MCD_WF_R 27
#define MCD_WF_S 28
#define MCD_WF_T 29
#define MCD_WF_U 30
#define MCD_WF_V 31
#define MCD_WF_W 32
#define MCD_WF_X 33
#define MCD_WF_Y 34
#define MCD_WF_Z 35
#include "dico.h"
#include "feat_types.h"
#include "word_emb.h"
#include "dico_vec.h"
#define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL]
#define mcd_get_dico_label(m) (m)->dico_array[MCD_WF_LABEL]
/* accessors giving, for an mcd m, the column in which a given word feature is represented */
/* they read m->wf2col, whose entries are initialised to -1 by mcd_new */
#define mcd_get_index_col(m) (m)->wf2col[MCD_WF_INDEX]
#define mcd_get_form_col(m) (m)->wf2col[MCD_WF_FORM]
#define mcd_get_lemma_col(m) (m)->wf2col[MCD_WF_LEMMA]
#define mcd_get_cpos_col(m) (m)->wf2col[MCD_WF_CPOS]
#define mcd_get_pos_col(m) (m)->wf2col[MCD_WF_POS]
#define mcd_get_feats_col(m) (m)->wf2col[MCD_WF_FEATS]
#define mcd_get_gov_col(m) (m)->wf2col[MCD_WF_GOV]
#define mcd_get_label_col(m) (m)->wf2col[MCD_WF_LABEL]
#define mcd_get_stag_col(m) (m)->wf2col[MCD_WF_STAG]
#define mcd_get_sent_seg_col(m) (m)->wf2col[MCD_WF_SENT_SEG]
#define mcd_get_a_col(m) (m)->wf2col[MCD_WF_A]
#define mcd_get_b_col(m) (m)->wf2col[MCD_WF_B]
#define mcd_get_c_col(m) (m)->wf2col[MCD_WF_C]
#define mcd_get_d_col(m) (m)->wf2col[MCD_WF_D]
#define mcd_get_e_col(m) (m)->wf2col[MCD_WF_E]
#define mcd_get_f_col(m) (m)->wf2col[MCD_WF_F]
#define mcd_get_g_col(m) (m)->wf2col[MCD_WF_G]
#define mcd_get_h_col(m) (m)->wf2col[MCD_WF_H]
#define mcd_get_i_col(m) (m)->wf2col[MCD_WF_I]
#define mcd_get_j_col(m) (m)->wf2col[MCD_WF_J]
#define mcd_get_k_col(m) (m)->wf2col[MCD_WF_K]
#define mcd_get_l_col(m) (m)->wf2col[MCD_WF_L]
#define mcd_get_m_col(m) (m)->wf2col[MCD_WF_M]
#define mcd_get_n_col(m) (m)->wf2col[MCD_WF_N]
#define mcd_get_o_col(m) (m)->wf2col[MCD_WF_O]
#define mcd_get_p_col(m) (m)->wf2col[MCD_WF_P]
#define mcd_get_q_col(m) (m)->wf2col[MCD_WF_Q]
#define mcd_get_r_col(m) (m)->wf2col[MCD_WF_R]
#define mcd_get_s_col(m) (m)->wf2col[MCD_WF_S]
#define mcd_get_t_col(m) (m)->wf2col[MCD_WF_T]
#define mcd_get_u_col(m) (m)->wf2col[MCD_WF_U]
#define mcd_get_v_col(m) (m)->wf2col[MCD_WF_V]
#define mcd_get_w_col(m) (m)->wf2col[MCD_WF_W]
#define mcd_get_x_col(m) (m)->wf2col[MCD_WF_X]
#define mcd_get_y_col(m) (m)->wf2col[MCD_WF_Y]
#define mcd_get_z_col(m) (m)->wf2col[MCD_WF_Z]
#define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM]
#define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v)
#define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v)
/* mcd (multi column description) files describe the format of corpus files */
/* every line of an mcd file describes the content of a column of the corpus file */
......@@ -35,10 +107,9 @@
typedef struct {
int nb_col; /* number of columns in the mcd file */
int type2col[FEAT_TYPE_NB]; /* in which column is represented is the form (FEAT_TYPE_FORM) lemma ... represented */
/* int *col2type; */
int *type; /* array containing the type of every column */
char **type_str; /* a string version of array type */
int wf2col[MCD_WF_NB]; /* in which column are the word features (MCD_WF_FORM, MCD_WF_LEMMA ...) represented */
int *wf; /* array containing the word feature that corresponds to each column */
char **wf_str; /* a string version of array word feature */
int *representation; /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */
char **filename; /* array containing the file in which the different values for a column are represented */
dico **dico_array; /* array containing the dico corresponding to each column (NULL if no file) */
......@@ -54,5 +125,6 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename);
void mcd_free(mcd *m);
int mcd_get_code(mcd *m, char *str, int col);
dico_vec *mcd_build_dico_vec(mcd *mcd_struct);
int mcd_wf_code(char *wf);
#endif
......@@ -3,88 +3,96 @@
#include "mcd.h"
#define word_get_index(w) (w)->feat_array[FEAT_TYPE_INDEX]
#define word_get_form(w) (w)->feat_array[FEAT_TYPE_FORM]
#define word_get_lemma(w) (w)->feat_array[FEAT_TYPE_LEMMA]
#define word_get_cpos(w) (w)->feat_array[FEAT_TYPE_CPOS]
#define word_get_pos(w) (w)->feat_array[FEAT_TYPE_POS]
#define word_get_feats(w) (w)->feat_array[FEAT_TYPE_FEATS]
#define word_get_gov(w) (w)->feat_array[FEAT_TYPE_GOV]
#define word_get_label(w) (w)->feat_array[FEAT_TYPE_LABEL]
#define word_get_stag(w) (w)->feat_array[FEAT_TYPE_STAG]
#define word_get_A(w) (w)->feat_array[FEAT_TYPE_A]
#define word_get_B(w) (w)->feat_array[FEAT_TYPE_B]
#define word_get_C(w) (w)->feat_array[FEAT_TYPE_C]
#define word_get_D(w) (w)->feat_array[FEAT_TYPE_D]
#define word_get_E(w) (w)->feat_array[FEAT_TYPE_E]
#define word_get_F(w) (w)->feat_array[FEAT_TYPE_F]
#define word_get_G(w) (w)->feat_array[FEAT_TYPE_G]
#define word_get_H(w) (w)->feat_array[FEAT_TYPE_H]
#define word_get_I(w) (w)->feat_array[FEAT_TYPE_I]
#define word_get_J(w) (w)->feat_array[FEAT_TYPE_J]
#define word_get_K(w) (w)->feat_array[FEAT_TYPE_K]
#define word_get_L(w) (w)->feat_array[FEAT_TYPE_L]
#define word_get_M(w) (w)->feat_array[FEAT_TYPE_M]
#define word_get_N(w) (w)->feat_array[FEAT_TYPE_N]
#define word_get_O(w) (w)->feat_array[FEAT_TYPE_O]
#define word_get_P(w) (w)->feat_array[FEAT_TYPE_P]
#define word_get_Q(w) (w)->feat_array[FEAT_TYPE_Q]
#define word_get_R(w) (w)->feat_array[FEAT_TYPE_R]
#define word_get_S(w) (w)->feat_array[FEAT_TYPE_S]
#define word_get_T(w) (w)->feat_array[FEAT_TYPE_T]
#define word_get_U(w) (w)->feat_array[FEAT_TYPE_U]
#define word_get_V(w) (w)->feat_array[FEAT_TYPE_V]
#define word_get_W(w) (w)->feat_array[FEAT_TYPE_W]
#define word_get_X(w) (w)->feat_array[FEAT_TYPE_X]
#define word_get_Y(w) (w)->feat_array[FEAT_TYPE_Y]
#define word_get_Z(w) (w)->feat_array[FEAT_TYPE_Z]
/* accessors reading, for a word w, the code of one word feature */
/* each one indexes w->wf_array with the matching MCD_WF_* code; word_get_signature reads the signature field instead */
#define word_get_index(w) (w)->wf_array[MCD_WF_INDEX]
#define word_get_form(w) (w)->wf_array[MCD_WF_FORM]
#define word_get_lemma(w) (w)->wf_array[MCD_WF_LEMMA]
#define word_get_cpos(w) (w)->wf_array[MCD_WF_CPOS]
#define word_get_pos(w) (w)->wf_array[MCD_WF_POS]
#define word_get_feats(w) (w)->wf_array[MCD_WF_FEATS]
#define word_get_gov(w) (w)->wf_array[MCD_WF_GOV]
#define word_get_label(w) (w)->wf_array[MCD_WF_LABEL]
#define word_get_stag(w) (w)->wf_array[MCD_WF_STAG]
#define word_get_sent_seg(w) (w)->wf_array[MCD_WF_SENT_SEG]
#define word_get_A(w) (w)->wf_array[MCD_WF_A]
#define word_get_B(w) (w)->wf_array[MCD_WF_B]
#define word_get_C(w) (w)->wf_array[MCD_WF_C]
#define word_get_D(w) (w)->wf_array[MCD_WF_D]
#define word_get_E(w) (w)->wf_array[MCD_WF_E]
#define word_get_F(w) (w)->wf_array[MCD_WF_F]
#define word_get_G(w) (w)->wf_array[MCD_WF_G]
#define word_get_H(w) (w)->wf_array[MCD_WF_H]
#define word_get_I(w) (w)->wf_array[MCD_WF_I]
#define word_get_J(w) (w)->wf_array[MCD_WF_J]
#define word_get_K(w) (w)->wf_array[MCD_WF_K]
#define word_get_L(w) (w)->wf_array[MCD_WF_L]
#define word_get_M(w) (w)->wf_array[MCD_WF_M]
#define word_get_N(w) (w)->wf_array[MCD_WF_N]
#define word_get_O(w) (w)->wf_array[MCD_WF_O]
#define word_get_P(w) (w)->wf_array[MCD_WF_P]
#define word_get_Q(w) (w)->wf_array[MCD_WF_Q]
#define word_get_R(w) (w)->wf_array[MCD_WF_R]
#define word_get_S(w) (w)->wf_array[MCD_WF_S]
#define word_get_T(w) (w)->wf_array[MCD_WF_T]
#define word_get_U(w) (w)->wf_array[MCD_WF_U]
#define word_get_V(w) (w)->wf_array[MCD_WF_V]
#define word_get_W(w) (w)->wf_array[MCD_WF_W]
#define word_get_X(w) (w)->wf_array[MCD_WF_X]
#define word_get_Y(w) (w)->wf_array[MCD_WF_Y]
#define word_get_Z(w) (w)->wf_array[MCD_WF_Z]
#define word_get_signature(w) (w)->signature
#define word_set_index(w, val) (w)->feat_array[FEAT_TYPE_INDEX] = (val)
#define word_set_form(w, val) (w)->feat_array[FEAT_TYPE_FORM] = (val)
#define word_set_lemma(w, val) (w)->feat_array[FEAT_TYPE_LEMMA] = (val)
#define word_set_cpos(w, val) (w)->feat_array[FEAT_TYPE_CPOS] = (val)
#define word_set_pos(w, val) (w)->feat_array[FEAT_TYPE_POS] = (val)
#define word_set_feats(w, val) (w)->feat_array[FEAT_TYPE_FEATS] = (val)
#define word_set_gov(w, val) (w)->feat_array[FEAT_TYPE_GOV] = (val)
#define word_set_label(w, val) (w)->feat_array[FEAT_TYPE_LABEL] = (val)
#define word_set_stag(w, val) (w)->feat_array[FEAT_TYPE_STAG] = (val)
#define word_set_A(w, val) (w)->feat_array[FEAT_TYPE_A] = (val)
#define word_set_B(w, val) (w)->feat_array[FEAT_TYPE_B] = (val)
#define word_set_C(w, val) (w)->feat_array[FEAT_TYPE_C] = (val)
#define word_set_D(w, val) (w)->feat_array[FEAT_TYPE_D] = (val)
#define word_set_E(w, val) (w)->feat_array[FEAT_TYPE_E] = (val)
#define word_set_F(w, val) (w)->feat_array[FEAT_TYPE_F] = (val)
#define word_set_G(w, val) (w)->feat_array[FEAT_TYPE_G] = (val)
#define word_set_H(w, val) (w)->feat_array[FEAT_TYPE_H] = (val)
#define word_set_I(w, val) (w)->feat_array[FEAT_TYPE_I] = (val)
#define word_set_J(w, val) (w)->feat_array[FEAT_TYPE_J] = (val)
#define word_set_K(w, val) (w)->feat_array[FEAT_TYPE_K] = (val)
#define word_set_L(w, val) (w)->feat_array[FEAT_TYPE_L] = (val)
#define word_set_M(w, val) (w)->feat_array[FEAT_TYPE_M] = (val)
#define word_set_N(w, val) (w)->feat_array[FEAT_TYPE_N] = (val)
#define word_set_O(w, val) (w)->feat_array[FEAT_TYPE_O] = (val)
#define word_set_P(w, val) (w)->feat_array[FEAT_TYPE_P] = (val)
#define word_set_Q(w, val) (w)->feat_array[FEAT_TYPE_Q] = (val)
#define word_set_R(w, val) (w)->feat_array[FEAT_TYPE_R] = (val)
#define word_set_S(w, val) (w)->feat_array[FEAT_TYPE_S] = (val)
#define word_set_T(w, val) (w)->feat_array[FEAT_TYPE_T] = (val)
#define word_set_U(w, val) (w)->feat_array[FEAT_TYPE_U] = (val)
#define word_set_V(w, val) (w)->feat_array[FEAT_TYPE_V] = (val)
#define word_set_W(w, val) (w)->feat_array[FEAT_TYPE_W] = (val)
#define word_set_X(w, val) (w)->feat_array[FEAT_TYPE_X] = (val)
#define word_set_index(w, val) (w)->wf_array[MCD_WF_INDEX] = (val)
#define word_set_form(w, val) (w)->wf_array[MCD_WF_FORM] = (val)
#define word_set_lemma(w, val) (w)->wf_array[MCD_WF_LEMMA] = (val)
#define word_set_cpos(w, val) (w)->wf_array[MCD_WF_CPOS] = (val)
#define word_set_pos(w, val) (w)->wf_array[MCD_WF_POS] = (val)
#define word_set_feats(w, val) (w)->wf_array[MCD_WF_FEATS] = (val)
#define word_set_gov(w, val) (w)->wf_array[MCD_WF_GOV] = (val)
#define word_set_label(w, val) (w)->wf_array[MCD_WF_LABEL] = (val)
#define word_set_stag(w, val) (w)->wf_array[MCD_WF_STAG] = (val)
/* sets the sentence segmentation feature of word w to val; counterpart of word_get_sent_seg */
/* (the feature code is MCD_WF_SENT_SEG, there is no MCD_WF_WORD_SEG code) */
#define word_set_sent_seg(w, val) (w)->wf_array[MCD_WF_SENT_SEG] = (val)
/* former spelling of word_set_sent_seg, kept so that the old name still resolves */
#define word_set_word_seg(w, val) word_set_sent_seg(w, val)
#define word_set_A(w, val) (w)->wf_array[MCD_WF_A] = (val)
#define word_set_B(w, val) (w)->wf_array[MCD_WF_B] = (val)
#define word_set_C(w, val) (w)->wf_array[MCD_WF_C] = (val)
#define word_set_D(w, val) (w)->wf_array[MCD_WF_D] = (val)
#define word_set_E(w, val) (w)->wf_array[MCD_WF_E] = (val)
#define word_set_F(w, val) (w)->wf_array[MCD_WF_F] = (val)
#define word_set_G(w, val) (w)->wf_array[MCD_WF_G] = (val)
#define word_set_H(w, val) (w)->wf_array[MCD_WF_H] = (val)
#define word_set_I(w, val) (w)->wf_array[MCD_WF_I] = (val)
#define word_set_J(w, val) (w)->wf_array[MCD_WF_J] = (val)
#define word_set_K(w, val) (w)->wf_array[MCD_WF_K] = (val)
#define word_set_L(w, val) (w)->wf_array[MCD_WF_L] = (val)
#define word_set_M(w, val) (w)->wf_array[MCD_WF_M] = (val)
#define word_set_N(w, val) (w)->wf_array[MCD_WF_N] = (val)
#define word_set_O(w, val) (w)->wf_array[MCD_WF_O] = (val)
#define word_set_P(w, val) (w)->wf_array[MCD_WF_P] = (val)
#define word_set_Q(w, val) (w)->wf_array[MCD_WF_Q] = (val)
#define word_set_R(w, val) (w)->wf_array[MCD_WF_R] = (val)
#define word_set_S(w, val) (w)->wf_array[MCD_WF_S] = (val)
#define word_set_T(w, val) (w)->wf_array[MCD_WF_T] = (val)
#define word_set_U(w, val) (w)->wf_array[MCD_WF_U] = (val)
#define word_set_V(w, val) (w)->wf_array[MCD_WF_V] = (val)
#define word_set_W(w, val) (w)->wf_array[MCD_WF_W] = (val)
#define word_set_X(w, val) (w)->wf_array[MCD_WF_X] = (val)
#define word_set_Y(w, val) (w)->feat_array[FEAT_TYPE_Y] = (val)
#define word_set_Z(w, val) (w)->feat_array[FEAT_TYPE_Z] = (val)
#define word_set_Y(w, val) (w)->wf_array[MCD_WF_Y] = (val)
#define word_set_Z(w, val) (w)->wf_array[MCD_WF_Z] = (val)
#define word_set_signature(w, val) (w)->signature = (val)
#define word_set_relative_index(w, val) (w)->relative_index = (val)
#define word_get_relative_index(w) (w)->relative_index
/* a word: the codes of its features together with the corpus line it comes from (see input) */
typedef struct _word {
int feat_array[FEAT_TYPE_NB]; /* array containing the codes corresponding to the different word features */
int wf_array[MCD_WF_NB]; /* array containing the codes corresponding to the different word features, indexed by the MCD_WF_* codes */
char *input; /* the string corresponding to the actual line in the corpus file */
int U1; /* does the form begin with an uppercase character */
int signature; /* pos tags that this form can have (represented as a boolean string) */
int label;
char *form;
int relative_index; /* read and written through word_get_relative_index and word_set_relative_index */
} word;
word *word_new(char *input);
......@@ -92,11 +100,13 @@ word *word_create_dummy(mcd *mcd_struct);
word *word_copy(word *w);
void word_free(word *w);
void word_print2(FILE *f, word *w);
void word_print(FILE *f, word *w, mcd *mcd_struct, dico *dico_labels);
word *word_read(FILE *f, mcd *mcd_struct);
word *word_parse_buffer(char *buffer, mcd *mcd_struct);
int word_is_eos(word *w, mcd *mcd_struct);
int word_get_gov_relative_index(word *w);
#endif
......@@ -89,9 +89,8 @@ void dico_print_fh(FILE *f, dico *d)
void dico_print(char *filename, dico *d)
{
FILE *f;
if(filename == NULL){
if(filename == NULL)
f = stdout;
}
else{
f= fopen(filename, "w");
if(f == NULL){
......@@ -100,7 +99,9 @@ void dico_print(char *filename, dico *d)
}
}
dico_print_fh(f, d);
fclose(f);
if(filename != NULL)
fclose(f);
}
int dico_add(dico *d, char *key)
......@@ -136,7 +137,9 @@ char *dico_int2string(dico *d, int val)
int dico_string2int(dico *d, char *string)
{
cell *c = hash_lookup(d->htable, string);
cell *c;
c= hash_lookup(d->htable, string);
if(c)
return c->val;
else
......@@ -162,6 +165,7 @@ dico *dico_extract_from_corpus(char *filename, int column, char *dico_name)
column_nb = 0;
do{
if(column_nb == column){
/* printf("token = %s\n", token); */
dico_add(d, token);
}
column_nb++;
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
......@@ -7,27 +8,26 @@
#include "dico.h"
#include "word_emb.h"
mcd *mcd_new(int nb_col)
{
mcd *m = (mcd *)memalloc(sizeof(mcd));
int i;
m->nb_col = nb_col;
for(i=0; i < FEAT_TYPE_NB; i++)
m->type2col[i] = -1;
for(i=0; i < MCD_WF_NB; i++)
m->wf2col[i] = -1;
m->representation = (int *) memalloc(nb_col * sizeof(int));
m->type = (int *) memalloc(nb_col * sizeof(int));
m->type_str = (char **) memalloc(nb_col * sizeof(char *));
m->wf = (int *) memalloc(nb_col * sizeof(int));
m->wf_str = (char **) memalloc(nb_col * sizeof(char *));
m->filename = (char **) memalloc(nb_col * sizeof(char *));
m->dico_array = (dico **) memalloc(nb_col * sizeof(dico *));
m->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *));
for(i=0; i < nb_col; i++){
m->representation[i] = MCD_REPRESENTATION_NULL;
m->type[i] = -1;
m->type_str[i] = NULL;
m->wf[i] = -1;
m->wf_str[i] = NULL;
m->filename[i] = NULL;
m->dico_array[i] = NULL;
m->word_emb_array[i] = NULL;;
......@@ -41,14 +41,14 @@ void mcd_free(mcd *m)
for(i=0; i < m->nb_col; i++){
if(m->dico_array[i]) dico_free(m->dico_array[i]);
if(m->word_emb_array[i]) word_emb_free(m->word_emb_array[i]);
if(m->type_str[i]) free(m->type_str[i]);
if(m->wf_str[i]) free(m->wf_str[i]);
}
free(m->representation);
free(m->filename);
free(m->dico_array);
free(m->word_emb_array);
free(m->type_str);
free(m->type);
free(m->wf_str);
free(m->wf);
free(m);
}
......@@ -58,7 +58,7 @@ void mcd_free(mcd *m)
int mcd_get_code(mcd *m, char *str, int col){
if(m->representation[col] == MCD_REPRESENTATION_VOCAB)
return dico_string2int(m->dico_array[col], str);
return (m->dico_array[col])? dico_string2int(m->dico_array[col], str) : -1;
if(m->representation[col] == MCD_REPRESENTATION_EMB)
return word_emb_get_code(m->word_emb_array[col], str);
if(m->representation[col] == MCD_REPRESENTATION_INT)
......@@ -74,7 +74,7 @@ int mcd_max_column_index_in_file(char *mcd_filename)
FILE *f = myfopen(mcd_filename, "r");
char buffer[1000]; /* ugly */
int column;
char type[100];
char wf[100];
char representation[100];
char filename[500]; /* ugly */
int fields_number;
......@@ -84,7 +84,7 @@ int mcd_max_column_index_in_file(char *mcd_filename)
line_number++;
if(feof(f)) break;
if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename);
fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename);
if(fields_number != 4){
fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename);
continue;
......@@ -106,8 +106,8 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename)
if((m->representation[column] == MCD_REPRESENTATION_VOCAB)
/* && (strcmp(m->filename[column], "_")) */
&& (m->dico_array[column] == NULL)){
m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, m->type_str[column]);
fprintf(stderr, "extracting dico %s from corpus\n", m->type_str[column]);
m->dico_array[column] = dico_extract_from_corpus(corpus_filename, column, m->wf_str[column]);
fprintf(stderr, "extracting dico %s \tfrom corpus\n", m->wf_str[column]);
}
}
}
......@@ -123,8 +123,8 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
if((m->representation[column] == MCD_REPRESENTATION_VOCAB)
&& (!strcmp(m->filename[column], "_"))
&& (m->dico_array[column] == NULL)){
m->dico_array[column] = dico_vec_get_dico(vocabs, m->type_str[column]);
if(verbose) fprintf(stderr, "linking to dico %s\n", m->type_str[column]);
m->dico_array[column] = dico_vec_get_dico(vocabs, m->wf_str[column]);
if(verbose) fprintf(stderr, "linking to dico %s\n", m->wf_str[column]);
}
}
}
......@@ -134,7 +134,7 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
mcd *mcd_read(char *mcd_filename, int verbose)
{
int column;
char type[100];
char wf[100];
char representation[100];
char filename[500]; /* ugly */
int fields_number;
......@@ -149,19 +149,20 @@ mcd *mcd_read(char *mcd_filename, int verbose)
line_number++;
if(feof(f)) break;
if((buffer[0] == '\n') || (buffer[0] == '#')) continue;
fields_number = sscanf(buffer, "%d %s %s %s", &column, type, representation, filename);
fields_number = sscanf(buffer, "%d %s %s %s", &column, wf, representation, filename);
if(fields_number != 4){
/* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */
continue;
}
if(verbose) fprintf(stderr, "column = %d type = %s representation = %s filename = %s\n", column, type, representation, filename);
m->type[column] = feat_type_string2int(type);
m->type_str[column] = strdup(type);
if(m->type[column] == -1){
fprintf(stderr, "in line %d of mcd file %s invalid type, I'm skipping it\n", line_number, mcd_filename);
if(verbose) fprintf(stderr, "column = %d\tword feature = %s\trepresentation = %s\tfilename = %s\n", column, wf, representation, filename);
m->wf[column] = mcd_wf_code(wf);
m->wf_str[column] = strdup(wf);
if(m->wf[column] == -1){
fprintf(stderr, "in line %d of mcd file %s invalid wf, I'm skipping it\n", line_number, mcd_filename);
continue;
}
m->type2col[m->type[column]] = column;
m->wf2col[m->wf[column]] = column;
if(!strcmp(representation, "_")) m->representation[column] = MCD_REPRESENTATION_NULL;
else if(!strcmp(representation, "EMB")) m->representation[column] = MCD_REPRESENTATION_EMB;
......@@ -185,6 +186,7 @@ mcd *mcd_read(char *mcd_filename, int verbose)
}
}
}
fclose(f);
return m;
}
......@@ -194,53 +196,53 @@ mcd *mcd_read(char *mcd_filename, int verbose)
/* builds the mcd of an 8 column corpus file (CoNLL 07 layout, going by the function name) */
/* columns, in order: INDEX, FORM, LEMMA, CPOS, POS, FEATS, GOV, LABEL */
/* INDEX and GOV use the INT representation, the six other columns use the VOCAB representation */
/* every filename is set to "_"; the structure is allocated by mcd_new(8) and returned */
mcd *mcd_build_conll07(void)
{
mcd *m = mcd_new(8);
/* column 0: INDEX */
m->type[0]=FEAT_TYPE_INDEX;
m->type_str[0]=strdup("INDEX");
m->wf[0]=MCD_WF_INDEX;
m->wf_str[0]=strdup("INDEX");
m->representation[0]= MCD_REPRESENTATION_INT;
m->filename[0] = strdup("_");
m->type2col[FEAT_TYPE_INDEX] = 0;
m->wf2col[MCD_WF_INDEX] = 0;
/* column 1: FORM */
m->type[1]=FEAT_TYPE_FORM;
m->type_str[1]=strdup("FORM");
m->wf[1]=MCD_WF_FORM;
m->wf_str[1]=strdup("FORM");
m->representation[1]= MCD_REPRESENTATION_VOCAB;
m->filename[1] = strdup("_");
m->type2col[FEAT_TYPE_FORM] = 1;
m->wf2col[MCD_WF_FORM] = 1;
/* column 2: LEMMA */
m->type[2]=FEAT_TYPE_LEMMA;
m->type_str[2]=strdup("LEMMA");
m->wf[2]=MCD_WF_LEMMA;
m->wf_str[2]=strdup("LEMMA");
m->representation[2]= MCD_REPRESENTATION_VOCAB;
m->filename[2] = strdup("_");
m->type2col[FEAT_TYPE_LEMMA] = 2;
m->wf2col[MCD_WF_LEMMA] = 2;
/* column 3: CPOS */
m->type[3]=FEAT_TYPE_CPOS;
m->type_str[3]=strdup("CPOS");
m->wf[3]=MCD_WF_CPOS;
m->wf_str[3]=strdup("CPOS");
m->representation[3]= MCD_REPRESENTATION_VOCAB;
m->filename[3] = strdup("_");
m->type2col[FEAT_TYPE_CPOS] = 3;
m->wf2col[MCD_WF_CPOS] = 3;
/* column 4: POS */
m->type[4]=FEAT_TYPE_POS;
m->type_str[4]=strdup("POS");
m->wf[4]=MCD_WF_POS;
m->wf_str[4]=strdup("POS");
m->representation[4]= MCD_REPRESENTATION_VOCAB;
m->filename[4] = strdup("_");
m->type2col[FEAT_TYPE_POS] = 4;
m->wf2col[MCD_WF_POS] = 4;
/* column 5: FEATS */
m->type[5]=FEAT_TYPE_FEATS;
m->type_str[5]=strdup("FEATS");
m->wf[5]=MCD_WF_FEATS;
m->wf_str[5]=strdup("FEATS");
m->representation[5]= MCD_REPRESENTATION_VOCAB;
m->filename[5] = strdup("_");
m->type2col[FEAT_TYPE_FEATS] = 5;
m->wf2col[MCD_WF_FEATS] = 5;
/* column 6: GOV */
m->type[6]=FEAT_TYPE_GOV;
m->type_str[6]=strdup("GOV");
m->wf[6]=MCD_WF_GOV;
m->wf_str[6]=strdup("GOV");
m->representation[6]= MCD_REPRESENTATION_INT;
m->filename[6] = strdup("_");
m->type2col[FEAT_TYPE_GOV] = 6;
m->wf2col[MCD_WF_GOV] = 6;
/* column 7: LABEL */
m->type[7]=FEAT_TYPE_LABEL;
m->type_str[7]=strdup("LABEL");
m->wf[7]=MCD_WF_LABEL;
m->wf_str[7]=strdup("LABEL");
m->representation[7]= MCD_REPRESENTATION_VOCAB;
m->filename[7] = strdup("_");
m->type2col[FEAT_TYPE_LABEL] = 7;
m->wf2col[MCD_WF_LABEL] = 7;
return m;
}
......@@ -251,114 +253,45 @@ mcd *mcd_build_ifpls(void)
{
mcd *m = mcd_new(6);
m->type[0]=FEAT_TYPE_INDEX;
m->type_str[0]=strdup("INDEX");
m->wf[0]=MCD_WF_INDEX;
m->wf_str[0]=strdup("INDEX");
m->representation[0]= MCD_REPRESENTATION_INT;
m->filename[0] = strdup("_");
m->type2col[FEAT_TYPE_INDEX] = 0;
m->wf2col[MCD_WF_INDEX] = 0;
m->type[1]=FEAT_TYPE_FORM;
m->type_str[1]=strdup("FORM");
m->wf[1]=MCD_WF_FORM;
m->wf_str[1]=strdup("FORM");
m->representation[1]= MCD_REPRESENTATION_VOCAB;
m->filename[1] = strdup("_");
m->type2col[FEAT_TYPE_FORM] = 1;
m->wf2col[MCD_WF_FORM] = 1;
m->type[2]=FEAT_TYPE_POS;
m->type_str[2]=strdup("POS");
m->wf[2]=MCD_WF_POS;
m->wf_str[2]=strdup("POS");
m->representation[2]= MCD_REPRESENTATION_VOCAB;
m->filename[2] = strdup("_");
m->type2col[FEAT_TYPE_POS] = 2;
m->wf2col[MCD_WF_POS] = 2;
m->type[3]=FEAT_TYPE_LEMMA;
m->type_str[3]=strdup("LEMMA");
m->wf[3]=MCD_WF_LEMMA;
m->wf_str[3]=strdup("LEMMA");
m->representation[3]= MCD_REPRESENTATION_VOCAB