diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index fe4eecf2e8f13aac08e418f3973606db8e1ce32e..881b98e80fff65bd5031a234cd16ebea0f9f14fc 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -96,20 +96,20 @@ /* mcd (multi column description) files describe the format of corpus files */ /* every line of an mcd file describes the content of a column of the corpus file */ /* every line contains four fields separated by a space character */ -/* first field is the index of the column described (first column corresponds to index zero) */ -/* second field is the name of the column. Such must be taken from the following list: */ +/* first field is the index of the column described (first column corresponds to index one) */ +/* second field is the name of the column. Such name must be taken from the following list: */ /* INDEX, FORM, LEMMA, CPOS, POS, FEAT, LABEL, STAG, INT, GOV, A ... Z */ -/* third field correspond to the internal representation of the tokens found in the column described. Four values are possible : */ +/* third field corresponds to the internal representation of the tokens found in the column described. Four values are possible : */ /* VOCAB if the internal representation is an integer code corresponding to the token */ /* INT if the token is already an integer and its corresponding internal value is the same integer */ -/* EMB if the internal representation of the token is a real valued vector. */ +/* EMB if the internal representation of the token is a real valued vector (an embedding). */ /* _ if no internal representation is associated to the field */ /* fourth field is the name of a file in which the encoding is represented, this file can either be a dico (see dico.h) format file or an embedding file (see word_emb.h)*/ typedef struct { int nb_col; /* number of columns in the mcd file */ - int wf2col[MCD_WF_NB]; /* in which column are the word features (MCD_WF_FORM, MCD_WF_LEMMA ...) represented */ - int *wf; /* array containing the word feature that correspond to each column */ + int wf2col[MCD_WF_NB]; /* in which column are the word features (MCD_WF_FORM, MCD_WF_LEMMA ... MCD_WF_A ... MCD_WF_Z) represented */ + int *wf; /* array containing the word feature that corresponds to each column */ char **wf_str; /* a string version of array word feature */ int *representation; /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */ char **filename; /* array containing the file in which the different values for a columnn is represented */ @@ -122,15 +122,15 @@ mcd *mcd_build_ifpls(void); mcd *mcd_build_wplgf(void); mcd *mcd_build_wplgfs(void); -mcd *mcd_read(char *mcd_filename, int verbose); -void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose); -void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename); -void mcd_free(mcd *m); -int mcd_get_code(mcd *m, char *str, int col); +mcd *mcd_read(char *mcd_filename, int verbose); +void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose); +void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename); +void mcd_free(mcd *m); +int mcd_get_code(mcd *m, char *str, int col); dico_vec *mcd_build_dico_vec(mcd *mcd_struct); -int mcd_wf_code(char *wf); -void mcd_remove_wf_column(mcd *m, int wf_code); -mcd *mcd_copy(mcd *m); -char *mcd_get_str(mcd *m, int code, int col); +int mcd_wf_code(char *wf); +void mcd_remove_wf_column(mcd *m, int wf_code); +mcd *mcd_copy(mcd *m); +char *mcd_get_str(mcd *m, int code, int col); #endif