/* mcd.h -- multi column description (mcd) of corpus file formats */
#ifndef __MCD__
#define __MCD__
/* Representation modes of a column. The names mirror the four values that */
/* the third field of an mcd line can take (_, EMB, VOCAB, INT). */
#define MCD_REPRESENTATION_NULL 0  /* no internal representation */
#define MCD_REPRESENTATION_EMB 1   /* real valued vector (embedding) */
#define MCD_REPRESENTATION_VOCAB 2 /* integer code corresponding to the token */
#define MCD_REPRESENTATION_INT 3   /* the token is already an integer */

/* Marker for an invalid value (meaning taken from the name; its uses are */
/* outside this header). The replacement list is parenthesized so that the */
/* negative literal remains a single operand wherever the macro expands. */
#define MCD_INVALID_VALUE (-1)
/* Word feature codes. They are used as indices into mcd.wf2col, so the codes */
/* run from 0 to MCD_WF_NB - 1 without gaps. */
#define MCD_WF_NB        47 /* number of word feature codes */

/* named features */
#define MCD_WF_ID         0
#define MCD_WF_FORM       1
#define MCD_WF_LEMMA      2
#define MCD_WF_CPOS       3
#define MCD_WF_POS        4
#define MCD_WF_FEATS      5
#define MCD_WF_GOV        6
#define MCD_WF_LABEL      7
#define MCD_WF_STAG       8
#define MCD_WF_SENT_SEG   9

/* single letter features A ... Z */
#define MCD_WF_A         10
#define MCD_WF_B         11
#define MCD_WF_C         12
#define MCD_WF_D         13
#define MCD_WF_E         14
#define MCD_WF_F         15
#define MCD_WF_G         16
#define MCD_WF_H         17
#define MCD_WF_I         18
#define MCD_WF_J         19
#define MCD_WF_K         20
#define MCD_WF_L         21
#define MCD_WF_M         22
#define MCD_WF_N         23
#define MCD_WF_O         24
#define MCD_WF_P         25
#define MCD_WF_Q         26
#define MCD_WF_R         27
#define MCD_WF_S         28
#define MCD_WF_T         29
#define MCD_WF_U         30
#define MCD_WF_V         31
#define MCD_WF_W         32
#define MCD_WF_X         33
#define MCD_WF_Y         34
#define MCD_WF_Z         35

/* mixed case feature names, in alphabetical order */
#define MCD_WF_Aspect    36
#define MCD_WF_Case      37
#define MCD_WF_Clitic    38
#define MCD_WF_Definite  39
#define MCD_WF_Gender    40
#define MCD_WF_Mood      41
#define MCD_WF_NameType  42
#define MCD_WF_NounType  43
#define MCD_WF_Number    44
#define MCD_WF_Person    45
#define MCD_WF_Tense     46
/* Feature names that have no MCD_WF_* code in this header:
Abbr
AdpType
AdvType
Animacy
Animacy[gram]
ConjType
Connegative
Degree
Derivation
Dialect
Echo
Evident
Foreign
Form
Gender[dat]
Gender[erg]
Gender[psor]
HebBinyan
HebExistential
HebSource
Hyph
InfForm
Number[abs]
Number[dat]
Number[erg]
Number[psed]
Number[psor]
NumForm
NumType
NumValue
PartForm
PartType
Person[abs]
Person[dat]
Person[erg]
Person[psor]
Polarity
Polite
Polite[abs]
Polite[dat]
Polite[erg]
Position
Poss
Prefix
PrepCase
PrepForm
PronType
PunctSide
PunctType
Reflex
Strength
Style
Subcat
Typo
Variant
VerbForm
VerbType
Voice
Xtra*/
#include "dico.h"
#include "word_emb.h"
#include "dico_vec.h"
/* Accessor macros. m is a pointer to an mcd structure and is evaluated */
/* exactly once by each macro. Every expansion is fully parenthesized so */
/* that it behaves as a single operand inside a larger expression; the */
/* getters remain usable as lvalues. */

/* Entry of dico_array at index MCD_WF_LABEL. */
/* NOTE(review): dico_array is described as holding one dico per column, */
/* yet it is indexed here by a word feature code rather than by */
/* wf2col[MCD_WF_LABEL]. Left unchanged; confirm against the callers. */
#define mcd_get_dico_label(m) ((m)->dico_array[MCD_WF_LABEL])

/* Column in which each named word feature is represented. */
#define mcd_get_index_col(m) ((m)->wf2col[MCD_WF_ID])
#define mcd_get_form_col(m) ((m)->wf2col[MCD_WF_FORM])
#define mcd_get_lemma_col(m) ((m)->wf2col[MCD_WF_LEMMA])
#define mcd_get_cpos_col(m) ((m)->wf2col[MCD_WF_CPOS])
#define mcd_get_pos_col(m) ((m)->wf2col[MCD_WF_POS])
#define mcd_get_feats_col(m) ((m)->wf2col[MCD_WF_FEATS])
#define mcd_get_gov_col(m) ((m)->wf2col[MCD_WF_GOV])
#define mcd_get_label_col(m) ((m)->wf2col[MCD_WF_LABEL])
#define mcd_get_stag_col(m) ((m)->wf2col[MCD_WF_STAG])
#define mcd_get_sent_seg_col(m) ((m)->wf2col[MCD_WF_SENT_SEG])

/* Column in which each single letter word feature (A ... Z) is represented. */
#define mcd_get_a_col(m) ((m)->wf2col[MCD_WF_A])
#define mcd_get_b_col(m) ((m)->wf2col[MCD_WF_B])
#define mcd_get_c_col(m) ((m)->wf2col[MCD_WF_C])
#define mcd_get_d_col(m) ((m)->wf2col[MCD_WF_D])
#define mcd_get_e_col(m) ((m)->wf2col[MCD_WF_E])
#define mcd_get_f_col(m) ((m)->wf2col[MCD_WF_F])
#define mcd_get_g_col(m) ((m)->wf2col[MCD_WF_G])
#define mcd_get_h_col(m) ((m)->wf2col[MCD_WF_H])
#define mcd_get_i_col(m) ((m)->wf2col[MCD_WF_I])
#define mcd_get_j_col(m) ((m)->wf2col[MCD_WF_J])
#define mcd_get_k_col(m) ((m)->wf2col[MCD_WF_K])
#define mcd_get_l_col(m) ((m)->wf2col[MCD_WF_L])
#define mcd_get_m_col(m) ((m)->wf2col[MCD_WF_M])
#define mcd_get_n_col(m) ((m)->wf2col[MCD_WF_N])
#define mcd_get_o_col(m) ((m)->wf2col[MCD_WF_O])
#define mcd_get_p_col(m) ((m)->wf2col[MCD_WF_P])
#define mcd_get_q_col(m) ((m)->wf2col[MCD_WF_Q])
#define mcd_get_r_col(m) ((m)->wf2col[MCD_WF_R])
#define mcd_get_s_col(m) ((m)->wf2col[MCD_WF_S])
#define mcd_get_t_col(m) ((m)->wf2col[MCD_WF_T])
#define mcd_get_u_col(m) ((m)->wf2col[MCD_WF_U])
#define mcd_get_v_col(m) ((m)->wf2col[MCD_WF_V])
#define mcd_get_w_col(m) ((m)->wf2col[MCD_WF_W])
#define mcd_get_x_col(m) ((m)->wf2col[MCD_WF_X])
#define mcd_get_y_col(m) ((m)->wf2col[MCD_WF_Y])
#define mcd_get_z_col(m) ((m)->wf2col[MCD_WF_Z])

/* Record v as the column of the FORM word feature. The macro writes the */
/* wf2col entry that mcd_get_form_col reads; it previously wrote */
/* wf[MCD_WF_FORM], an array indexed by column rather than by word feature */
/* code, so the stored value was not visible through the getter. The */
/* per-column wf array is not updated by this macro. */
#define mcd_set_form_col(m, v) ((m)->wf2col[MCD_WF_FORM] = (v))
/* mcd (multi column description) files describe the format of corpus files. */
/* Every line of an mcd file describes the content of one column of the corpus file. */
/* Every line contains four fields separated by a space character. */
/* The first field is the index of the described column (the first column has index one). */
/* The second field is the name of the column. This name must be taken from the following list: */
/* INDEX, FORM, LEMMA, CPOS, POS, FEAT, LABEL, STAG, INT, GOV, A ... Z */
/* The third field is the internal representation of the tokens found in the described column. Four values are possible: */
/* VOCAB if the internal representation is an integer code corresponding to the token */
/* INT if the token is already an integer and its internal value is that same integer */
/* EMB if the internal representation of the token is a real valued vector (an embedding) */
/* _ if no internal representation is associated with the field */
/* The fourth field is the name of a file in which the encoding is stored. This file is either a dico file (see dico.h) or an embedding file (see word_emb.h). */
/* In-memory description of an mcd file. wf2col is indexed by word feature */
/* code (MCD_WF_*); the other arrays are indexed by column. */
typedef struct {
  /* number of columns in the mcd file */
  int nb_col;

  /* for each word feature (MCD_WF_FORM, MCD_WF_LEMMA ... MCD_WF_A ... MCD_WF_Z), */
  /* the column in which it is represented */
  int wf2col[MCD_WF_NB];

  /* for each column, the word feature it corresponds to */
  int *wf;

  /* for each column, the word feature as a string */
  char **wf_str;

  /* for each column, its representation mode (integer, vocabulary, embedding, NULL) */
  int *representation;

  /* for each column, the file in which its different values are represented */
  char **filename;

  /* for each column, the corresponding dico (NULL if no file) */
  dico **dico_array;

  /* for each column, the corresponding word embedding structure (NULL if no file) */
  word_emb **word_emb_array;
} mcd;
/* Functions operating on mcd structures. Their definitions are not part of */
/* this header: the notes below rely on names and signatures only and */
/* should be confirmed against the implementation. */

/* Builders that take no argument and return an mcd pointer. */
/* NOTE(review): each presumably builds a predefined column layout named by */
/* its suffix (conll07, ifpls, wplgf, wplgfs, wpmlgfs) -- confirm. */
mcd *mcd_build_conll07(void);
mcd *mcd_build_ifpls(void);
mcd *mcd_build_wplgf(void);
mcd *mcd_build_wplgfs(void);
mcd *mcd_build_wpmlgfs(void);

/* Takes the name of an mcd file and a verbosity flag, returns an mcd pointer. */
/* NOTE(review): behaviour on an unreadable or malformed file is not visible here. */
mcd *mcd_read(char *mcd_filename, int verbose);

/* Takes an mcd and a dico_vec (see dico_vec.h). */
/* NOTE(review): presumably fills m->dico_array from vocabs -- confirm. */
void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);

/* Takes an mcd and the name of a corpus file. */
/* NOTE(review): presumably builds the dicos of m from that corpus -- confirm. */
void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename);

/* NOTE(review): presumably releases m and what it owns; ownership of the */
/* dico and word_emb entries is not stated in this header. */
void mcd_free(mcd *m);

/* Takes a string and a column, returns an int. */
/* NOTE(review): presumably the internal code of str in column col; the */
/* value returned for an unknown string is not visible here. */
int mcd_get_code(mcd *m, char *str, int col);

/* Takes an mcd, returns a dico_vec pointer. */
dico_vec *mcd_build_dico_vec(mcd *mcd_struct);

/* Takes a word feature name, returns an int. */
/* NOTE(review): presumably the matching MCD_WF_* code -- confirm, including */
/* the value returned for an unknown name. */
int mcd_wf_code(char *wf);

/* Takes an mcd and a word feature code (MCD_WF_*). */
void mcd_remove_wf_column(mcd *m, int wf_code);

/* Takes an mcd, returns an mcd pointer. */
/* NOTE(review): whether the copy shares dico / word_emb pointers with m is */
/* not visible here. */
mcd *mcd_copy(mcd *m);

/* Takes a code and a column, returns a string. */
/* NOTE(review): presumably the inverse of mcd_get_code; ownership of the */
/* returned string is not stated. */
char *mcd_get_str(mcd *m, int code, int col);
#endif