Skip to content
Snippets Groups Projects
Select Git revision
  • b6a119771a69c8103a2b81d7cc5d1d3cb2f38dc6
  • master default protected
2 results

2016-03-08-code-sample.md

Blame
  • mcd.h 6.50 KiB
    /* mcd.h: multi column description (mcd) of corpus files -- word feature
       codes, column accessor macros, the mcd structure and its functions. */
    /* NOTE(review): identifiers beginning with two underscores are reserved for
       the C implementation; consider renaming this guard (e.g. MCD_H) once it is
       confirmed that no other file tests __MCD__. */
    #ifndef __MCD__
    #define __MCD__
    
    /* Representation modes of a column, stored in the representation array of
       the mcd struct (see the mcd file format notes in this header). */
    /* no internal representation is associated to the column */
    #define MCD_REPRESENTATION_NULL 0
    /* the token is represented by a real valued vector (an embedding) */
    #define MCD_REPRESENTATION_EMB 1
    /* the token is represented by an integer code corresponding to the token */
    #define MCD_REPRESENTATION_VOCAB 2
    /* the token is already an integer and is represented by that same integer */
    #define MCD_REPRESENTATION_INT 3
    
    /* Sentinel meaning "no valid value". The replacement list is parenthesized so
       the minus sign can never combine with a neighbouring operand and silently
       turn into a binary subtraction.
       NOTE(review): presumably used for absent columns or unknown codes --
       confirm against the implementation file. */
    #define MCD_INVALID_VALUE (-1)
    
    /* Number of word feature codes (the MCD_WF_* codes run from 0 to 46);
       it is the size of the wf2col array of the mcd struct. */
    #define MCD_WF_NB 47
    
    /* Word feature codes, used as indices into the wf2col array of the mcd
       struct. Codes 0 to 9 are named features; codes 10 to 35 are the letter
       named features A ... Z. */
    #define MCD_WF_ID 0
    #define MCD_WF_FORM 1
    #define MCD_WF_LEMMA 2
    #define MCD_WF_CPOS 3
    #define MCD_WF_POS 4
    #define MCD_WF_FEATS 5
    #define MCD_WF_GOV 6
    #define MCD_WF_LABEL 7
    #define MCD_WF_STAG 8
    #define MCD_WF_SENT_SEG 9
    #define MCD_WF_A 10
    #define MCD_WF_B 11
    #define MCD_WF_C 12
    #define MCD_WF_D 13
    #define MCD_WF_E 14
    #define MCD_WF_F 15
    #define MCD_WF_G 16
    #define MCD_WF_H 17
    #define MCD_WF_I 18
    #define MCD_WF_J 19
    #define MCD_WF_K 20
    #define MCD_WF_L 21
    #define MCD_WF_M 22
    #define MCD_WF_N 23
    #define MCD_WF_O 24
    #define MCD_WF_P 25
    #define MCD_WF_Q 26
    #define MCD_WF_R 27
    #define MCD_WF_S 28
    #define MCD_WF_T 29
    #define MCD_WF_U 30
    #define MCD_WF_V 31
    #define MCD_WF_W 32
    #define MCD_WF_X 33
    #define MCD_WF_Y 34
    #define MCD_WF_Z 35
    
    /* Word feature codes 36 to 46. This header defines no mcd_get_*_col
       accessor macro for them.
       NOTE(review): the names look like Universal Dependencies morphological
       feature names -- confirm. */
    #define MCD_WF_Aspect 36
    #define MCD_WF_Case 37
    #define MCD_WF_Clitic 38
    #define MCD_WF_Definite 39
    #define MCD_WF_Gender 40
    #define MCD_WF_Mood 41
    #define MCD_WF_NameType 42
    #define MCD_WF_NounType 43
    #define MCD_WF_Number 44
    #define MCD_WF_Person 45
    #define MCD_WF_Tense 46
    
    /* Other feature names; none of them has an MCD_WF_* code in this header:
    Abbr
    AdpType
    AdvType
    Animacy
    Animacy[gram]
    ConjType
    Connegative
    Degree
    Derivation
    Dialect
    Echo
    Evident
    Foreign
    Form
    Gender[dat]
    Gender[erg]
    Gender[psor]
    HebBinyan
    HebExistential
    HebSource
    Hyph
    InfForm
    
    Number[abs]
    Number[dat]
    Number[erg]
    Number[psed]
    Number[psor]
    NumForm
    NumType
    NumValue
    PartForm
    PartType
    Person[abs]
    Person[dat]
    Person[erg]
    Person[psor]
    Polarity
    Polite
    Polite[abs]
    Polite[dat]
    Polite[erg]
    Position
    Poss
    Prefix
    PrepCase
    PrepForm
    PronType
    PunctSide
    PunctType
    Reflex
    Strength
    Style
    Subcat
    Typo
    Variant
    VerbForm
    VerbType
    Voice
    Xtra*/
    
    
    
    #include "dico.h"
    #include "word_emb.h"
    #include "dico_vec.h"
    
    /* Expands to the entry of m->dico_array selected by the MCD_WF_LABEL code;
       the expansion is an lvalue.
       NOTE(review): the mcd struct describes dico_array as holding one dico per
       column, yet the index used here is a word feature code rather than
       wf2col[MCD_WF_LABEL] -- confirm that this indexing is intended. */
    #define mcd_get_dico_label(m) ((m)->dico_array[MCD_WF_LABEL])
    
    /*
     * Column accessors. Each macro takes a pointer m to an mcd and expands to
     * the wf2col entry of one word feature, i.e. the column in which that
     * feature is represented. The expansion is an element of the wf2col array,
     * so it can be read or assigned.
     */
    #define mcd_get_index_col(m)    ((m)->wf2col[MCD_WF_ID])
    #define mcd_get_form_col(m)     ((m)->wf2col[MCD_WF_FORM])
    #define mcd_get_lemma_col(m)    ((m)->wf2col[MCD_WF_LEMMA])
    #define mcd_get_cpos_col(m)     ((m)->wf2col[MCD_WF_CPOS])
    #define mcd_get_pos_col(m)      ((m)->wf2col[MCD_WF_POS])
    #define mcd_get_feats_col(m)    ((m)->wf2col[MCD_WF_FEATS])
    #define mcd_get_gov_col(m)      ((m)->wf2col[MCD_WF_GOV])
    #define mcd_get_label_col(m)    ((m)->wf2col[MCD_WF_LABEL])
    #define mcd_get_stag_col(m)     ((m)->wf2col[MCD_WF_STAG])
    #define mcd_get_sent_seg_col(m) ((m)->wf2col[MCD_WF_SENT_SEG])
    /* letter named features A ... Z */
    #define mcd_get_a_col(m)        ((m)->wf2col[MCD_WF_A])
    #define mcd_get_b_col(m)        ((m)->wf2col[MCD_WF_B])
    #define mcd_get_c_col(m)        ((m)->wf2col[MCD_WF_C])
    #define mcd_get_d_col(m)        ((m)->wf2col[MCD_WF_D])
    #define mcd_get_e_col(m)        ((m)->wf2col[MCD_WF_E])
    #define mcd_get_f_col(m)        ((m)->wf2col[MCD_WF_F])
    #define mcd_get_g_col(m)        ((m)->wf2col[MCD_WF_G])
    #define mcd_get_h_col(m)        ((m)->wf2col[MCD_WF_H])
    #define mcd_get_i_col(m)        ((m)->wf2col[MCD_WF_I])
    #define mcd_get_j_col(m)        ((m)->wf2col[MCD_WF_J])
    #define mcd_get_k_col(m)        ((m)->wf2col[MCD_WF_K])
    #define mcd_get_l_col(m)        ((m)->wf2col[MCD_WF_L])
    #define mcd_get_m_col(m)        ((m)->wf2col[MCD_WF_M])
    #define mcd_get_n_col(m)        ((m)->wf2col[MCD_WF_N])
    #define mcd_get_o_col(m)        ((m)->wf2col[MCD_WF_O])
    #define mcd_get_p_col(m)        ((m)->wf2col[MCD_WF_P])
    #define mcd_get_q_col(m)        ((m)->wf2col[MCD_WF_Q])
    #define mcd_get_r_col(m)        ((m)->wf2col[MCD_WF_R])
    #define mcd_get_s_col(m)        ((m)->wf2col[MCD_WF_S])
    #define mcd_get_t_col(m)        ((m)->wf2col[MCD_WF_T])
    #define mcd_get_u_col(m)        ((m)->wf2col[MCD_WF_U])
    #define mcd_get_v_col(m)        ((m)->wf2col[MCD_WF_V])
    #define mcd_get_w_col(m)        ((m)->wf2col[MCD_WF_W])
    #define mcd_get_x_col(m)        ((m)->wf2col[MCD_WF_X])
    #define mcd_get_y_col(m)        ((m)->wf2col[MCD_WF_Y])
    #define mcd_get_z_col(m)        ((m)->wf2col[MCD_WF_Z])
    
    
    
    
    /* Records v as the column in which the FORM feature is represented, for the
       mcd pointed to by m. It writes wf2col[MCD_WF_FORM], the same entry that
       mcd_get_form_col(m) reads. The previous expansion wrote wf[MCD_WF_FORM],
       which indexes the per-column wf array with a feature code and left
       wf2col unchanged.
       The wf array is not updated here; a caller that needs wf[v] to name the
       FORM feature must set it separately.
       The expansion is parenthesized so it can be used inside an expression. */
    #define mcd_set_form_col(m, v) ((m)->wf2col[MCD_WF_FORM] = (v))
    
    
    /* mcd (multi column description) files describe the format of corpus files */
    /* every line of an mcd file describes the content of a column of the corpus file */
    /* every line contains four fields separated by a space character */
    /* first field is the index of the column described (first column corresponds to index one) */
    /* second field is the name of the column. Such name must be taken from the following list: */
    /* INDEX, FORM, LEMMA, CPOS, POS, FEAT, LABEL, STAG, INT, GOV, A ... Z */
    /* third field is the internal representation of the tokens found in the described column. Four values are possible: */
    /* VOCAB if the internal representation is an integer code corresponding to the token */
    /* INT if the token is already an integer and its corresponding internal value is the same integer */
    /* EMB if the internal representation of the token is a real valued vector (an embedding). */
    /* _   if no internal representation is associated to the field */
    /* fourth field is the name of the file in which the encoding is stored; this file is either a dico format file (see dico.h) or an embedding file (see word_emb.h) */
    
    /* In-memory form of an mcd file. wf2col is indexed by word feature code
       (MCD_WF_*); the pointer members are arrays holding one entry per column
       (presumably nb_col entries each -- confirm against the implementation).
       NOTE(review): mcd_get_dico_label indexes dico_array with MCD_WF_LABEL,
       a feature code, although dico_array is described as per column --
       confirm which indexing is intended. */
    typedef struct {
      int nb_col;                 /* number of columns in the mcd file */
      int wf2col[MCD_WF_NB];      /* column in which each word feature (MCD_WF_FORM, MCD_WF_LEMMA ... MCD_WF_A ... MCD_WF_Z) is represented */
      int *wf;                    /* array containing the word feature that corresponds to each column */
      char **wf_str;              /* string version of the wf array */
      int *representation;        /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */
      char **filename;            /* array containing, for each column, the file in which its values are represented */
      dico **dico_array;          /* array containing the dico corresponding to each column (NULL if no file) */
      word_emb **word_emb_array;  /* array containing the word embedding structure corresponding to each column (NULL if no file) */
    } mcd;
    
    /* Builders that take no argument and return a pointer to an mcd.
       NOTE(review): presumably each one constructs a predefined column layout
       and the name suffix abbreviates that layout (conll07, ifpls, wplgf, ...);
       the implementation is not visible here -- confirm the layouts and who
       releases the returned mcd (mcd_free is declared in this header). */
    mcd *mcd_build_conll07(void);
    mcd *mcd_build_ifpls(void);
    mcd *mcd_build_wplgf(void);
    mcd *mcd_build_wplgfs(void);
    mcd *mcd_build_wpmlgfs(void);
    
    /* Functions operating on an mcd.
       NOTE(review): the one-line notes below are inferred from names and
       signatures only, the implementation is not visible from this header --
       confirm each of them before relying on it. */
    /* presumably builds an mcd from the mcd file named mcd_filename */
    mcd      *mcd_read(char *mcd_filename, int verbose);
    /* presumably attaches the dicos found in vocabs to the columns of m */
    void      mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);
    /* presumably fills the dicos of m from the corpus file corpus_filename */
    void      mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename);
    /* presumably releases m and what it owns */
    void      mcd_free(mcd *m);
    /* presumably returns the integer code of str in column col */
    int       mcd_get_code(mcd *m, char *str, int col);
    /* presumably collects the dicos of mcd_struct into a dico_vec */
    dico_vec *mcd_build_dico_vec(mcd *mcd_struct);
    /* presumably maps a word feature name to its MCD_WF_* code */
    int       mcd_wf_code(char *wf);
    /* presumably removes from m the column associated with wf_code */
    void      mcd_remove_wf_column(mcd *m, int wf_code);
    /* presumably returns a copy of m */
    mcd      *mcd_copy(mcd *m);
    /* presumably returns the string associated with code in column col */
    char     *mcd_get_str(mcd *m, int code, int col);
    
    #endif