Skip to content
Snippets Groups Projects
Select Git revision
  • aeb14ceb0fdf3b6ef06d550b9d516030c2663947
  • master default protected
  • johannes
  • partial_parser
  • Aloui_Dary
  • ignore_punct
  • AC
  • classifier
  • fixhelp
  • libmacaon2
  • error_predictor
  • morpho
  • ssrnn
  • tfparsing
  • silvio
  • tagger_options
  • maca_trans_frame_parser
  • alexis
  • new_config
  • tagparse
  • maca_graph_parser
21 results

mcd.h

Blame
  • user avatar
    Silvio Ricardo Cordeiro authored
    aeb14ceb
    History
    mcd.h 5.63 KiB
    /* Include guard: the body of this header is only processed while __MCD__ is not yet defined. */
    /* NOTE(review): identifiers beginning with a double underscore are reserved for the C implementation; */
    /* consider renaming the guard (e.g. MCD_H) after checking that no other file tests __MCD__. */
    #ifndef __MCD__
    #define __MCD__
    
    /* Representation modes of a column, stored per column in the representation array of the mcd struct. */
    /* NOTE(review): presumably these map to the _, EMB, VOCAB and INT keywords of the mcd file format */
    /* described further down in this header; confirm against the parser in mcd.c. */
    #define MCD_REPRESENTATION_NULL 0   /* no internal representation */
    #define MCD_REPRESENTATION_EMB 1    /* real valued vector (embedding) */
    #define MCD_REPRESENTATION_VOCAB 2  /* integer code corresponding to the token */
    #define MCD_REPRESENTATION_INT 3    /* the token is already an integer */
    
    /* Sentinel integer denoting an invalid value (its exact uses live outside this header). */
    /* Parenthesized so that the unary minus always stays attached to the literal, whatever */
    /* tokens surround the macro at the point of expansion. */
    #define MCD_INVALID_VALUE (-1)
    
    /* Word feature (wf) codes. They are used as indices into the wf2col array of the mcd struct. */
    /* Named features occupy codes 0 to 9. */
    #define MCD_WF_ID        0
    #define MCD_WF_FORM      1
    #define MCD_WF_LEMMA     2
    #define MCD_WF_CPOS      3
    #define MCD_WF_POS       4
    #define MCD_WF_FEATS     5
    #define MCD_WF_GOV       6
    #define MCD_WF_LABEL     7
    #define MCD_WF_STAG      8
    #define MCD_WF_SENT_SEG  9
    /* Generic features A to Z occupy 26 consecutive codes starting at MCD_WF_A, */
    /* which is what lets mcd_get_letter_col() address them by offset. */
    #define MCD_WF_A 10
    #define MCD_WF_B (MCD_WF_A + 1)
    #define MCD_WF_C (MCD_WF_A + 2)
    #define MCD_WF_D (MCD_WF_A + 3)
    #define MCD_WF_E (MCD_WF_A + 4)
    #define MCD_WF_F (MCD_WF_A + 5)
    #define MCD_WF_G (MCD_WF_A + 6)
    #define MCD_WF_H (MCD_WF_A + 7)
    #define MCD_WF_I (MCD_WF_A + 8)
    #define MCD_WF_J (MCD_WF_A + 9)
    #define MCD_WF_K (MCD_WF_A + 10)
    #define MCD_WF_L (MCD_WF_A + 11)
    #define MCD_WF_M (MCD_WF_A + 12)
    #define MCD_WF_N (MCD_WF_A + 13)
    #define MCD_WF_O (MCD_WF_A + 14)
    #define MCD_WF_P (MCD_WF_A + 15)
    #define MCD_WF_Q (MCD_WF_A + 16)
    #define MCD_WF_R (MCD_WF_A + 17)
    #define MCD_WF_S (MCD_WF_A + 18)
    #define MCD_WF_T (MCD_WF_A + 19)
    #define MCD_WF_U (MCD_WF_A + 20)
    #define MCD_WF_V (MCD_WF_A + 21)
    #define MCD_WF_W (MCD_WF_A + 22)
    #define MCD_WF_X (MCD_WF_A + 23)
    #define MCD_WF_Y (MCD_WF_A + 24)
    #define MCD_WF_Z (MCD_WF_A + 25)
    /* Total number of word feature codes: one more than the last code. */
    #define MCD_WF_NB (MCD_WF_Z + 1)
    
    #include "dico.h"
    #include "word_emb.h"
    #include "dico_vec.h"
    
    /* Yields the entry of m->dico_array found at index MCD_WF_LABEL (m is a pointer to an mcd). */
    /* The expansion is fully parenthesized and remains usable as an lvalue. */
    /* NOTE(review): the mcd struct documents dico_array as holding one dico per column, whereas */
    /* MCD_WF_LABEL is a word feature code; confirm whether m->wf2col[MCD_WF_LABEL] was the intended index. */
    #define mcd_get_dico_label(m) ((m)->dico_array[MCD_WF_LABEL])
    
    /* Column accessors: each macro yields the wf2col entry of m (a pointer to an mcd) for one */
    /* word feature, i.e. the column in which that feature is represented. */
    /* Every expansion is wrapped in parentheses so it behaves as a single operand in any */
    /* surrounding expression; the results remain lvalues, so assignment through them still works. */
    #define mcd_get_index_col(m)    ((m)->wf2col[MCD_WF_ID])
    #define mcd_get_form_col(m)     ((m)->wf2col[MCD_WF_FORM])
    #define mcd_get_lemma_col(m)    ((m)->wf2col[MCD_WF_LEMMA])
    #define mcd_get_cpos_col(m)     ((m)->wf2col[MCD_WF_CPOS])
    #define mcd_get_pos_col(m)      ((m)->wf2col[MCD_WF_POS])
    #define mcd_get_feats_col(m)    ((m)->wf2col[MCD_WF_FEATS])
    #define mcd_get_gov_col(m)      ((m)->wf2col[MCD_WF_GOV])
    #define mcd_get_label_col(m)    ((m)->wf2col[MCD_WF_LABEL])
    #define mcd_get_stag_col(m)     ((m)->wf2col[MCD_WF_STAG])
    #define mcd_get_sent_seg_col(m) ((m)->wf2col[MCD_WF_SENT_SEG])
    /* L is an offset from MCD_WF_A: 0 selects feature A, 1 selects B, and so on. No range check is done. */
    #define mcd_get_letter_col(m,L) ((m)->wf2col[MCD_WF_A+(L)])
    #define mcd_get_a_col(m)        ((m)->wf2col[MCD_WF_A])
    #define mcd_get_b_col(m)        ((m)->wf2col[MCD_WF_B])
    #define mcd_get_c_col(m)        ((m)->wf2col[MCD_WF_C])
    #define mcd_get_d_col(m)        ((m)->wf2col[MCD_WF_D])
    #define mcd_get_e_col(m)        ((m)->wf2col[MCD_WF_E])
    #define mcd_get_f_col(m)        ((m)->wf2col[MCD_WF_F])
    #define mcd_get_g_col(m)        ((m)->wf2col[MCD_WF_G])
    #define mcd_get_h_col(m)        ((m)->wf2col[MCD_WF_H])
    #define mcd_get_i_col(m)        ((m)->wf2col[MCD_WF_I])
    #define mcd_get_j_col(m)        ((m)->wf2col[MCD_WF_J])
    #define mcd_get_k_col(m)        ((m)->wf2col[MCD_WF_K])
    #define mcd_get_l_col(m)        ((m)->wf2col[MCD_WF_L])
    #define mcd_get_m_col(m)        ((m)->wf2col[MCD_WF_M])
    #define mcd_get_n_col(m)        ((m)->wf2col[MCD_WF_N])
    #define mcd_get_o_col(m)        ((m)->wf2col[MCD_WF_O])
    #define mcd_get_p_col(m)        ((m)->wf2col[MCD_WF_P])
    #define mcd_get_q_col(m)        ((m)->wf2col[MCD_WF_Q])
    #define mcd_get_r_col(m)        ((m)->wf2col[MCD_WF_R])
    #define mcd_get_s_col(m)        ((m)->wf2col[MCD_WF_S])
    #define mcd_get_t_col(m)        ((m)->wf2col[MCD_WF_T])
    #define mcd_get_u_col(m)        ((m)->wf2col[MCD_WF_U])
    #define mcd_get_v_col(m)        ((m)->wf2col[MCD_WF_V])
    #define mcd_get_w_col(m)        ((m)->wf2col[MCD_WF_W])
    #define mcd_get_x_col(m)        ((m)->wf2col[MCD_WF_X])
    #define mcd_get_y_col(m)        ((m)->wf2col[MCD_WF_Y])
    #define mcd_get_z_col(m)        ((m)->wf2col[MCD_WF_Z])
    
    /* Stores v at index MCD_WF_FORM of m->wf (m is a pointer to an mcd) and yields the stored value. */
    /* The assignment is parenthesized so the macro can be used as an operand, for instance in */
    /* either branch of a conditional expression, without being re-associated by the parser. */
    /* NOTE(review): the matching getter mcd_get_form_col reads m->wf2col[MCD_WF_FORM], not m->wf; */
    /* confirm which of the two arrays this setter is meant to update. */
    #define mcd_set_form_col(m, v) ((m)->wf[MCD_WF_FORM] = (v))
    
    
    /* mcd (multi column description) files describe the format of corpus files */
    /* every line of an mcd file describes the content of one column of the corpus file */
    /* every line contains four fields separated by a space character */
    /* the first field is the index of the column described (the first column has index zero) */
    /* the second field is the name of the column. The name must be taken from the following list: */
    /* INDEX, FORM, LEMMA, CPOS, POS, FEAT, LABEL, STAG, INT, GOV, A ... Z */
    /* the third field corresponds to the internal representation of the tokens found in the column described. Four values are possible: */
    /* VOCAB if the internal representation is an integer code corresponding to the token */
    /* INT if the token is already an integer and its corresponding internal value is the same integer */
    /* EMB if the internal representation of the token is a real valued vector */
    /* _   if no internal representation is associated with the field */
    /* the fourth field is the name of a file in which the encoding is represented; this file can be either a dico format file (see dico.h) or an embedding file (see word_emb.h) */
    
    /* In-memory form of an mcd file: which word feature each column holds and how it is encoded. */
    /* NOTE(review): the pointer members are described as per-column arrays, so presumably each */
    /* has nb_col entries; confirm against the allocation code in mcd.c. */
    typedef struct {
      int nb_col;                 /* number of columns in the mcd file */
      int wf2col[MCD_WF_NB];      /* for each word feature code (MCD_WF_FORM, MCD_WF_LEMMA ...), the column in which it is represented */
      int *wf;                    /* for each column, the word feature that corresponds to it */
      char **wf_str;              /* a string version of the wf array */
      int *representation;        /* for each column, its representation mode (integer, vocabulary, embedding, NULL) */
      char **filename;            /* for each column, the file in which its different values are represented */
      dico **dico_array;          /* for each column, the corresponding dico (NULL if no file) */
      word_emb **word_emb_array;  /* for each column, the corresponding word embedding structure (NULL if no file) */
    } mcd;
    
    /* Builders that take no argument and return a pointer to an mcd. */
    /* NOTE(review): judging by the names, each one produces a predefined column layout (conll07, */
    /* ifpls, wplgf, wplgfs); the layouts are defined in the implementation and are not visible here. */
    /* Presumably the caller releases the result with mcd_free; confirm in mcd.c. */
    mcd *mcd_build_conll07(void);
    mcd *mcd_build_ifpls(void);
    mcd *mcd_build_wplgf(void);
    mcd *mcd_build_wplgfs(void);
    
    /* Remaining mcd API. Only the prototypes are visible in this header, so the notes below are */
    /* inferred from names and signatures and must be confirmed against mcd.c. */

    /* presumably loads the mcd file named mcd_filename; verbose presumably enables diagnostics */
    mcd *mcd_read(char *mcd_filename, int verbose);
    /* presumably attaches dicos taken from vocabs to the columns of m */
    void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose);
    /* presumably fills the dicos of m with the values found in the corpus file */
    void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename);
    /* presumably releases m and what it owns */
    void mcd_free(mcd *m);
    /* presumably returns the integer code of str for column col */
    int mcd_get_code(mcd *m, char *str, int col);
    /* presumably gathers the dicos of mcd_struct into a dico_vec */
    dico_vec *mcd_build_dico_vec(mcd *mcd_struct);
    /* presumably maps a word feature name to its MCD_WF_* code */
    int mcd_wf_code(char *wf);
    /* presumably removes from m the column holding the word feature wf_code */
    void mcd_remove_wf_column(mcd *m, int wf_code);
    /* presumably returns a copy of m */
    mcd *mcd_copy(mcd *m);
    /* presumably the inverse of mcd_get_code: the string for code in column col */
    char *mcd_get_str(mcd *m, int code, int col);
    
    #endif