Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Alexis Nasr
macaon2
Commits
1c1b0a6c
Commit
1c1b0a6c
authored
Sep 29, 2016
by
Alexis Nasr
Browse files
modified input file format for maca_trans_tagger and maca_trans_parser
parent
fc0b36e8
Changes
31
Show whitespace changes
Inline
Side-by-side
maca_common/CMakeLists.txt
View file @
1c1b0a6c
...
...
@@ -4,7 +4,6 @@ set(SOURCES src/util.c
src/word_emb.c
src/mcd.c
src/dico_vec.c
src/feat_types.c
src/form2pos.c
src/word.c
src/sentence.c
...
...
maca_common/include/mcd.h
View file @
1c1b0a6c
...
...
@@ -6,19 +6,91 @@
#define MCD_REPRESENTATION_VOCAB 2
#define MCD_REPRESENTATION_INT 3
#define MCD_INVALID_VALUE -1
#define MCD_WF_NB 36
#define MCD_WF_INDEX 0
#define MCD_WF_FORM 1
#define MCD_WF_LEMMA 2
#define MCD_WF_CPOS 3
#define MCD_WF_POS 4
#define MCD_WF_FEATS 5
#define MCD_WF_GOV 6
#define MCD_WF_LABEL 7
#define MCD_WF_STAG 8
#define MCD_WF_SENT_SEG 9
#define MCD_WF_A 10
#define MCD_WF_B 11
#define MCD_WF_C 12
#define MCD_WF_D 13
#define MCD_WF_E 14
#define MCD_WF_F 15
#define MCD_WF_G 16
#define MCD_WF_H 17
#define MCD_WF_I 18
#define MCD_WF_J 19
#define MCD_WF_K 20
#define MCD_WF_L 21
#define MCD_WF_M 22
#define MCD_WF_N 23
#define MCD_WF_O 24
#define MCD_WF_P 25
#define MCD_WF_Q 26
#define MCD_WF_R 27
#define MCD_WF_S 28
#define MCD_WF_T 29
#define MCD_WF_U 30
#define MCD_WF_V 31
#define MCD_WF_W 32
#define MCD_WF_X 33
#define MCD_WF_Y 34
#define MCD_WF_Z 35
#include "dico.h"
#include "feat_types.h"
#include "word_emb.h"
#include "dico_vec.h"
#define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL]
#define mcd_get_dico_label(m) (m)->dico_array[MCD_WF_LABEL]
#define mcd_get_index_col(m) (m)->wf2col[MCD_WF_INDEX]
#define mcd_get_form_col(m) (m)->wf2col[MCD_WF_FORM]
#define mcd_get_lemma_col(m) (m)->wf2col[MCD_WF_LEMMA]
#define mcd_get_cpos_col(m) (m)->wf2col[MCD_WF_CPOS]
#define mcd_get_pos_col(m) (m)->wf2col[MCD_WF_POS]
#define mcd_get_feats_col(m) (m)->wf2col[MCD_WF_FEATS]
#define mcd_get_gov_col(m) (m)->wf2col[MCD_WF_GOV]
#define mcd_get_label_col(m) (m)->wf2col[MCD_WF_LABEL]
#define mcd_get_stag_col(m) (m)->wf2col[MCD_WF_STAG]
#define mcd_get_sent_seg_col(m) (m)->wf2col[MCD_WF_SENT_SEG]
#define mcd_get_a_col(m) (m)->wf2col[MCD_WF_A]
#define mcd_get_b_col(m) (m)->wf2col[MCD_WF_B]
#define mcd_get_c_col(m) (m)->wf2col[MCD_WF_C]
#define mcd_get_d_col(m) (m)->wf2col[MCD_WF_D]
#define mcd_get_e_col(m) (m)->wf2col[MCD_WF_E]
#define mcd_get_f_col(m) (m)->wf2col[MCD_WF_F]
#define mcd_get_g_col(m) (m)->wf2col[MCD_WF_G]
#define mcd_get_h_col(m) (m)->wf2col[MCD_WF_H]
#define mcd_get_i_col(m) (m)->wf2col[MCD_WF_I]
#define mcd_get_j_col(m) (m)->wf2col[MCD_WF_J]
#define mcd_get_k_col(m) (m)->wf2col[MCD_WF_K]
#define mcd_get_l_col(m) (m)->wf2col[MCD_WF_L]
#define mcd_get_m_col(m) (m)->wf2col[MCD_WF_M]
#define mcd_get_n_col(m) (m)->wf2col[MCD_WF_N]
#define mcd_get_o_col(m) (m)->wf2col[MCD_WF_O]
#define mcd_get_p_col(m) (m)->wf2col[MCD_WF_P]
#define mcd_get_q_col(m) (m)->wf2col[MCD_WF_Q]
#define mcd_get_r_col(m) (m)->wf2col[MCD_WF_R]
#define mcd_get_s_col(m) (m)->wf2col[MCD_WF_S]
#define mcd_get_t_col(m) (m)->wf2col[MCD_WF_T]
#define mcd_get_u_col(m) (m)->wf2col[MCD_WF_U]
#define mcd_get_v_col(m) (m)->wf2col[MCD_WF_V]
#define mcd_get_w_col(m) (m)->wf2col[MCD_WF_W]
#define mcd_get_x_col(m) (m)->wf2col[MCD_WF_X]
#define mcd_get_y_col(m) (m)->wf2col[MCD_WF_Y]
#define mcd_get_z_col(m) (m)->wf2col[MCD_WF_Z]
#define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM]
#define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v)
#define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v)
/* mcd (multi column description) files describe the format of corpus files */
/* every line of an mcd file describes the content of a column of the corpus file */
...
...
@@ -35,10 +107,9 @@
typedef
struct
{
int
nb_col
;
/* number of columns in the mcd file */
int
type2col
[
FEAT_TYPE_NB
];
/* in which column is represented is the form (FEAT_TYPE_FORM) lemma ... represented */
/* int *col2type; */
int
*
type
;
/* array containing the type of every column */
char
**
type_str
;
/* a string version of array type */
int
wf2col
[
MCD_WF_NB
];
/* in which column are the word features (MCD_WF_FORM, MCD_WF_LEMMA ...) represented */
int
*
wf
;
/* array containing the word feature that correspond to each column */
char
**
wf_str
;
/* a string version of array word feature */
int
*
representation
;
/* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */
char
**
filename
;
/* array containing the file in which the different values for a columnn is represented */
dico
**
dico_array
;
/* array containing the dico corresponding to each column (NULL if no file) */
...
...
@@ -54,5 +125,6 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename);
void
mcd_free
(
mcd
*
m
);
int
mcd_get_code
(
mcd
*
m
,
char
*
str
,
int
col
);
dico_vec
*
mcd_build_dico_vec
(
mcd
*
mcd_struct
);
int
mcd_wf_code
(
char
*
wf
);
#endif
maca_common/include/word.h
View file @
1c1b0a6c
...
...
@@ -3,88 +3,96 @@
#include "mcd.h"
#define word_get_index(w) (w)->feat_array[FEAT_TYPE_INDEX]
#define word_get_form(w) (w)->feat_array[FEAT_TYPE_FORM]
#define word_get_lemma(w) (w)->feat_array[FEAT_TYPE_LEMMA]
#define word_get_cpos(w) (w)->feat_array[FEAT_TYPE_CPOS]
#define word_get_pos(w) (w)->feat_array[FEAT_TYPE_POS]
#define word_get_feats(w) (w)->feat_array[FEAT_TYPE_FEATS]
#define word_get_gov(w) (w)->feat_array[FEAT_TYPE_GOV]
#define word_get_label(w) (w)->feat_array[FEAT_TYPE_LABEL]
#define word_get_stag(w) (w)->feat_array[FEAT_TYPE_STAG]
#define word_get_A(w) (w)->feat_array[FEAT_TYPE_A]
#define word_get_B(w) (w)->feat_array[FEAT_TYPE_B]
#define word_get_C(w) (w)->feat_array[FEAT_TYPE_C]
#define word_get_D(w) (w)->feat_array[FEAT_TYPE_D]
#define word_get_E(w) (w)->feat_array[FEAT_TYPE_E]
#define word_get_F(w) (w)->feat_array[FEAT_TYPE_F]
#define word_get_G(w) (w)->feat_array[FEAT_TYPE_G]
#define word_get_H(w) (w)->feat_array[FEAT_TYPE_H]
#define word_get_I(w) (w)->feat_array[FEAT_TYPE_I]
#define word_get_J(w) (w)->feat_array[FEAT_TYPE_J]
#define word_get_K(w) (w)->feat_array[FEAT_TYPE_K]
#define word_get_L(w) (w)->feat_array[FEAT_TYPE_L]
#define word_get_M(w) (w)->feat_array[FEAT_TYPE_M]
#define word_get_N(w) (w)->feat_array[FEAT_TYPE_N]
#define word_get_O(w) (w)->feat_array[FEAT_TYPE_O]
#define word_get_P(w) (w)->feat_array[FEAT_TYPE_P]
#define word_get_Q(w) (w)->feat_array[FEAT_TYPE_Q]
#define word_get_R(w) (w)->feat_array[FEAT_TYPE_R]
#define word_get_S(w) (w)->feat_array[FEAT_TYPE_S]
#define word_get_T(w) (w)->feat_array[FEAT_TYPE_T]
#define word_get_U(w) (w)->feat_array[FEAT_TYPE_U]
#define word_get_V(w) (w)->feat_array[FEAT_TYPE_V]
#define word_get_W(w) (w)->feat_array[FEAT_TYPE_W]
#define word_get_X(w) (w)->feat_array[FEAT_TYPE_X]
#define word_get_Y(w) (w)->feat_array[FEAT_TYPE_Y]
#define word_get_Z(w) (w)->feat_array[FEAT_TYPE_Z]
#define word_get_index(w) (w)->wf_array[MCD_WF_INDEX]
#define word_get_form(w) (w)->wf_array[MCD_WF_FORM]
#define word_get_lemma(w) (w)->wf_array[MCD_WF_LEMMA]
#define word_get_cpos(w) (w)->wf_array[MCD_WF_CPOS]
#define word_get_pos(w) (w)->wf_array[MCD_WF_POS]
#define word_get_feats(w) (w)->wf_array[MCD_WF_FEATS]
#define word_get_gov(w) (w)->wf_array[MCD_WF_GOV]
#define word_get_label(w) (w)->wf_array[MCD_WF_LABEL]
#define word_get_stag(w) (w)->wf_array[MCD_WF_STAG]
#define word_get_sent_seg(w) (w)->wf_array[MCD_WF_SENT_SEG]
#define word_get_A(w) (w)->wf_array[MCD_WF_A]
#define word_get_B(w) (w)->wf_array[MCD_WF_B]
#define word_get_C(w) (w)->wf_array[MCD_WF_C]
#define word_get_D(w) (w)->wf_array[MCD_WF_D]
#define word_get_E(w) (w)->wf_array[MCD_WF_E]
#define word_get_F(w) (w)->wf_array[MCD_WF_F]
#define word_get_G(w) (w)->wf_array[MCD_WF_G]
#define word_get_H(w) (w)->wf_array[MCD_WF_H]
#define word_get_I(w) (w)->wf_array[MCD_WF_I]
#define word_get_J(w) (w)->wf_array[MCD_WF_J]
#define word_get_K(w) (w)->wf_array[MCD_WF_K]
#define word_get_L(w) (w)->wf_array[MCD_WF_L]
#define word_get_M(w) (w)->wf_array[MCD_WF_M]
#define word_get_N(w) (w)->wf_array[MCD_WF_N]
#define word_get_O(w) (w)->wf_array[MCD_WF_O]
#define word_get_P(w) (w)->wf_array[MCD_WF_P]
#define word_get_Q(w) (w)->wf_array[MCD_WF_Q]
#define word_get_R(w) (w)->wf_array[MCD_WF_R]
#define word_get_S(w) (w)->wf_array[MCD_WF_S]
#define word_get_T(w) (w)->wf_array[MCD_WF_T]
#define word_get_U(w) (w)->wf_array[MCD_WF_U]
#define word_get_V(w) (w)->wf_array[MCD_WF_V]
#define word_get_W(w) (w)->wf_array[MCD_WF_W]
#define word_get_X(w) (w)->wf_array[MCD_WF_X]
#define word_get_Y(w) (w)->wf_array[MCD_WF_Y]
#define word_get_Z(w) (w)->wf_array[MCD_WF_Z]
#define word_get_signature(w) (w)->signature
#define word_set_index(w, val) (w)->feat_array[FEAT_TYPE_INDEX] = (val)
#define word_set_form(w, val) (w)->feat_array[FEAT_TYPE_FORM] = (val)
#define word_set_lemma(w, val) (w)->feat_array[FEAT_TYPE_LEMMA] = (val)
#define word_set_cpos(w, val) (w)->feat_array[FEAT_TYPE_CPOS] = (val)
#define word_set_pos(w, val) (w)->feat_array[FEAT_TYPE_POS] = (val)
#define word_set_feats(w, val) (w)->feat_array[FEAT_TYPE_FEATS] = (val)
#define word_set_gov(w, val) (w)->feat_array[FEAT_TYPE_GOV] = (val)
#define word_set_label(w, val) (w)->feat_array[FEAT_TYPE_LABEL] = (val)
#define word_set_stag(w, val) (w)->feat_array[FEAT_TYPE_STAG] = (val)
#define word_set_A(w, val) (w)->feat_array[FEAT_TYPE_A] = (val)
#define word_set_B(w, val) (w)->feat_array[FEAT_TYPE_B] = (val)
#define word_set_C(w, val) (w)->feat_array[FEAT_TYPE_C] = (val)
#define word_set_D(w, val) (w)->feat_array[FEAT_TYPE_D] = (val)
#define word_set_E(w, val) (w)->feat_array[FEAT_TYPE_E] = (val)
#define word_set_F(w, val) (w)->feat_array[FEAT_TYPE_F] = (val)
#define word_set_G(w, val) (w)->feat_array[FEAT_TYPE_G] = (val)
#define word_set_H(w, val) (w)->feat_array[FEAT_TYPE_H] = (val)
#define word_set_I(w, val) (w)->feat_array[FEAT_TYPE_I] = (val)
#define word_set_J(w, val) (w)->feat_array[FEAT_TYPE_J] = (val)
#define word_set_K(w, val) (w)->feat_array[FEAT_TYPE_K] = (val)
#define word_set_L(w, val) (w)->feat_array[FEAT_TYPE_L] = (val)
#define word_set_M(w, val) (w)->feat_array[FEAT_TYPE_M] = (val)
#define word_set_N(w, val) (w)->feat_array[FEAT_TYPE_N] = (val)
#define word_set_O(w, val) (w)->feat_array[FEAT_TYPE_O] = (val)
#define word_set_P(w, val) (w)->feat_array[FEAT_TYPE_P] = (val)
#define word_set_Q(w, val) (w)->feat_array[FEAT_TYPE_Q] = (val)
#define word_set_R(w, val) (w)->feat_array[FEAT_TYPE_R] = (val)
#define word_set_S(w, val) (w)->feat_array[FEAT_TYPE_S] = (val)
#define word_set_T(w, val) (w)->feat_array[FEAT_TYPE_T] = (val)
#define word_set_U(w, val) (w)->feat_array[FEAT_TYPE_U] = (val)
#define word_set_V(w, val) (w)->feat_array[FEAT_TYPE_V] = (val)
#define word_set_W(w, val) (w)->feat_array[FEAT_TYPE_W] = (val)
#define word_set_X(w, val) (w)->feat_array[FEAT_TYPE_X] = (val)
#define word_set_index(w, val) (w)->wf_array[MCD_WF_INDEX] = (val)
#define word_set_form(w, val) (w)->wf_array[MCD_WF_FORM] = (val)
#define word_set_lemma(w, val) (w)->wf_array[MCD_WF_LEMMA] = (val)
#define word_set_cpos(w, val) (w)->wf_array[MCD_WF_CPOS] = (val)
#define word_set_pos(w, val) (w)->wf_array[MCD_WF_POS] = (val)
#define word_set_feats(w, val) (w)->wf_array[MCD_WF_FEATS] = (val)
#define word_set_gov(w, val) (w)->wf_array[MCD_WF_GOV] = (val)
#define word_set_label(w, val) (w)->wf_array[MCD_WF_LABEL] = (val)
#define word_set_stag(w, val) (w)->wf_array[MCD_WF_STAG] = (val)
#define word_set_word_seg(w) (w)->wf_array[MCD_WF_WORD_SEG] = (val)
#define word_set_A(w, val) (w)->wf_array[MCD_WF_A] = (val)
#define word_set_B(w, val) (w)->wf_array[MCD_WF_B] = (val)
#define word_set_C(w, val) (w)->wf_array[MCD_WF_C] = (val)
#define word_set_D(w, val) (w)->wf_array[MCD_WF_D] = (val)
#define word_set_E(w, val) (w)->wf_array[MCD_WF_E] = (val)
#define word_set_F(w, val) (w)->wf_array[MCD_WF_F] = (val)
#define word_set_G(w, val) (w)->wf_array[MCD_WF_G] = (val)
#define word_set_H(w, val) (w)->wf_array[MCD_WF_H] = (val)
#define word_set_I(w, val) (w)->wf_array[MCD_WF_I] = (val)
#define word_set_J(w, val) (w)->wf_array[MCD_WF_J] = (val)
#define word_set_K(w, val) (w)->wf_array[MCD_WF_K] = (val)
#define word_set_L(w, val) (w)->wf_array[MCD_WF_L] = (val)
#define word_set_M(w, val) (w)->wf_array[MCD_WF_M] = (val)
#define word_set_N(w, val) (w)->wf_array[MCD_WF_N] = (val)
#define word_set_O(w, val) (w)->wf_array[MCD_WF_O] = (val)
#define word_set_P(w, val) (w)->wf_array[MCD_WF_P] = (val)
#define word_set_Q(w, val) (w)->wf_array[MCD_WF_Q] = (val)
#define word_set_R(w, val) (w)->wf_array[MCD_WF_R] = (val)
#define word_set_S(w, val) (w)->wf_array[MCD_WF_S] = (val)
#define word_set_T(w, val) (w)->wf_array[MCD_WF_T] = (val)
#define word_set_U(w, val) (w)->wf_array[MCD_WF_U] = (val)
#define word_set_V(w, val) (w)->wf_array[MCD_WF_V] = (val)
#define word_set_W(w, val) (w)->wf_array[MCD_WF_W] = (val)
#define word_set_X(w, val) (w)->wf_array[MCD_WF_X] = (val)
#define word_set_Y(w, val) (w)->f
eat
_array[
FEAT_TYPE
_Y] = (val)
#define word_set_Z(w, val) (w)->f
eat
_array[
FEAT_TYPE
_Z] = (val)
#define word_set_Y(w, val) (w)->
w
f_array[
MCD_WF
_Y] = (val)
#define word_set_Z(w, val) (w)->
w
f_array[
MCD_WF
_Z] = (val)
#define word_set_signature(w, val) (w)->signature = (val)
#define word_set_relative_index(w, val) (w)->relative_index = (val)
#define word_get_relative_index(w) (w)->relative_index
typedef
struct
_word
{
int
f
eat
_array
[
FEAT_TYPE_NB
];
/* array containing the codes corresponding to the different word features */
int
w
f_array
[
MCD_WF_NB
];
/* array containing the codes corresponding to the different word features */
char
*
input
;
/* the string corresponding to the actual line in the corpus file */
int
U1
;
/* does the form begin with an uppercase character */
int
signature
;
/* pos tags that this form can have (represented as a boolean string) */
int
label
;
char
*
form
;
int
relative_index
;
}
word
;
word
*
word_new
(
char
*
input
);
...
...
@@ -92,11 +100,13 @@ word *word_create_dummy(mcd *mcd_struct);
word
*
word_copy
(
word
*
w
);
void
word_free
(
word
*
w
);
void
word_print2
(
FILE
*
f
,
word
*
w
);
void
word_print
(
FILE
*
f
,
word
*
w
,
mcd
*
mcd_struct
,
dico
*
dico_labels
);
word
*
word_read
(
FILE
*
f
,
mcd
*
mcd_struct
);
word
*
word_parse_buffer
(
char
*
buffer
,
mcd
*
mcd_struct
);
int
word_is_eos
(
word
*
w
,
mcd
*
mcd_struct
);
int
word_get_gov_relative_index
(
word
*
w
);
#endif
maca_common/src/dico.c
View file @
1c1b0a6c
...
...
@@ -89,9 +89,8 @@ void dico_print_fh(FILE *f, dico *d)
void
dico_print
(
char
*
filename
,
dico
*
d
)
{
FILE
*
f
;
if
(
filename
==
NULL
)
{
if
(
filename
==
NULL
)
f
=
stdout
;
}
else
{
f
=
fopen
(
filename
,
"w"
);
if
(
f
==
NULL
){
...
...
@@ -100,6 +99,8 @@ void dico_print(char *filename, dico *d)
}
}
dico_print_fh
(
f
,
d
);
if
(
filename
!=
NULL
)
fclose
(
f
);
}
...
...
@@ -136,7 +137,9 @@ char *dico_int2string(dico *d, int val)
int
dico_string2int
(
dico
*
d
,
char
*
string
)
{
cell
*
c
=
hash_lookup
(
d
->
htable
,
string
);
cell
*
c
;
c
=
hash_lookup
(
d
->
htable
,
string
);
if
(
c
)
return
c
->
val
;
else
...
...
@@ -162,6 +165,7 @@ dico *dico_extract_from_corpus(char *filename, int column, char *dico_name)
column_nb
=
0
;
do
{
if
(
column_nb
==
column
){
/* printf("token = %s\n", token); */
dico_add
(
d
,
token
);
}
column_nb
++
;
...
...
maca_common/src/mcd.c
View file @
1c1b0a6c
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
...
...
@@ -7,27 +8,26 @@
#include "dico.h"
#include "word_emb.h"
mcd
*
mcd_new
(
int
nb_col
)
{
mcd
*
m
=
(
mcd
*
)
memalloc
(
sizeof
(
mcd
));
int
i
;
m
->
nb_col
=
nb_col
;
for
(
i
=
0
;
i
<
FEAT_TYPE
_NB
;
i
++
)
m
->
type
2col
[
i
]
=
-
1
;
for
(
i
=
0
;
i
<
MCD_WF
_NB
;
i
++
)
m
->
wf
2col
[
i
]
=
-
1
;
m
->
representation
=
(
int
*
)
memalloc
(
nb_col
*
sizeof
(
int
));
m
->
type
=
(
int
*
)
memalloc
(
nb_col
*
sizeof
(
int
));
m
->
type
_str
=
(
char
**
)
memalloc
(
nb_col
*
sizeof
(
char
*
));
m
->
wf
=
(
int
*
)
memalloc
(
nb_col
*
sizeof
(
int
));
m
->
wf
_str
=
(
char
**
)
memalloc
(
nb_col
*
sizeof
(
char
*
));
m
->
filename
=
(
char
**
)
memalloc
(
nb_col
*
sizeof
(
char
*
));
m
->
dico_array
=
(
dico
**
)
memalloc
(
nb_col
*
sizeof
(
dico
*
));
m
->
word_emb_array
=
(
word_emb
**
)
memalloc
(
nb_col
*
sizeof
(
word_emb
*
));
for
(
i
=
0
;
i
<
nb_col
;
i
++
){
m
->
representation
[
i
]
=
MCD_REPRESENTATION_NULL
;
m
->
type
[
i
]
=
-
1
;
m
->
type
_str
[
i
]
=
NULL
;
m
->
wf
[
i
]
=
-
1
;
m
->
wf
_str
[
i
]
=
NULL
;
m
->
filename
[
i
]
=
NULL
;
m
->
dico_array
[
i
]
=
NULL
;
m
->
word_emb_array
[
i
]
=
NULL
;;
...
...
@@ -41,14 +41,14 @@ void mcd_free(mcd *m)
for
(
i
=
0
;
i
<
m
->
nb_col
;
i
++
){
if
(
m
->
dico_array
[
i
])
dico_free
(
m
->
dico_array
[
i
]);
if
(
m
->
word_emb_array
[
i
])
word_emb_free
(
m
->
word_emb_array
[
i
]);
if
(
m
->
type
_str
[
i
])
free
(
m
->
type
_str
[
i
]);
if
(
m
->
wf
_str
[
i
])
free
(
m
->
wf
_str
[
i
]);
}
free
(
m
->
representation
);
free
(
m
->
filename
);
free
(
m
->
dico_array
);
free
(
m
->
word_emb_array
);
free
(
m
->
type
_str
);
free
(
m
->
type
);
free
(
m
->
wf
_str
);
free
(
m
->
wf
);
free
(
m
);
}
...
...
@@ -58,7 +58,7 @@ void mcd_free(mcd *m)
int
mcd_get_code
(
mcd
*
m
,
char
*
str
,
int
col
){
if
(
m
->
representation
[
col
]
==
MCD_REPRESENTATION_VOCAB
)
return
dico_string2int
(
m
->
dico_array
[
col
],
str
);
return
(
m
->
dico_array
[
col
])
?
dico_string2int
(
m
->
dico_array
[
col
],
str
)
:
-
1
;
if
(
m
->
representation
[
col
]
==
MCD_REPRESENTATION_EMB
)
return
word_emb_get_code
(
m
->
word_emb_array
[
col
],
str
);
if
(
m
->
representation
[
col
]
==
MCD_REPRESENTATION_INT
)
...
...
@@ -74,7 +74,7 @@ int mcd_max_column_index_in_file(char *mcd_filename)
FILE
*
f
=
myfopen
(
mcd_filename
,
"r"
);
char
buffer
[
1000
];
/* ugly */
int
column
;
char
type
[
100
];
char
wf
[
100
];
char
representation
[
100
];
char
filename
[
500
];
/* ugly */
int
fields_number
;
...
...
@@ -84,7 +84,7 @@ int mcd_max_column_index_in_file(char *mcd_filename)
line_number
++
;
if
(
feof
(
f
))
break
;
if
((
buffer
[
0
]
==
'\n'
)
||
(
buffer
[
0
]
==
'#'
))
continue
;
fields_number
=
sscanf
(
buffer
,
"%d %s %s %s"
,
&
column
,
type
,
representation
,
filename
);
fields_number
=
sscanf
(
buffer
,
"%d %s %s %s"
,
&
column
,
wf
,
representation
,
filename
);
if
(
fields_number
!=
4
){
fprintf
(
stderr
,
"line %d of mcd file %s ill formed, I'm skipping it
\n
"
,
line_number
,
mcd_filename
);
continue
;
...
...
@@ -106,8 +106,8 @@ void mcd_extract_dico_from_corpus(mcd *m, char *corpus_filename)
if
((
m
->
representation
[
column
]
==
MCD_REPRESENTATION_VOCAB
)
/* && (strcmp(m->filename[column], "_")) */
&&
(
m
->
dico_array
[
column
]
==
NULL
)){
m
->
dico_array
[
column
]
=
dico_extract_from_corpus
(
corpus_filename
,
column
,
m
->
type
_str
[
column
]);
fprintf
(
stderr
,
"extracting dico %s from corpus
\n
"
,
m
->
type
_str
[
column
]);
m
->
dico_array
[
column
]
=
dico_extract_from_corpus
(
corpus_filename
,
column
,
m
->
wf
_str
[
column
]);
fprintf
(
stderr
,
"extracting dico %s
\t
from corpus
\n
"
,
m
->
wf
_str
[
column
]);
}
}
}
...
...
@@ -123,8 +123,8 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
if
((
m
->
representation
[
column
]
==
MCD_REPRESENTATION_VOCAB
)
&&
(
!
strcmp
(
m
->
filename
[
column
],
"_"
))
&&
(
m
->
dico_array
[
column
]
==
NULL
)){
m
->
dico_array
[
column
]
=
dico_vec_get_dico
(
vocabs
,
m
->
type
_str
[
column
]);
if
(
verbose
)
fprintf
(
stderr
,
"linking to dico %s
\n
"
,
m
->
type
_str
[
column
]);
m
->
dico_array
[
column
]
=
dico_vec_get_dico
(
vocabs
,
m
->
wf
_str
[
column
]);
if
(
verbose
)
fprintf
(
stderr
,
"linking to dico %s
\n
"
,
m
->
wf
_str
[
column
]);
}
}
}
...
...
@@ -134,7 +134,7 @@ void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose)
mcd
*
mcd_read
(
char
*
mcd_filename
,
int
verbose
)
{
int
column
;
char
type
[
100
];
char
wf
[
100
];
char
representation
[
100
];
char
filename
[
500
];
/* ugly */
int
fields_number
;
...
...
@@ -149,19 +149,20 @@ mcd *mcd_read(char *mcd_filename, int verbose)
line_number
++
;
if
(
feof
(
f
))
break
;
if
((
buffer
[
0
]
==
'\n'
)
||
(
buffer
[
0
]
==
'#'
))
continue
;
fields_number
=
sscanf
(
buffer
,
"%d %s %s %s"
,
&
column
,
type
,
representation
,
filename
);
fields_number
=
sscanf
(
buffer
,
"%d %s %s %s"
,
&
column
,
wf
,
representation
,
filename
);
if
(
fields_number
!=
4
){
/* fprintf(stderr, "line %d of mcd file %s ill formed, I'm skipping it\n", line_number, mcd_filename); */
continue
;
}
if
(
verbose
)
fprintf
(
stderr
,
"column = %d
typ
e = %s
representation = %s
filename = %s
\n
"
,
column
,
type
,
representation
,
filename
);
m
->
type
[
column
]
=
feat_type_string2int
(
type
);
m
->
type
_str
[
column
]
=
strdup
(
type
);
if
(
m
->
type
[
column
]
==
-
1
){
fprintf
(
stderr
,
"in line %d of mcd file %s invalid
type
, I'm skipping it
\n
"
,
line_number
,
mcd_filename
);
if
(
verbose
)
fprintf
(
stderr
,
"column = %d
\t
word featur
e = %s
\t
representation = %s
\t
filename = %s
\n
"
,
column
,
wf
,
representation
,
filename
);
m
->
wf
[
column
]
=
mcd_wf_code
(
wf
);
m
->
wf
_str
[
column
]
=
strdup
(
wf
);
if
(
m
->
wf
[
column
]
==
-
1
){
fprintf
(
stderr
,
"in line %d of mcd file %s invalid
wf
, I'm skipping it
\n
"
,
line_number
,
mcd_filename
);
continue
;
}
m
->
type2col
[
m
->
type
[
column
]]
=
column
;
m
->
wf2col
[
m
->
wf
[
column
]]
=
column
;
if
(
!
strcmp
(
representation
,
"_"
))
m
->
representation
[
column
]
=
MCD_REPRESENTATION_NULL
;
else
if
(
!
strcmp
(
representation
,
"EMB"
))
m
->
representation
[
column
]
=
MCD_REPRESENTATION_EMB
;
...
...
@@ -185,6 +186,7 @@ mcd *mcd_read(char *mcd_filename, int verbose)
}
}
}
fclose
(
f
);
return
m
;
}
...
...
@@ -194,53 +196,53 @@ mcd *mcd_read(char *mcd_filename, int verbose)
mcd
*
mcd_build_conll07
(
void
)
{
mcd
*
m
=
mcd_new
(
8
);
m
->
type
[
0
]
=
FEAT_TYPE
_INDEX
;
m
->
type
_str
[
0
]
=
strdup
(
"INDEX"
);
m
->
wf
[
0
]
=
MCD_WF
_INDEX
;
m
->
wf
_str
[
0
]
=
strdup
(
"INDEX"
);
m
->
representation
[
0
]
=
MCD_REPRESENTATION_INT
;
m
->
filename
[
0
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_INDEX
]
=
0
;
m
->
wf
2col
[
MCD_WF
_INDEX
]
=
0
;
m
->
type
[
1
]
=
FEAT_TYPE
_FORM
;
m
->
type
_str
[
1
]
=
strdup
(
"FORM"
);
m
->
wf
[
1
]
=
MCD_WF
_FORM
;
m
->
wf
_str
[
1
]
=
strdup
(
"FORM"
);
m
->
representation
[
1
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
1
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_FORM
]
=
1
;
m
->
wf
2col
[
MCD_WF
_FORM
]
=
1
;
m
->
type
[
2
]
=
FEAT_TYPE
_LEMMA
;
m
->
type
_str
[
2
]
=
strdup
(
"LEMMA"
);
m
->
wf
[
2
]
=
MCD_WF
_LEMMA
;
m
->
wf
_str
[
2
]
=
strdup
(
"LEMMA"
);
m
->
representation
[
2
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
2
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_LEMMA
]
=
2
;
m
->
wf
2col
[
MCD_WF
_LEMMA
]
=
2
;
m
->
type
[
3
]
=
FEAT_TYPE
_CPOS
;
m
->
type
_str
[
3
]
=
strdup
(
"CPOS"
);
m
->
wf
[
3
]
=
MCD_WF
_CPOS
;
m
->
wf
_str
[
3
]
=
strdup
(
"CPOS"
);
m
->
representation
[
3
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
3
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_CPOS
]
=
3
;
m
->
wf
2col
[
MCD_WF
_CPOS
]
=
3
;
m
->
type
[
4
]
=
FEAT_TYPE
_POS
;
m
->
type
_str
[
4
]
=
strdup
(
"POS"
);
m
->
wf
[
4
]
=
MCD_WF
_POS
;
m
->
wf
_str
[
4
]
=
strdup
(
"POS"
);
m
->
representation
[
4
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
4
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_POS
]
=
4
;
m
->
wf
2col
[
MCD_WF
_POS
]
=
4
;
m
->
type
[
5
]
=
FEAT_TYPE
_FEATS
;
m
->
type
_str
[
5
]
=
strdup
(
"FEATS"
);
m
->
wf
[
5
]
=
MCD_WF
_FEATS
;
m
->
wf
_str
[
5
]
=
strdup
(
"FEATS"
);
m
->
representation
[
5
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
5
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_FEATS
]
=
5
;
m
->
wf
2col
[
MCD_WF
_FEATS
]
=
5
;
m
->
type
[
6
]
=
FEAT_TYPE
_GOV
;
m
->
type
_str
[
6
]
=
strdup
(
"GOV"
);
m
->
wf
[
6
]
=
MCD_WF
_GOV
;
m
->
wf
_str
[
6
]
=
strdup
(
"GOV"
);
m
->
representation
[
6
]
=
MCD_REPRESENTATION_INT
;
m
->
filename
[
6
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_GOV
]
=
6
;
m
->
wf
2col
[
MCD_WF
_GOV
]
=
6
;
m
->
type
[
7
]
=
FEAT_TYPE
_LABEL
;
m
->
type
_str
[
7
]
=
strdup
(
"LABEL"
);
m
->
wf
[
7
]
=
MCD_WF
_LABEL
;
m
->
wf
_str
[
7
]
=
strdup
(
"LABEL"
);
m
->
representation
[
7
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
7
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_LABEL
]
=
7
;
m
->
wf
2col
[
MCD_WF
_LABEL
]
=
7
;
return
m
;
}
...
...
@@ -251,114 +253,45 @@ mcd *mcd_build_ifpls(void)
{
mcd
*
m
=
mcd_new
(
6
);
m
->
type
[
0
]
=
FEAT_TYPE
_INDEX
;
m
->
type
_str
[
0
]
=
strdup
(
"INDEX"
);
m
->
wf
[
0
]
=
MCD_WF
_INDEX
;
m
->
wf
_str
[
0
]
=
strdup
(
"INDEX"
);
m
->
representation
[
0
]
=
MCD_REPRESENTATION_INT
;
m
->
filename
[
0
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_INDEX
]
=
0
;
m
->
wf
2col
[
MCD_WF
_INDEX
]
=
0
;
m
->
type
[
1
]
=
FEAT_TYPE
_FORM
;
m
->
type
_str
[
1
]
=
strdup
(
"FORM"
);
m
->
wf
[
1
]
=
MCD_WF
_FORM
;
m
->
wf
_str
[
1
]
=
strdup
(
"FORM"
);
m
->
representation
[
1
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
1
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_FORM
]
=
1
;
m
->
wf
2col
[
MCD_WF
_FORM
]
=
1
;
m
->
type
[
2
]
=
FEAT_TYPE
_POS
;
m
->
type
_str
[
2
]
=
strdup
(
"POS"
);
m
->
wf
[
2
]
=
MCD_WF
_POS
;
m
->
wf
_str
[
2
]
=
strdup
(
"POS"
);
m
->
representation
[
2
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
2
]
=
strdup
(
"_"
);
m
->
type
2col
[
FEAT_TYPE
_POS
]
=
2
;
m
->
wf
2col
[
MCD_WF
_POS
]
=
2
;
m
->
type
[
3
]
=
FEAT_TYPE
_LEMMA
;
m
->
type
_str
[
3
]
=
strdup
(
"LEMMA"
);
m
->
wf
[
3
]
=
MCD_WF
_LEMMA
;
m
->
wf
_str
[
3
]
=
strdup
(
"LEMMA"
);
m
->
representation
[
3
]
=
MCD_REPRESENTATION_VOCAB
;
m
->
filename
[
3
]
=
strdup
(
"_"
);
m
->
type
2col
[