diff --git a/CMakeLists.txt b/CMakeLists.txt index 389bdf0e217738811da3e08daf8eaac4d5703f23..9c1ade7f2970596a6570c05ac52ae522064d6035 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,13 @@ project(macaon2) find_package(FLEX) add_definitions("-Wall" ) +SET(CMAKE_C_COMPILER g++) +SET(CMAKE_CXX_COMPILER g++) + + +SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -DUSE_CBLAS") +SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lm -lopenblas" ) + if (${CMAKE_C_COMPILER_VERSION} VERSION_LESS 5.3) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu11") @@ -28,11 +35,12 @@ add_subdirectory(maca_common) add_subdirectory(maca_tools) add_subdirectory(perceptron) #add_subdirectory(maca_lemmatizer) +#add_subdirectory(maca_morpho) add_subdirectory(maca_tokenizer) add_subdirectory(maca_lexer) add_subdirectory(maca_trans_parser) add_subdirectory(maca_crf_tagger) -add_subdirectory(maca_graph_parser) +#add_subdirectory(maca_graph_parser) if(MACA_EXPORT) add_subdirectory(maca_export) diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt index 503e9214c5a8190648c7c237e0e7e5f96b12cd54..ea4625b2b13c5452932cd1cef17f3590794d6304 100644 --- a/maca_common/CMakeLists.txt +++ b/maca_common/CMakeLists.txt @@ -12,6 +12,7 @@ set(SOURCES src/util.c src/feat_desc.c src/feat_lib.c src/feat_model.c + src/char16.c ) diff --git a/maca_common/include/char16.h b/maca_common/include/char16.h new file mode 100644 index 0000000000000000000000000000000000000000..a46a23425d1815fb3e704f25c4f3265d55e680a6 --- /dev/null +++ b/maca_common/include/char16.h @@ -0,0 +1,11 @@ +#ifndef __CHAR16__ +#define __CHAR16__ + +typedef short char16; + +int utf8_strlen(char *utf8_string); +char *char16toutf8(char16 *char16_string); +int char16_strlen(char16 *string); +char16 *utf8tochar16(char *utf8_string); + +#endif diff --git a/maca_common/include/feat_model.h b/maca_common/include/feat_model.h index f7c234cd0ea14ca4015cfa93f1826ecc2e7acc00..3febe35a4ee7c07a77dbc05ebe97a64badbb72d6 100644 --- 
a/maca_common/include/feat_model.h +++ b/maca_common/include/feat_model.h @@ -24,5 +24,5 @@ feat_desc *feat_model_add(feat_model *fm, feat_desc *fd); feat_model *feat_model_read(char *filename, feat_lib *fl, int verbose); void feat_model_compute_ranges(feat_model *fm, mcd *m, int mvt_nb); int feat_model_get_type_feat_n(feat_model *fm, int n); - +void catenate_int(char *string, int val); #endif diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 881b98e80fff65bd5031a234cd16ebea0f9f14fc..0d7761d753741e12621a4590015fc10bb24ee437 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -8,7 +8,7 @@ #define MCD_INVALID_VALUE -1 -#define MCD_WF_NB 36 +#define MCD_WF_NB 47 #define MCD_WF_ID 0 #define MCD_WF_FORM 1 @@ -47,6 +47,81 @@ #define MCD_WF_Y 34 #define MCD_WF_Z 35 +#define MCD_WF_Aspect 36 +#define MCD_WF_Case 37 +#define MCD_WF_Clitic 38 +#define MCD_WF_Definite 39 +#define MCD_WF_Gender 40 +#define MCD_WF_Mood 41 +#define MCD_WF_NameType 42 +#define MCD_WF_NounType 43 +#define MCD_WF_Number 44 +#define MCD_WF_Person 45 +#define MCD_WF_Tense 46 + +/*Abbr +AdpType +AdvType +Animacy +Animacy[gram] +ConjType +Connegative +Degree +Derivation +Dialect +Echo +Evident +Foreign +Form +Gender[dat] +Gender[erg] +Gender[psor] +HebBinyan +HebExistential +HebSource +Hyph +InfForm + +Number[abs] +Number[dat] +Number[erg] +Number[psed] +Number[psor] +NumForm +NumType +NumValue +PartForm +PartType +Person[abs] +Person[dat] +Person[erg] +Person[psor] +Polarity +Polite +Polite[abs] +Polite[dat] +Polite[erg] +Position +Poss +Prefix +PrepCase +PrepForm +PronType +PunctSide +PunctType +Reflex +Strength +Style +Subcat +Typo +Variant +VerbForm +VerbType +Voice +Xtra*/ + + + #include "dico.h" #include "word_emb.h" #include "dico_vec.h" @@ -90,6 +165,9 @@ #define mcd_get_y_col(m) (m)->wf2col[MCD_WF_Y] #define mcd_get_z_col(m) (m)->wf2col[MCD_WF_Z] + + + #define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v) @@ -121,6 +199,7 @@ mcd 
*mcd_build_conll07(void); mcd *mcd_build_ifpls(void); mcd *mcd_build_wplgf(void); mcd *mcd_build_wplgfs(void); +mcd *mcd_build_wpmlgfs(void); mcd *mcd_read(char *mcd_filename, int verbose); void mcd_link_to_dico(mcd *m, dico_vec *vocabs, int verbose); diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 30074b7606988cfcefa4400b8f35acd958ea9807..33fd8c13605cbf2f9d47e58c57677089202a302e 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -2,6 +2,7 @@ #define __WORD__ #include "mcd.h" +#include "char16.h" #define WORD_INVALID_GOV 10000 @@ -12,10 +13,41 @@ typedef struct _word { int signature; /* pos tags that this form can have (represented as a boolean string) */ int label; char *form; + char16 *form_char16; int index; int is_root; } word; +/* +#define word_get_s1(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[strlen((w)->form) - 1]) +#define word_get_s2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[strlen((w)->form) - 2]) +#define word_get_s3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[strlen((w)->form) - 3]) +#define word_get_s4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[strlen((w)->form) - 4]) +#define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5]) +#define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6]) +*/ +#define word_get_s1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 1]) +#define word_get_s2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? 
-1 : (w)->form_char16[char16_strlen((w)->form_char16) - 2]) +#define word_get_s3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 3]) +#define word_get_s4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 4))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 4]) +#define word_get_s5(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 5]) +#define word_get_s6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 6))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 6]) + +/*#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0]) +#define word_get_p2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[1]) +#define word_get_p3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[2]) +#define word_get_p4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[3]) +#define word_get_p5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[4]) +#define word_get_p6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[5]) +*/ + +#define word_get_p1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) )? -1 : (w)->form_char16[0]) +#define word_get_p2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[1]) +#define word_get_p3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[2]) +#define word_get_p4(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? 
#include<stdio.h>
#include<stdlib.h>
#include<string.h>

/* 16-bit code unit. Must stay identical to the typedef in char16.h. */
typedef short char16;

/* U+FFFD REPLACEMENT CHARACTER, substituted for every undecodable byte. */
#define UTF8_REPLACEMENT 0xFFFDUL

/*
 * Decode the UTF-8 sequence that starts at s (a NUL-terminated buffer,
 * *s != 0).
 *
 * The decoded code point is stored in *code_point and the number of bytes
 * consumed (always >= 1) is returned.  Malformed input (stray continuation
 * byte, invalid lead byte, truncated sequence, overlong form, encoded
 * surrogate, value above U+10FFFF) yields UTF8_REPLACEMENT.
 *
 * Bytes are read as unsigned char on purpose: with a signed plain char the
 * lead bytes (>= 0x80) are negative, which made the previous implementation
 * shift negative values and produce platform-dependent results.
 */
static int utf8_decode(const unsigned char *s, unsigned long *code_point)
{
  unsigned char lead = s[0];
  unsigned long value;
  unsigned long minimum;   /* smallest code point allowed for this length */
  int extra;               /* number of continuation bytes expected */
  int k;

  if(lead < 0x80){
    *code_point = lead;
    return 1;
  }
  if((lead & 0xE0) == 0xC0){
    extra = 1; value = lead & 0x1F; minimum = 0x80UL;
  }
  else if((lead & 0xF0) == 0xE0){
    extra = 2; value = lead & 0x0F; minimum = 0x800UL;
  }
  else if((lead & 0xF8) == 0xF0){
    extra = 3; value = lead & 0x07; minimum = 0x10000UL;
  }
  else{
    /* stray continuation byte or a byte that can never start a sequence */
    *code_point = UTF8_REPLACEMENT;
    return 1;
  }

  for(k = 1; k <= extra; k++){
    /* The terminating NUL is not a continuation byte, so this test also
       keeps us from reading past the end of a truncated string. */
    if((s[k] & 0xC0) != 0x80){
      *code_point = UTF8_REPLACEMENT;
      return 1;
    }
    value = (value << 6) | (unsigned long)(s[k] & 0x3F);
  }

  /* Rejecting overlong forms also guarantees that no sequence decodes to 0,
     which would otherwise terminate the char16 string early. */
  if((value < minimum) || (value > 0x10FFFFUL) || ((value >= 0xD800UL) && (value <= 0xDFFFUL)))
    value = UTF8_REPLACEMENT;

  *code_point = value;
  return extra + 1;
}

/*
 * Number of characters (decoded code points) in a NUL-terminated UTF-8
 * string.  Every malformed byte counts as one character.
 * Returns 0 for NULL or an empty string.
 */
int utf8_strlen(char *utf8_string)
{
  const unsigned char *p = (const unsigned char *)utf8_string;
  unsigned long code_point;
  int count = 0;

  if(p == NULL) return 0;
  while(*p){
    p += utf8_decode(p, &code_point);
    count++;
  }
  return count;
}

/*
 * Conversion back to UTF-8 is not implemented: always returns NULL.
 */
char *char16toutf8(char16 *char16_string)
{
  (void)char16_string;
  return NULL;
}

/*
 * Number of 16-bit units before the terminating 0.
 * Returns 0 for NULL.
 */
int char16_strlen(char16 *string)
{
  int i = 0;

  if(string == NULL) return 0;
  while(string[i]) i++;
  return i;
}

/*
 * Convert a NUL-terminated UTF-8 string into a freshly allocated,
 * 0-terminated array of 16-bit units (UTF-16 code units):
 *   - code points up to U+FFFF occupy one unit,
 *   - code points above U+FFFF occupy two units (a surrogate pair),
 *   - every malformed byte becomes U+FFFD.
 *
 * All sequences are decoded to code points rather than packed byte-wise, so
 * that a two-byte sequence can never collide with the code point of a
 * three-byte one.
 *
 * Returns NULL when utf8_string is NULL or when the allocation fails.
 * The caller owns the result and releases it with free().
 */
char16 *utf8tochar16(char *utf8_string)
{
  const unsigned char *p = (const unsigned char *)utf8_string;
  unsigned long code_point;
  char16 *char16_string;
  size_t j = 0;

  if(p == NULL) return NULL;

  /* Upper bound: a sequence never produces more units than it has bytes
     (1 byte -> 1 unit, 4 bytes -> 2 units). */
  char16_string = (char16 *)malloc((strlen(utf8_string) + 1) * sizeof(char16));
  if(char16_string == NULL) return NULL;

  while(*p){
    p += utf8_decode(p, &code_point);
    if(code_point >= 0x10000UL){
      code_point -= 0x10000UL;
      char16_string[j++] = (char16)(unsigned short)(0xD800UL | (code_point >> 10));
      char16_string[j++] = (char16)(unsigned short)(0xDC00UL | (code_point & 0x3FFUL));
    }
    else{
      char16_string[j++] = (char16)(unsigned short)code_point;
    }
  }
  char16_string[j] = 0;
  return char16_string;
}
b/maca_common/src/form2pos.c index b200c780e12371d4473561e2690c44fc481e6c5c..853d06108167523713cd1f8a16d0b5d55b50b78b 100644 --- a/maca_common/src/form2pos.c +++ b/maca_common/src/form2pos.c @@ -6,13 +6,13 @@ form2pos *form2pos_new(int nbelem, int pos_nb, char *pos_list) { - form2pos *f2p = memalloc(sizeof(form2pos)); + form2pos *f2p = (form2pos *)memalloc(sizeof(form2pos)); char *token; f2p->nbelem = nbelem; f2p->pos_nb = pos_nb; - f2p->d_pos = dico_new("d_pos", pos_nb * 10); - f2p->d_signature = dico_new("d_signature", pos_nb * 10); + f2p->d_pos = dico_new((char *)"d_pos", pos_nb * 10); + f2p->d_signature = dico_new((char *)"d_signature", pos_nb * 10); f2p->h_form2signature = hash_new(nbelem * 4); token = strtok(pos_list, "\t"); do{ diff --git a/maca_common/src/mcd.c b/maca_common/src/mcd.c index ca924f23f1b3c13860f177e70555e4ed4b8c5494..69117407b18c2303402578a5499d18685f0fb30d 100644 --- a/maca_common/src/mcd.c +++ b/maca_common/src/mcd.c @@ -422,6 +422,63 @@ mcd *mcd_build_wplgfs(void) return m; } +mcd *mcd_build_wpmlgfs(void) +{ + mcd *m = mcd_new(7); + int col; + + col = 0; + m->wf[col]=MCD_WF_FORM; + m->wf_str[col]=strdup("FORM"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_FORM] = col; + + col = 1; + m->wf[col]=MCD_WF_POS; + m->wf_str[col]=strdup("POS"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_POS] = col; + + col = 2; + m->wf[col]=MCD_WF_FEATS; + m->wf_str[col]=strdup("FEATS"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_FEATS] = col; + + col = 3; + m->wf[col]=MCD_WF_LEMMA; + m->wf_str[col]=strdup("LEMMA"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_LEMMA] = col; + + col = 4; + m->wf[col]=MCD_WF_GOV; + m->wf_str[col]=strdup("GOV"); + m->representation[col]= MCD_REPRESENTATION_INT; + m->filename[col] = 
strdup("_"); + m->wf2col[MCD_WF_GOV] = col; + + col = 5; + m->wf[col]=MCD_WF_LABEL; + m->wf_str[col]=strdup("LABEL"); + m->representation[col]= MCD_REPRESENTATION_VOCAB; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_LABEL] = col; + + col = 6; + m->wf[col]=MCD_WF_SENT_SEG; + m->wf_str[col]=strdup("SENT_SEG"); + m->representation[col]= MCD_REPRESENTATION_INT; + m->filename[col] = strdup("_"); + m->wf2col[MCD_WF_SENT_SEG] = col; + + return m; +} + /* returns a dico_vec containing the different dictionnaries found in an mcd structure */ diff --git a/maca_common/src/trie.c b/maca_common/src/trie.c index b25bca3df0e2b0a87abe36d74c5c6ed692db65b6..13412c7b552f617dacbb14dce755bc549317f74d 100644 --- a/maca_common/src/trie.c +++ b/maca_common/src/trie.c @@ -7,7 +7,7 @@ trie_state *trie_state_new(trie_trans *transitions, int is_accept) { - trie_state *state = memalloc(sizeof(trie_state)); + trie_state *state = (trie_state *) memalloc(sizeof(trie_state)); state->transitions = transitions; state->is_accept = is_accept; state->fail = 0; @@ -24,7 +24,7 @@ void trie_state_free(trie_state *state) trie *trie_new(void) { - trie *t = memalloc(sizeof(trie)); + trie *t = (trie *) memalloc(sizeof(trie)); t->states = NULL; t->size = 0; t->states_nb = 0; @@ -45,7 +45,7 @@ void trie_free(trie *t) trie_trans *trie_trans_new(int destination, int symbol, trie_trans *next) { - trie_trans *trans = memalloc(sizeof(trie_trans)); + trie_trans *trans = (trie_trans *)memalloc(sizeof(trie_trans)); trans->destination = destination; trans->symbol = symbol; trans->next = next; diff --git a/maca_common/src/word.c b/maca_common/src/word.c index b6420932d994a9d343fe01651984a2dd53b87b02..d4c01a61d27af0ee0ad48cecca8a00290bf6c0d6 100644 --- a/maca_common/src/word.c +++ b/maca_common/src/word.c @@ -19,6 +19,7 @@ word *word_new(char *input) w->wf_array[MCD_WF_GOV] = WORD_INVALID_GOV; w->form = NULL; + w->form_char16 = NULL; w->index = -1; w->signature = -1; @@ -59,11 +60,13 @@ word *word_parse_buffer(char 
*buffer, mcd *mcd_struct) w = word_new(buffer); token = strtok(buffer, "\t"); do{ - if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){ + /* if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1) && (strcmp(token, "_"))){ */ + if((col < mcd_struct->nb_col) && (mcd_struct->wf[col] != -1)){ w->wf_array[mcd_struct->wf[col]] = mcd_get_code(mcd_struct, token, col); } if(mcd_struct->wf[col] == MCD_WF_FORM){ w->form = strdup(token); + w->form_char16 = utf8tochar16(w->form); w->U1 = isupper(token[0]) ? 1 : 0; } col++; @@ -95,6 +98,7 @@ void word_free(word *w) if(w == NULL) return; if(w->input) free(w->input); if(w->form) free(w->form); + if(w->form_char16) free(w->form_char16); free(w); } diff --git a/maca_graph_parser/array.c b/maca_graph_parser/array.c index d7c27c11c14b9a3b552692b00fcb4055c9db3d21..50f2ff6e2ab24f3bc401535c1811ce91de337a4b 100644 --- a/maca_graph_parser/array.c +++ b/maca_graph_parser/array.c @@ -2,7 +2,7 @@ #include "array.h" array_t* array_new() { - array_t* array = malloc(sizeof(array_t)); + array_t* array = (array_t *)malloc(sizeof(array_t)); array->num_elements = 0; array->data = NULL; return array; @@ -18,7 +18,7 @@ ARRAY_TYPE array_get(array_t* array, int element) { } void array_push(array_t* array, ARRAY_TYPE value) { - array->data = realloc(array->data, sizeof(ARRAY_TYPE) * (array->num_elements + 1)); + array->data = (ARRAY_TYPE *)realloc(array->data, sizeof(ARRAY_TYPE) * (array->num_elements + 1)); array->data[array->num_elements] = value; array->num_elements++; } diff --git a/maca_graph_parser/maca_graph_parser.c b/maca_graph_parser/maca_graph_parser.c index bc066ca3d83aac65d7c9b9176b8ec966ca6c8359..2b0d888900b94e7f7f1a0e44462f9a40715c2871 100644 --- a/maca_graph_parser/maca_graph_parser.c +++ b/maca_graph_parser/maca_graph_parser.c @@ -75,7 +75,7 @@ void maca_graph_parser_print_ctx(maca_graph_parser_ctx *ctx) maca_graph_parser_ctx * maca_graph_parser_InitCTX() { - maca_graph_parser_ctx * ctx = 
calloc(sizeof(maca_graph_parser_ctx), 1); + maca_graph_parser_ctx * ctx = (maca_graph_parser_ctx *)calloc(sizeof(maca_graph_parser_ctx), 1); ctx->cfg=MACA_DEFAULT_CFG; ctx->verbose_flag = maca_verbose; diff --git a/maca_graph_parser/maca_graph_parser_alphabet.c b/maca_graph_parser/maca_graph_parser_alphabet.c index c162196e3ab8237c457b76531e96c4d2a635739a..2f503486a1647a0e6b77dd290745b1e5cfd97f29 100644 --- a/maca_graph_parser/maca_graph_parser_alphabet.c +++ b/maca_graph_parser/maca_graph_parser_alphabet.c @@ -36,7 +36,7 @@ void maca_graph_parser_alphabet_free(maca_graph_parser_alphabet *a) maca_graph_parser_alphabet *maca_graph_parser_alphabet_new(char *name) { - maca_graph_parser_alphabet *a = malloc(sizeof(maca_graph_parser_alphabet)); + maca_graph_parser_alphabet *a = (maca_graph_parser_alphabet *)malloc(sizeof(maca_graph_parser_alphabet)); if(a == NULL){ fprintf(stderr, "memory allocation error\n"); exit(1); @@ -153,7 +153,7 @@ maca_graph_parser_alphabet **maca_graph_parser_alphabet_load4(char *filename) int i = 0; char symbol[1000]; maca_graph_parser_alphabet *a = NULL; - maca_graph_parser_alphabet **alpha_array = malloc(4 * sizeof(maca_graph_parser_alphabet*)); + maca_graph_parser_alphabet **alpha_array = (maca_graph_parser_alphabet **)malloc(4 * sizeof(maca_graph_parser_alphabet*)); for(i=0; i < 4; i++) alpha_array[i] = NULL; @@ -182,7 +182,7 @@ maca_graph_parser_alphabet **maca_graph_parser_alphabet_load5(char *filename) int i = 0; char symbol[1000]; maca_graph_parser_alphabet *a = NULL; - maca_graph_parser_alphabet **alpha_array = malloc(5 * sizeof(maca_graph_parser_alphabet*)); + maca_graph_parser_alphabet **alpha_array = (maca_graph_parser_alphabet **)malloc(5 * sizeof(maca_graph_parser_alphabet*)); for(i=0; i < 5; i++) alpha_array[i] = NULL; diff --git a/maca_lemmatizer/src/maca_lemmatizer.c b/maca_lemmatizer/src/maca_lemmatizer.c index 5d9cacdf050d2526a7b9bc046fae50ad3e525a10..e8aeecdda1ef9d29d0efa407e868bc1b2f03eb09 100644 --- 
a/maca_lemmatizer/src/maca_lemmatizer.c +++ b/maca_lemmatizer/src/maca_lemmatizer.c @@ -125,6 +125,7 @@ int main(int argc, char *argv[]) char *buffer_copy; char *form; char *pos; + char *feats; char *token; int column_nb; @@ -136,11 +137,16 @@ int main(int argc, char *argv[]) int form_column; int pos_column; int lemma_column; + int feats_column; FILE *f = NULL; ctx = context_read_options(argc, argv); maca_lemmatizer_check_options(ctx); + + feats_column = ctx->mcd_struct->wf2col[MCD_WF_FEATS]; + + if(ctx->pos_column != -1) pos_column = ctx->pos_column; else @@ -177,6 +183,7 @@ int main(int argc, char *argv[]) form = NULL; pos = NULL; lemma = NULL; + feats = NULL; do{ if(column_nb == lemma_column) /* lemma is present in the input file */ if(strcmp(token, "_")) /* and it is not an underscore */ @@ -188,6 +195,9 @@ int main(int argc, char *argv[]) if(column_nb == pos_column){ pos = strdup(token); } + if(column_nb == feats_column){ + feats = strdup(token); + } column_nb++; } while((token = strtok(NULL , "\t"))); @@ -215,11 +225,13 @@ int main(int argc, char *argv[]) /* print_word(buffer, ctx->mcd_struct, lemma); */ - /* printf("form = %s pos = %s (%s) lemma = %s\n", form, pos, form_pos, lemma); */ + printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma); + printf("form = %s pos = %s (%s) feats = %s lemma = %s\n", form, pos, form_pos, feats, lemma); printf("\t%s\n", lemma); if(pos)free(pos); if(form)free(form); + if(feats)free(feats); } free(buffer_copy); free(lemma_array); diff --git a/maca_lexer/src/extract_mwe_from_fplm.c b/maca_lexer/src/extract_mwe_from_fplm.c index 800bed0478d786d5df37bd72d3e0562a0b12e1ef..b09defd06f557b016d2390d8326d8ccfb8310bc5 100644 --- a/maca_lexer/src/extract_mwe_from_fplm.c +++ b/maca_lexer/src/extract_mwe_from_fplm.c @@ -29,7 +29,7 @@ dico *decompose_mwe_in_fplm_file(char *fplm_filename, FILE *output_file, int deb char token[1000]; int l; int i, j; - dico *d_tokens = dico_new("TOKENS", 100000); + dico 
*d_tokens = dico_new((char *)"TOKENS", 100000); int token_code; while(fgets(buffer, 10000, f)){ fields_nb = sscanf(buffer, "%[^\t]\t%s\t%[^\t]\t%s\n", form, pos, lemma, morpho); @@ -71,6 +71,6 @@ int main(int argc, char *argv[]) dico *d_tokens; d_tokens = decompose_mwe_in_fplm_file(argv[1], stdout, 1); - dico_print("d_tokens.dico", d_tokens); + dico_print((char *)"d_tokens.dico", d_tokens); dico_free(d_tokens); } diff --git a/maca_morpho/CMakeLists.txt b/maca_morpho/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..eceae0e3bd3e790cfd5fff7ea8d6cdbfd227798d --- /dev/null +++ b/maca_morpho/CMakeLists.txt @@ -0,0 +1,30 @@ +set(SOURCES + src/maca_morpho_feat_fct.c + src/maca_morpho_context.c + src/vectorize.c +) + + + +#compiling library +include_directories(src) +add_library(maca_morpho STATIC ${SOURCES}) +target_link_libraries(maca_morpho perceptron) +target_link_libraries(maca_morpho maca_common) + + + +#compiling, linking and installing executables + +add_executable(fplm2cff ./src/fplm2cff.c) +target_link_libraries(fplm2cff perceptron) +target_link_libraries(fplm2cff maca_common) +target_link_libraries(fplm2cff maca_morpho) +install (TARGETS fplm2cff DESTINATION bin) + +add_executable(predict ./src/predict.c) +target_link_libraries(predict perceptron) +target_link_libraries(predict maca_common) +target_link_libraries(predict maca_morpho) +install (TARGETS predict DESTINATION bin) + diff --git a/maca_morpho/src/fplm2cff.c b/maca_morpho/src/fplm2cff.c new file mode 100644 index 0000000000000000000000000000000000000000..f2a311888fc0e22d03b69f20b4df71870ba5cf5b --- /dev/null +++ b/maca_morpho/src/fplm2cff.c @@ -0,0 +1,92 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "maca_morpho_context.h" +#include "feat_model.h" +#include "feat_vec.h" +#include "dico.h" +#include "util.h" +#include "vectorize.h" + +void decompose_feature_value(char *feature_value, char *feature, char *value) +{ + int i,j; + int l = 
strlen(feature_value); + int before = 1; + for(i=0; (i < l) && (feature_value[i] != '='); i++){ + feature[i] = feature_value[i]; + } + feature[i] = '\0'; + i++; + for(j=0; i<l; i++, j++){ + value[j] = feature_value[i]; + } + value[j] = '\0'; +} + + +int main(int argc, char *argv[]) +{ + context *ctx = context_read_options(argc, argv); + if(ctx->help){ + context_general_help_message(ctx); + context_language_help_message(ctx); + context_fplm_help_message(ctx); + context_maca_data_path_help_message(ctx); + context_features_filename_help_message(ctx); + context_features_model_help_message(ctx); + exit(1); + } + feat_vec *fv = feat_vec_new(10); + dico *dico_features = dico_new("dico_features", 1000); + /* feat_model *fm = feat_model_read(ctx->fm_filename, feat_lib_build(), ctx->verbose); */ + char form[100]; + char pos[100]; + char lemma[100]; + char morpho[100]; + FILE *F_fplm = NULL; + char buffer[1000]; + char feature_value[100]; + char feature[100]; + char value[100]; + char *token; + + + F_fplm = myfopen(ctx->fplm_filename, "r"); + + + while(fgets(buffer, 1000, F_fplm)){ + if(feof(F_fplm)) + break; + // printf("%s", buffer); + buffer[strlen(buffer) - 1] = '\0'; + sscanf(buffer, "%[^\t]\t%[^\t]\t%[^\t]\t%[^\n]\n", form, pos, lemma, morpho); + //printf("form = %s pos = %s lemma = %s morpho = %s\n", form, pos, lemma, morpho); + token = strtok(morpho, "|"); + do{ + //printf("token = %s\n", token); + decompose_feature_value(token, feature, value); + //printf("feature = %s value = %s\n", feature, value); + }while((token = strtok(NULL, "|"))); + + + } + fclose(F_fplm); +} + + /* + while(strcmp(form, "end")){ + fscanf(stdin, "%s", form); + printf("form = %s\n", form); + form2fv(form, fv, fm, dico_features, ADD_MODE); + //void feat_vec_print_string(feat_vec *fv, dico *dico_features); + feat_vec_print(stdout, fv); + } + //dico_print_fh(stdout, dico_features); + if(ctx->features_filename) + dico_print(ctx->features_filename, dico_features); + */ + + + diff --git 
a/maca_morpho/src/maca_morpho_context.c b/maca_morpho/src/maca_morpho_context.c new file mode 100644 index 0000000000000000000000000000000000000000..5a82e3cb56d9e11b4f5682e5aa3b1a1f59ccc4c6 --- /dev/null +++ b/maca_morpho/src/maca_morpho_context.c @@ -0,0 +1,166 @@ +#include<stdlib.h> +#include<stdio.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include "maca_morpho_context.h" +#include "util.h" + + +void context_set_linguistic_resources_filenames(context *ctx); + +void context_free(context *ctx) +{ + if(ctx->program_name) free(ctx->program_name); + if(ctx->fplm_filename) free(ctx->fplm_filename); + if(ctx->cfw_filename) free(ctx->cfw_filename); + if(ctx->language) free(ctx->language); + if(ctx->maca_data_path) free(ctx->maca_data_path); + free(ctx); +} + +context *context_new(void) +{ + context *ctx = (context *)memalloc(sizeof(context)); + + ctx->help = 0; + ctx->verbose = 0; + ctx->debug_mode = 0; + ctx->program_name = NULL; + ctx->fplm_filename = NULL; + ctx->language = strdup("fr"); + ctx->maca_data_path = NULL; + ctx->features_filename = NULL; + ctx->cfw_filename = NULL; + return ctx; +} + +void context_general_help_message(context *ctx) +{ + fprintf(stderr, "usage: %s [options]\n", ctx->program_name); + fprintf(stderr, "Options:\n"); + fprintf(stderr, "\t-h --help : print this message\n"); + fprintf(stderr, "\t-v --verbose : activate verbose mode\n"); + fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n"); +} + +void context_fplm_help_message(context *ctx){ + fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n"); +} + +void context_language_help_message(context *ctx){ + fprintf(stderr, "\t-L --language : identifier of the language to use\n"); +} + +void context_maca_data_path_help_message(context *ctx){ + fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n"); +} + +void context_fm_help_message(context *ctx){ + fprintf(stderr, "\t-F --fm <file> : 
feature model file name\n"); +} + +void context_features_filename_help_message(context *ctx){ + fprintf(stderr, "\t-x --feat <file> : features dictionary file name\n"); +} + +void context_weights_matrix_filename_help_message(context *ctx){ + fprintf(stderr, "\t-w --weights <file> : weight matrix (cfw) filename\n"); +} + +void context_features_model_help_message(context *ctx){ + fprintf(stderr, "\t-F --feat_model <file> : feature model file name\n"); +} + +context *context_read_options(int argc, char *argv[]) +{ + int c; + int option_index = 0; + context *ctx = context_new(); + + ctx->program_name = strdup(argv[0]); + + static struct option long_options[10] = + { + {"help", no_argument, 0, 'h'}, + {"verbose", no_argument, 0, 'v'}, + {"debug", no_argument, 0, 'd'}, + {"mcd", required_argument, 0, 'C'}, + {"language", required_argument, 0, 'L'}, + {"fplm", required_argument, 0, 'f'}, + {"maca_data_path", required_argument, 0, 'D'}, + {"fm", required_argument, 0, 'F'}, + {"feat", required_argument, 0, 'x'}, + {"weights", required_argument, 0, 'w'} + }; + optind = 0; + opterr = 0; + + while ((c = getopt_long (argc, argv, "hvdf:L:M:D:F:x:w:", long_options, &option_index)) != -1){ + switch (c) + { + case 'd': + ctx->debug_mode = 1; + break; + case 'h': + ctx->help = 1; + break; + case 'v': + ctx->verbose = 1; + break; + case 'f': + ctx->fplm_filename = strdup(optarg); + break; + case 'L': + ctx->language = strdup(optarg); + break; + case 'D': + ctx->maca_data_path = strdup(optarg); + break; + case 'F': + ctx->fm_filename = strdup(optarg); + break; + case 'x': + ctx->features_filename = strdup(optarg); + break; + case 'w': + ctx->cfw_filename = strdup(optarg); + break; + } + } + + context_set_linguistic_resources_filenames(ctx); + + return ctx; +} + +void context_set_linguistic_resources_filenames(context *ctx) +{ + char absolute_path[500]; + char absolute_filename[500]; + + absolute_path[0] = '\0'; + + if(ctx->maca_data_path) + strcat(absolute_path, ctx->maca_data_path); 
+ else { + char *e = getenv("MACAON_DIR"); + if (e != NULL) { + strcat(absolute_path, e); + } else { + fprintf(stderr, "ATTENTION: the environment variable MACAON_DIR is not defined\n"); + } + } + + + strcat(absolute_path, "/"); + strcat(absolute_path, ctx->language); + strcat(absolute_path, "/bin/"); + + if(!ctx->fplm_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_FPLM_FILENAME); + ctx->fplm_filename = strdup(absolute_filename); + } + +} diff --git a/maca_morpho/src/maca_morpho_context.h b/maca_morpho/src/maca_morpho_context.h new file mode 100644 index 0000000000000000000000000000000000000000..c1789a54631e3bcc3ba695573dc1cf784e177f32 --- /dev/null +++ b/maca_morpho/src/maca_morpho_context.h @@ -0,0 +1,37 @@ +#ifndef __MACA_MORPHO_CONTEXT__ +#define __MACA_MORPHO_CONTEXT__ + +#include "mcd.h" +#include <stdlib.h> + +#define DEFAULT_FPLM_FILENAME "fplm" + + + +typedef struct { + int help; + int verbose; + int debug_mode; + char *program_name; + char *fplm_filename; + char *language; + char *maca_data_path; + char *fm_filename; + char *features_filename; + char *cfw_filename; +} context; + + + +context *context_new(void); +void context_free(context *ctx); + +context *context_read_options(int argc, char *argv[]); +void context_general_help_message(context *ctx); +void context_language_help_message(context *ctx); +void context_fplm_help_message(context *ctx); +void context_maca_data_path_help_message(context *ctx); +void context_features_filename_help_message(context *ctx); +void context_weights_matrix_filename_help_message(context *ctx); +void context_features_model_help_message(context *ctx); +#endif diff --git a/maca_morpho/src/maca_morpho_feat_fct.c b/maca_morpho/src/maca_morpho_feat_fct.c new file mode 100644 index 0000000000000000000000000000000000000000..30b5ccab5b62b1350b4b33e84798eb90a8fde42f --- /dev/null +++ b/maca_morpho/src/maca_morpho_feat_fct.c @@ -0,0 +1,19 @@ +#include<stdio.h> +#include<stdlib.h> 
#include<string.h>

/*
 * Character found `distance` positions before the end of the NUL-terminated
 * string `input` (1 = last character, 2 = the one before it, ...).
 * Returns -1 when input is NULL or holds fewer than `distance` characters;
 * the previous code indexed before the start of the buffer in that case.
 */
static int suffix_char(void *input, size_t distance)
{
  const char *text = (const char *)input;
  size_t length;

  if(text == NULL) return -1;
  length = strlen(text);
  if(length < distance) return -1;
  return text[length - distance];
}

/* Feature function: last character of the form, -1 when there is none. */
int s1(void *input){return suffix_char(input, 1);}

/* Feature function: next-to-last character of the form, -1 when there is none. */
int s2(void *input){return suffix_char(input, 2);}
dico_print(ctx->features_filename, dico_features); + + + +} diff --git a/maca_morpho/src/vectorize.c b/maca_morpho/src/vectorize.c new file mode 100644 index 0000000000000000000000000000000000000000..f7f43136a5f0c6e1a7d3e53f3ff1ea406d589070 --- /dev/null +++ b/maca_morpho/src/vectorize.c @@ -0,0 +1,38 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include"vectorize.h" + +int get_feat_value(feat_model *fm, char *form, dico *dico_features, int feat_nb, int mode) +{ + feat_desc *fd = fm->array[feat_nb]; + int i; + int feat_val; + char str[10]; + + /* the name of the feature is built in fm->string and its value in the dictionnary (dico_features) is returned */ + fm->string[0] = '\0'; + for(i=0; i < fd->nbelem; i++){ + strcat(fm->string, fd->array[i]->name); + feat_val = fd->array[i]->fct(form); + sprintf(str, "%d", feat_val); + strcat(fm->string, str); + + /* catenate_int(fm->string, feat_val); */ + } + if(mode == LOOKUP_MODE){ + if(fm->string) + return dico_string2int(dico_features, fm->string); + } + return dico_add(dico_features, fm->string); +} + + +feat_vec *form2fv(char *form, feat_vec *fv, feat_model *fm, dico *dico_features, int mode) +{ + int i; + feat_vec_empty(fv); + for(i=0; i < fm->nbelem; i++) + feat_vec_add(fv, get_feat_value(fm, form, dico_features, i, mode)); + return fv; +} diff --git a/maca_morpho/src/vectorize.h b/maca_morpho/src/vectorize.h new file mode 100644 index 0000000000000000000000000000000000000000..c859605c68cc9cbcfdc0ad169871047acfd0bec0 --- /dev/null +++ b/maca_morpho/src/vectorize.h @@ -0,0 +1,14 @@ +#ifndef __VECTORIZE__ +#define __VECTORIZE__ + +#include"dico.h" +#include"feat_model.h" +#include"feat_vec.h" + +#define LOOKUP_MODE 1 +#define ADD_MODE 2 + + +feat_vec *form2fv(char *form, feat_vec *fv, feat_model *fm, dico *dico_features, int mode); + +#endif diff --git a/maca_tools/src/mcf2conll.c b/maca_tools/src/mcf2conll.c index 078be933aab23179c11fd238430864c720fa8f58..8228022002369a1103bd568f07b3e40011d0c917 
100644 --- a/maca_tools/src/mcf2conll.c +++ b/maca_tools/src/mcf2conll.c @@ -116,7 +116,7 @@ context *context_read_options(int argc, char *argv[]) ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); } else{ - ctx->mcd_struct = mcd_build_wplgfs(); + ctx->mcd_struct = mcd_build_wpmlgfs(); } return ctx; diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 8864690f5dc0def405570b0921d97b8229d5f609..be494e50df9fd828c3b9040d4816077bc9e3d343 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -17,14 +17,13 @@ set(SOURCES src/context.c # src/simple_decoder_tagger_bt.c src/stack.c src/config2feat_vec.c - src/depset.c +# src/depset.c src/config.c # src/queue.c # src/beam.c src/feat_types.c src/mvt.c src/mvt_stack.c - ) #compiling library @@ -33,6 +32,11 @@ add_library(transparse STATIC ${SOURCES}) target_link_libraries(transparse perceptron) #compiling, linking and installing executables +add_executable(maca_trans_parser_nn ./src/maca_trans_parser_nn.cc) +target_link_libraries(maca_trans_parser_nn perceptron) +target_link_libraries(maca_trans_parser_nn transparse) +target_link_libraries(maca_trans_parser_nn maca_common) +install (TARGETS maca_trans_parser_nn DESTINATION bin) add_executable(maca_trans_tagger_mcf2cff ./src/maca_trans_tagger_mcf2cff.c) target_link_libraries(maca_trans_tagger_mcf2cff perceptron) @@ -40,17 +44,23 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse) target_link_libraries(maca_trans_tagger_mcf2cff maca_common) install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin) +add_executable(maca_trans_morpho_mcf2cff ./src/maca_trans_morpho_mcf2cff.c) +target_link_libraries(maca_trans_morpho_mcf2cff perceptron) +target_link_libraries(maca_trans_morpho_mcf2cff transparse) +target_link_libraries(maca_trans_morpho_mcf2cff maca_common) +install (TARGETS maca_trans_morpho_mcf2cff DESTINATION bin) + #add_executable(maca_trans_tagger_mcf2cff_bt 
./src/maca_trans_tagger_mcf2cff_bt.c) #target_link_libraries(maca_trans_tagger_mcf2cff_bt perceptron) #target_link_libraries(maca_trans_tagger_mcf2cff_bt transparse) #target_link_libraries(maca_trans_tagger_mcf2cff_bt maca_common) #install (TARGETS maca_trans_tagger_mcf2cff_bt DESTINATION bin) -#add_executable(maca_trans_parser_mcf2fann ./src/maca_trans_parser_mcf2fann.c) -#target_link_libraries(maca_trans_parser_mcf2fann perceptron) -#target_link_libraries(maca_trans_parser_mcf2fann transparse) -#target_link_libraries(maca_trans_parser_mcf2fann maca_common) -#install (TARGETS maca_trans_parser_mcf2fann DESTINATION bin) +# add_executable(maca_trans_parser_mcf2fann ./src/maca_trans_parser_mcf2fann.c) +# target_link_libraries(maca_trans_parser_mcf2fann perceptron) +# target_link_libraries(maca_trans_parser_mcf2fann transparse) +# target_link_libraries(maca_trans_parser_mcf2fann maca_common) +# install (TARGETS maca_trans_parser_mcf2fann DESTINATION bin) #add_executable(maca_trans_parser_mcf2cff ./src/maca_trans_parser_mcf2cff.c) #target_link_libraries(maca_trans_parser_mcf2cff perceptron) @@ -100,6 +110,12 @@ target_link_libraries(maca_trans_tagger transparse) target_link_libraries(maca_trans_tagger maca_common) install (TARGETS maca_trans_tagger DESTINATION bin) +add_executable(maca_trans_morpho ./src/maca_trans_morpho.c) +target_link_libraries(maca_trans_morpho perceptron) +target_link_libraries(maca_trans_morpho transparse) +target_link_libraries(maca_trans_morpho maca_common) +install (TARGETS maca_trans_morpho DESTINATION bin) + #add_executable(maca_trans_tagger_bt ./src/maca_trans_tagger_bt.c) #target_link_libraries(maca_trans_tagger_bt perceptron) #target_link_libraries(maca_trans_tagger_bt transparse) diff --git a/maca_trans_parser/src/cff2fann.c b/maca_trans_parser/src/cff2fann.c index 93c49b32e47af548612b5b4b67263dd9ffa1cb5d..294d8215d3e3de25bd16503dc6f65dcefa637c4d 100644 --- a/maca_trans_parser/src/cff2fann.c +++ b/maca_trans_parser/src/cff2fann.c @@ -6,6 
+6,8 @@ #include"context.h" #include"util.h" #include"cf_file.h" +#include"feat_lib.h" +#include"feat_types.h" void cff2fann_help_message(context *ctx) @@ -28,9 +30,14 @@ void cff2fann_help_message(context *ctx) void cff2fann_check_options(context *ctx) { - if(!ctx->input_filename + if(ctx->cff_filename) fprintf(stderr, "cff filename = %s\n", ctx->cff_filename); + if(ctx->mcd_filename) fprintf(stderr, "mcd filename = %s\n", ctx->mcd_filename); + if(ctx->features_model_filename) fprintf(stderr, "fm filename = %s\n", ctx->features_model_filename); + + if(!ctx->cff_filename || ctx->help - /* || !ctx->mcd_filename */ + || !ctx->mcd_filename + || !ctx->features_model_filename /* || !(ctx->cff_filename || ctx->fann_filename) */ ){ cff2fann_help_message(ctx); @@ -45,6 +52,68 @@ void one_hot_print(FILE *f, int val, int dim) fprintf(f, "%d ", (i == val)? 1 : 0); } +void print_header(mcd *m, feat_model *fm) +{ + int i; + feat_desc *fd; + simple_feat_desc *sfd; + + printf("OUT"); + + for(i=0; i <fm->nbelem; i++){ + fd = fm->array[i]; + if(fd->nbelem > 1){ + printf("feature %d is a complex feature, skipping it\n", i); + } + else{ + sfd = fd->array[0]; + printf("\t%s", sfd->name); + } + } + + printf("\n"); + printf("OUT"); + for(i=0; i <fm->nbelem; i++){ + fd = fm->array[i]; + if(fd->nbelem > 1){ + printf("feature %d is a complex feature, skipping it\n", i); + } + else{ + sfd = fd->array[0]; + if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;} + if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;} + if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;} + if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;} + if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;} + if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;} + printf("\tUNK"); + } + } + printf("\n"); + /* + for(i=0; i < m->nb_col; i++){ + if(m->representation[i] == MCD_REPRESENTATION_EMB){ + printf("\tEMB"); + continue; + } + + if(m->representation[i] == MCD_REPRESENTATION_NULL){ 
+ continue; + } + + if(m->representation[i] == MCD_REPRESENTATION_VOCAB){ + printf("\t%s", m->wf_str[i]); + continue; + } + + if(m->representation[i] == MCD_REPRESENTATION_INT){ + printf("\tINT"); + continue; + } + } + printf("\n");*/ +} + void cff2fann(context *ctx) { char buffer[10000]; @@ -52,41 +121,63 @@ void cff2fann(context *ctx) int col_nb; int feat_type; mcd *m = ctx->mcd_struct; - FILE *f = myfopen(ctx->input_filename, "r"); + FILE *f = myfopen(ctx->cff_filename, "r"); int val; + dico *vocab; + char feature_type[64]; + int feature_valindex; + int count = 0; + + vocab = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + /* printf("%d %d\n", 1, ctx->features_model->nbelem); */ + + print_header(m, ctx->features_model); while(fgets(buffer, 10000, f)){ - /* printf("%s", buffer); */ - /* printf("\n"); */ + /* printf("%s", buffer); */ + /* printf("\n"); */ token = strtok(buffer, "\t"); col_nb = 0; + if (count % 100 == 0) + fprintf(stderr, "%d\r", count); while(token){ - /* printf("col = %d token = %s max = %d\n", col_nb, token, max_array[col_nb]); */ + /* printf("col = %d token = %s max = %d\n", col_nb, token, max_array[col_nb]); */ val = atoi(token); if(col_nb == 0){ - one_hot_print(stdout, val, ctx->mvt_nb); - printf("\n"); - } - else{ - feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1); - /* printf("feat_type = %d\n", feat_type); */ - int mcd_col = m->wf2col[feat_type]; - /* printf("representation = %d\n", m->representation[mcd_col]); */ - if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){ - /* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */ - word_emb_print(stdout, m->word_emb_array[mcd_col], val); - printf("\n"); - } - if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){ - /* printf("it is a vocab\n"); */ - one_hot_print(stdout, val, m->dico_array[mcd_col]->nbelem); - printf("\n"); - } + /* one_hot_print(stdout, val, ctx->mvt_nb); */ + /* printf("\n"); */ + printf("%d", 
val); + } else { + sscanf(dico_int2string(vocab, val), "%[^==]==%d", feature_type, &feature_valindex); + /* printf("feature_type = %s\n", feature_type); */ + feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1); + /* printf("feat_type = %d\n", feat_type); */ + /* printf("%d: ", col_nb); */ + int mcd_col = m->wf2col[feat_type]; + /* printf("representation = %d\n", m->representation[mcd_col]); */ + if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){ + /* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */ + /* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */ + /* printf("\n"); */ + printf("\t%d", feature_valindex); + + } else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){ + /* printf("it is a vocab\n"); */ + /* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem); */ + /* printf("\n"); */ + printf("\t%d", feature_valindex); + } else { + printf("\t%d", feature_valindex); + } } col_nb++; token = strtok(NULL , "\t"); } + printf("\n"); + count++; } + fclose(f); } int main(int argc, char *argv[]) @@ -102,11 +193,11 @@ int main(int argc, char *argv[]) ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); - look_for_number_of_features_and_classes(ctx->input_filename, &nb_feat, &nb_class); + look_for_number_of_features_and_classes(ctx->cff_filename, &nb_feat, &nb_class); ctx->mvt_nb = nb_class; mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, 1); - + cff2fann(ctx); return 0; } diff --git a/maca_trans_parser/src/compare_traces.c b/maca_trans_parser/src/compare_traces.c index f6567c3df39b0938ad658283241576387d3914b7..baea6fbbaf88b70e1f42302b93b07a26e6bb5299 100644 --- a/maca_trans_parser/src/compare_traces.c +++ b/maca_trans_parser/src/compare_traces.c @@ -26,7 +26,7 @@ int configuration_equal(configuration *c1, configuration *c2) configuration *configuration_new(int index, char *stack, char *movement, float score) { - 
configuration *c = malloc(sizeof(configuration)); + configuration *c = (configuration *)malloc(sizeof(configuration)); if(c == NULL) return NULL; c->index = index; c->stack = stack; diff --git a/maca_trans_parser/src/config2feat_vec.c b/maca_trans_parser/src/config2feat_vec.c index 0b63b1633ca98ed0d4b7c6d6eea27e37a3e7001d..362125b7c06a3751e5fcf665b3d0423f7228c877 100644 --- a/maca_trans_parser/src/config2feat_vec.c +++ b/maca_trans_parser/src/config2feat_vec.c @@ -5,7 +5,6 @@ #include"feat_model.h" #include"config2feat_vec.h" - int get_feat_value_fann(feat_model *fm, config *c, int feat_nb) { feat_desc *fd = fm->array[feat_nb]; @@ -23,6 +22,7 @@ int get_feat_value_cff(feat_model *fm, config *c, dico *dico_features, int feat_ fm->string[0] = '\0'; for(i=0; i < fd->nbelem; i++){ strcat(fm->string, fd->array[i]->name); + strcat(fm->string, "=="); feat_val = fd->array[i]->fct(c); catenate_int(fm->string, feat_val); } diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 72c2d61ab47097612cca44a7d908d4406fd2129a..2b8b165bd14e28565a1efe46fce3be0500e6f6e0 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -25,6 +25,10 @@ void context_free(context *ctx) if(ctx->vocabs_filename) free(ctx->vocabs_filename); if(ctx->fplm_filename) free(ctx->fplm_filename); + if(ctx->json_filename) free(ctx->json_filename); + if(ctx->dnn_model_filename) free(ctx->dnn_model_filename); + + if (ctx->mcd_struct) mcd_free(ctx->mcd_struct); @@ -44,6 +48,7 @@ void context_free(context *ctx) if(ctx->f2p) form2pos_free(ctx->f2p); + free(ctx); } @@ -92,6 +97,11 @@ context *context_new(void) ctx->ifpls = 1; ctx->trace_mode = 0; + + + ctx->json_filename = NULL; + ctx->dnn_model_filename = NULL; + return ctx; } @@ -167,6 +177,14 @@ void context_trace_mode_help_message(context *ctx){ void context_debug_help_message(context *ctx){ fprintf(stderr, "\t-d --debug : activate debug mode (default is false)\n"); } +void 
context_json_help_message(context *ctx){ + fprintf(stderr, "\t-J --json : json description of keras model\n"); +} +void context_dnn_model_help_message(context *ctx){ + fprintf(stderr, "\t-N --dnn_model : weight file for dnn\n"); +} + + context *context_read_options(int argc, char *argv[]) { @@ -176,7 +194,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[22] = + static struct option long_options[24] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -199,13 +217,15 @@ context *context_read_options(int argc, char *argv[]) {"maca_data_path", required_argument, 0, 'D'}, {"root_label", required_argument, 0, 'R'}, {"f2p", required_argument, 0, 'P'}, - {"traces", required_argument, 0, 'T'} + {"traces", required_argument, 0, 'T'}, + {"json", required_argument, 0, 'J'}, + {"dnn_model", required_argument, 0, 'N'} }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:J:N:", long_options, &option_index)) != -1){ switch (c) { case 'h': @@ -277,13 +297,20 @@ context *context_read_options(int argc, char *argv[]) ctx->f2p_filename = strdup(optarg); ctx->f2p = form2pos_read(ctx->f2p_filename); break; + case 'N': + ctx->dnn_model_filename = strdup(optarg); + break; + case 'J': + ctx->json_filename = strdup(optarg); + break; } } if(ctx->mcd_filename) ctx->mcd_struct = mcd_read(ctx->mcd_filename, ctx->verbose); else - ctx->mcd_struct = mcd_build_wplgfs(); + ctx->mcd_struct = mcd_build_wpmlgfs(); + /* ctx->mcd_struct = mcd_build_wplgfs(); */ /* initialize maca_data_path field */ diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 932e6717e28ebcec122951f69eb69ba05723869f..752d7604488818ef4a6aa109065c88317ebbfa0a 100644 --- a/maca_trans_parser/src/context.h +++ 
b/maca_trans_parser/src/context.h @@ -14,11 +14,24 @@ #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" +#define DEFAULT_MULTI_COL_DESC_MORPHO_FILENAME "maca_trans_morpho.mcd" +#define DEFAULT_FEATURES_MODEL_MORPHO_FILENAME "maca_trans_morpho.fm" +#define DEFAULT_VOCABS_MORPHO_FILENAME "maca_trans_morpho.vocab" +#define DEFAULT_MODEL_MORPHO_FILENAME "maca_trans_morpho.model" + #define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd" #define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm" #define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab" #define DEFAULT_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.model" +#define DEFAULT_MULTI_COL_DESC_PARSER_NN_FILENAME "maca_trans_parser_nn.mcd" +#define DEFAULT_FEATURES_MODEL_PARSER_NN_FILENAME "maca_trans_parser_nn.fm" +#define DEFAULT_VOCABS_PARSER_NN_FILENAME "maca_trans_parser_nn.vocab" +#define DEFAULT_MODEL_PARSER_NN_FILENAME "maca_trans_parser_nn.weights" +#define DEFAULT_JSON_PARSER_NN_FILENAME "maca_trans_parser_nn.json" + + + #define DEFAULT_F2P_FILENAME "fP" #define DEFAULT_FPLM_FILENAME "fplm" @@ -33,7 +46,6 @@ typedef struct { char *program_name; char *input_filename; char *perc_model_filename; - char *dnn_model_filename; char *cff_filename; char *fann_filename; char *stag_desc_filename; @@ -67,6 +79,10 @@ typedef struct { int conll; int ifpls; int trace_mode; + + char *json_filename; + char *dnn_model_filename; + } context; context *context_new(void); @@ -103,6 +119,7 @@ void context_input_help_message(context *ctx); void context_root_label_help_message(context *ctx); void context_debug_help_message(context *ctx); - +void context_json_help_message(context *ctx); +void context_dnn_model_help_message(context *ctx); #endif diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c index 
fdb525e33391bb91f27236bcfe8b80c36ac69bf4..e4e02de4707a7fac892195ded4ae4907929efcf3 100644 --- a/maca_trans_parser/src/feat_fct.c +++ b/maca_trans_parser/src/feat_fct.c @@ -154,6 +154,20 @@ int s3Z(void *c) {return word_get_Z(stack_s3(config_get_stack((config *) c)));} /* words in the buffer */ +int b0s1(void *c){return word_get_s1(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s2(void *c){return word_get_s2(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s3(void *c){return word_get_s3(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s4(void *c){return word_get_s4(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s5(void *c){return word_get_s5(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s6(void *c){return word_get_s6(word_buffer_b0(config_get_buffer((config *) c)));} + +int b0p1(void *c){return word_get_p1(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p2(void *c){return word_get_p2(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p3(void *c){return word_get_p3(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p4(void *c){return word_get_p4(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p5(void *c){return word_get_p5(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p6(void *c){return word_get_p6(word_buffer_b0(config_get_buffer((config *) c)));} + int b0g(void *c) {return (word_get_gov(word_buffer_b0(config_get_buffer((config *) c))) == WORD_INVALID_GOV) ? 
0 : 1;} int b0sf(void *c) {return word_get_label(word_buffer_b0(config_get_buffer((config *) c)));} @@ -416,7 +430,7 @@ int bm3Z(void *c) {return word_get_Z(word_buffer_bm3(config_get_buffer((config * /* structural features */ int ldep_s0r(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s0(config_get_stack((config *) c)); int i; word *dep; @@ -436,7 +450,7 @@ int ldep_s0r(void *input){ } int ldep_s0p(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s0(config_get_stack((config *) c)); int i; word *dep; @@ -456,7 +470,7 @@ int ldep_s0p(void *input){ } int ldep_s1r(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s1(config_get_stack((config *) c)); int i; word *dep; @@ -476,7 +490,7 @@ int ldep_s1r(void *input){ } int ldep_s1p(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s1(config_get_stack((config *) c)); int i; word *dep; @@ -496,7 +510,7 @@ int ldep_s1p(void *input){ } int ldep_b0r(void *input){ - config *c = input; + config *c = (config *)input; word *gov = word_buffer_b0(config_get_buffer((config *) c)); int i; word *dep; @@ -516,7 +530,7 @@ int ldep_b0r(void *input){ } int ldep_b0p(void *input){ - config *c = input; + config *c = (config *)input; word *gov = word_buffer_b0(config_get_buffer((config *) c)); int i; word *dep; @@ -536,7 +550,7 @@ int ldep_b0p(void *input){ } int rdep_s0r(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s0(config_get_stack((config *) c)); int i; word *dep; @@ -556,7 +570,7 @@ int rdep_s0r(void *input){ } int rdep_s0p(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s0(config_get_stack((config *) c)); int i; word *dep; @@ -576,7 +590,7 @@ int rdep_s0p(void *input){ } int rdep_s1p(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s1(config_get_stack((config *) c)); int i; word *dep; @@ 
-596,7 +610,7 @@ int rdep_s1p(void *input){ } int rdep_s1r(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s1(config_get_stack((config *) c)); int i; word *dep; @@ -616,7 +630,7 @@ int rdep_s1r(void *input){ } int rdep_b0r(void *input){ - config *c = input; + config *c = (config *)input; word *gov = word_buffer_b0(config_get_buffer((config *) c)); int i; word *dep; @@ -636,7 +650,7 @@ int rdep_b0r(void *input){ } int rdep_b0p(void *input){ - config *c = input; + config *c = (config *)input; word *gov = word_buffer_b0(config_get_buffer((config *) c)); int i; word *dep; @@ -657,7 +671,7 @@ int rdep_b0p(void *input){ int ndep_b0(void *input){ - config *c = input; + config *c = (config *)input; word *gov = word_buffer_b0(config_get_buffer((config *) c)); int i; int n = 0; @@ -684,7 +698,7 @@ int ndep_b0(void *input){ } int ndep_s0(void *input){ - config *c = input; + config *c = (config *)input; word *gov = stack_s0(config_get_stack((config *) c)); int i; int n = 0; @@ -714,7 +728,7 @@ int ndep_s0(void *input){ /* distance features */ int dist_s0_b0(void *input){ - config *c = input; + config *c = (config *)input; int dist; if(stack_is_empty(config_get_stack((config *) c)) || word_buffer_is_empty(config_get_buffer((config *) c))) @@ -729,7 +743,7 @@ int dist_s0_b0(void *input){ /* stack height */ int sh(void *input) { -config *c = input; +config *c = (config *)input; return (config_get_stack((config *) c)->top > 7)? 7 : config_get_stack((config *) c)->top; /* return (stack_nbelem(config_get_stack((config *) c)) > 0)? 1 : 0; */ @@ -737,7 +751,7 @@ config *c = input; /* buffer size */ int bh(void *input) { -config *c = input; +config *c = (config *)input; return (config_get_buffer((config *) c)->size > 7)? 
7 : config_get_buffer((config *) c)->size; } @@ -779,21 +793,21 @@ int t4(void *c) int mvt0(void *input) { - config *c = input; + config *c = (config *)input; if(c->vcode_array == NULL) return -1; return c->vcode_array[0].class_code; } int mvt1(void *input) { - config *c = input; + config *c = (config *)input; if(c->vcode_array == NULL) return -1; return c->vcode_array[1].class_code; } int delta1(void *input) { - config *c = input; + config *c = (config *)input; if(c->vcode_array == NULL) return -1; int delta = (int) (c->vcode_array[0].score - c->vcode_array[1].score); return (delta >= 10)? 10: delta; @@ -801,14 +815,14 @@ int delta1(void *input) int mvt2(void *input) { - config *c = input; + config *c = (config *)input; if(c->vcode_array == NULL) return -1; return c->vcode_array[2].class_code; } int delta2(void *input) { - config *c = input; + config *c = (config *)input; if(c->vcode_array == NULL) return -1; int delta = (int) (c->vcode_array[0].score - c->vcode_array[2].score); return (delta >= 10)? 10: delta; @@ -816,14 +830,14 @@ int delta2(void *input) int mvt3(void *input) { - config *c = input; + config *c = (config *)input; if(c->vcode_array == NULL) return -1; return c->vcode_array[3].class_code; } int delta3(void *input) { - config *c = input; + config *c = (config *)input; if(c->vcode_array == NULL) return -1; int delta = (int) (c->vcode_array[0].score - c->vcode_array[3].score); return (delta >= 10)? 
10: delta; @@ -869,7 +883,8 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"s0X", s0X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"s0Y", s0Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"s0Z", s0Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"s0U1", s0U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"s0U1", s0U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s0U1", s0U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s0sgn", s0sgn); feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"s1g", s1g); @@ -1015,9 +1030,25 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"b0X", b0X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b0Y", b0Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b0Z", b0Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b0U1", b0U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b0U1", b0U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0U1", b0U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0sgn", b0sgn); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s1", b0s1); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s2", b0s2); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s3", b0s3); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s4", b0s4); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s5", b0s5); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s6", b0s6); + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p1", b0p1); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p2", b0p2); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p3", b0p3); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p4", b0p4); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p5", b0p5); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p6", b0p6); + + feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"bm1f", bm1f); feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"bm1l", bm1l); feat_lib_add(fl, FEAT_TYPE_CPOS, (char *)"bm1c", bm1c); @@ -1051,7 +1082,8 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"bm1X", bm1X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"bm1Y", bm1Y); feat_lib_add(fl, FEAT_TYPE_Y, 
(char *)"bm1Z", bm1Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"bm1U1", bm1U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"bm1U1", bm1U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"bm1U1", bm1U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"bm1sgn", bm1sgn); feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"bm2f", bm2f); @@ -1159,7 +1191,8 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"b1X", b1X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b1Y", b1Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b1Z", b1Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b1U1", b1U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b1U1", b1U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b1U1", b1U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b1sgn", b1sgn); feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b2f", b2f); @@ -1249,22 +1282,35 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_b0p", ldep_b0p); feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_b0p", rdep_b0p); - feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_b0", ndep_b0); - feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_s0", ndep_s0); + /* feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_b0", ndep_b0); */ + /* feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_s0", ndep_s0); */ + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"ndep_b0", ndep_b0); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"ndep_s0", ndep_s0); /* distance features */ - feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"dist_s0_b0", dist_s0_b0); + /* feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"dist_s0_b0", dist_s0_b0); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"dist_s0_b0", dist_s0_b0); /* configurational features */ - feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"sh", sh); - feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"bh", bh); + /* feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"sh", sh); */ + /* feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"bh", bh); */ + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"sh", sh); + feat_lib_add(fl, 
FEAT_TYPE_INT, (char *)"bh", bh); + /* feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"dh", dh); */ - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t1", t1); - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t2", t2); - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t3", t3); - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t4", t4); + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t1", t1); */ + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t2", t2); */ + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t3", t3); */ + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t4", t4); */ + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t1", t1); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t2", t2); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t3", t3); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t4", t4); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"mvt0", mvt0); diff --git a/maca_trans_parser/src/global_feat_vec.c b/maca_trans_parser/src/global_feat_vec.c index c8eebb332e99437de68562d10d95b97a15c93895..94ecffee668b6fab8081d39a427baf3edab78363 100644 --- a/maca_trans_parser/src/global_feat_vec.c +++ b/maca_trans_parser/src/global_feat_vec.c @@ -16,7 +16,7 @@ void global_feat_vec_print(global_feat_vec *gfv) global_feat_vec *global_feat_vec_new(void) { - global_feat_vec *gfv = memalloc(sizeof(global_feat_vec)); + global_feat_vec *gfv = (global_feat_vec *)memalloc(sizeof(global_feat_vec)); gfv->nbelem = 0; gfv->array = NULL; return gfv; @@ -24,7 +24,7 @@ global_feat_vec *global_feat_vec_new(void) void global_feat_vec_add(global_feat_vec *gfv, int pred_mvt, feat_vec *fv) { - global_feat_vec_elt *elt = memalloc(sizeof(global_feat_vec_elt)); + global_feat_vec_elt *elt = (global_feat_vec_elt *) memalloc(sizeof(global_feat_vec_elt)); elt->pred_mvt = pred_mvt; /* elt->oracle_mvt = oracle_mvt; */ elt->fv = fv; diff --git a/maca_trans_parser/src/json.h b/maca_trans_parser/src/json.h new file mode 100644 index 0000000000000000000000000000000000000000..b4087ac609aa79ef290496c952da93a7818d916a --- /dev/null +++ 
b/maca_trans_parser/src/json.h
@@ -0,0 +1,256 @@
+#pragma once
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <vector>
+#include <map>
+
+namespace json {
+
+    typedef enum { Null, True, False, Number, String, List, Object } Type;
+
+    // Return `subject` with every occurrence of `search` replaced by `replace`.
+    // Declared inline because this header may be included from several translation units.
+    inline std::string replace_all(std::string subject, const std::string& search, const std::string& replace) {
+        if(search.empty()) return subject; // an empty pattern matches everywhere and would never terminate
+        size_t pos = 0;
+        while ((pos = subject.find(search, pos)) != std::string::npos) {
+            subject.replace(pos, search.length(), replace);
+            pos += replace.length();
+        }
+        return subject;
+    }
+
+    // A JSON value. `type` selects which of number / string / list / object is meaningful;
+    // the other members keep their default (0 / empty) state.
+    class Value {
+        public:
+            Type type;
+            double number;
+            std::string string;
+            std::vector<Value> list;
+            std::map<std::string, Value> object;
+        public:
+            Value() : type(Null), number(0) { }
+            Value(Type _type) : type(_type), number(0) { }
+            Value(const std::string& text) : type(String), number(0), string(text) { }
+            Value(double _number) : type(Number), number(_number) { }
+            void append(const Value& v) { list.push_back(v); }
+            Value& operator[](int index) { return list[index]; }
+            Value& operator[](const std::string& key) {
+                return object[key]; }
+            int length() const {
+                if(type == List) { return list.size(); }
+                if(type == Object) { return object.size(); }
+                if(type == String) { return string.length(); }
+                return 1;
+            }
+            std::string to_json() const {
+                std::stringstream out;
+                if(type == String) out << "\"" << replace_all(string, "\"", "\\\"") << "\"";
+                else if(type == Null) out << "null";
+                else if(type == True) out << "true";
+                else if(type == False) out << "false";
+                else if(type == Number) out << number;
+                else if(type == List) {
+                    out << "[";
+                    for(int i = 0; i < length(); i++) {
+                        if(i > 0) out << ",";
+                        out << list[i].to_json();
+                    }
+                    out << "]";
+                } else if(type == Object) {
+                    out << "{";
+                    for(std::map<std::string, Value>::const_iterator i = begin(); i != end(); i++) {
+                        if(i != begin()) out << ",";
+                        out << "\"" << replace_all(i->first, "\"", "\\\"") << "\":" << i->second.to_json(); // keys are escaped like string values
+                    }
+                    out << "}";
+                } else {
+                    out << "error";
+                }
+                return out.str();
+            }
+            std::string to_string() const { return string; }
+            int to_int() const { return (int) number; }
+            double to_double() const { return (double) number; }
+            std::map<std::string, Value>::const_iterator begin() const { return object.begin(); }
+            std::map<std::string, Value>::const_iterator end() const { return object.end(); }
+    };
+
+    // Forward iterator over the key/value pairs of an Object value; keeps a reference to `target`.
+    class Iterator {
+        const Value& backend;
+        std::map<std::string, Value>::const_iterator iterator;
+        public:
+        Iterator(const Value& target) : backend(target), iterator(target.begin()) { }
+        const std::string key() { return iterator->first; }
+        const Value value() { return iterator->second; }
+        bool hasNext() { return iterator != backend.end(); }
+        void next() { iterator++; }
+    };
+
+    // Recursive-descent parser over a NUL-terminated buffer; syntax errors are thrown as std::string.
+    class Parser {
+        const char* input;
+
+        void error() {
+            throw std::string("error at \"" + std::string(input) + "\"");
+        }
+
+        void space() {
+            while(*input != '\0' && (*input == ' ' || *input == '\t' || *input == '\n' || *input == '\r')) input++;
+        }
+
+        Value null() {
+            input += 4;
+            return Value(Null);
+        }
+
+        Value _false() {
+            input += 5;
+            return Value(False);
+        }
+
+        Value _true() {
+            input += 4;
+            return Value(True);
+        }
+
+        // Parse a numeric literal with strtod; raise a parse error when no character is consumed.
+        Value number() {
+            char* end = NULL;
+            double value = strtod(input, &end);
+            if(end == input) error();
+            input = end;
+            return Value(value);
+        }
+
+        // Parse a quoted string. Backslash escapes are consumed as pairs so that an escaped
+        // backslash before the closing quote ends the string; only \" is decoded, others are kept verbatim.
+        Value string() {
+            input++;
+            std::string text;
+            while(*input != '\0') {
+                if(*input == '"') {
+                    input++;
+                    return Value(text);
+                }
+                if(*input == '\\' && input[1] != '\0') {
+                    if(input[1] != '"') text += '\\';
+                    input++;
+                }
+                text += *input;
+                input++;
+            }
+            error();
+            return Value(Null);
+        }
+
+        Value list() {
+            input++;
+            Value l(List);
+            while(*input != '\0') {
+                space();
+                if(*input == ']') {
+                    input++;
+                    return l;
+                }
+                l.append(value());
+                space();
+                if(*input == ']') {
+                    input++;
+                    return l;
+                } else if(*input == ',') {
+                    input++;
+                } else {
+                    error();
+                }
+            }
+            error();
+            return Value(Null);
+        }
+
+        Value object() {
+            input++;
+            Value o(Object);
+            while(*input != '\0') {
+                space();
+                if(*input == '}') {
+                    input++;
+                    return o;
+                }
+                std::string key = string().to_string();
+                space();
+                if(*input != ':') error();
+                else {
+                    input++;
+                    space();
+                    o[key] = value();
+                    space();
+                }
+                if(*input == '}') {
+                    input++;
+                    return o;
+                } else if(*input == ',') {
+                    input++;
+                } else {
+                    error();
+                    return Value(Null);
+                }
+            }
+            error();
+            return Value(Null);
+        }
+
+        Value value() {
+            if(*input == '{') return object();
+            else if(*input == '[') return list();
+            else if(*input == '"') return string();
+            else if(!strncmp(input, "true", 4)) return _true();
+            else if(!strncmp(input, "false", 5)) return _false();
+            else if(!strncmp(input, "null", 4)) return null();
+            else return number();
+        }
+
+        public:
+        Value parse(const char* p) {
+            input = p;
+            space();
+            Value v = value();
+            space();
+            if(*input != '\0') error();
+            return v;
+        }
+    };
+
+    inline Value parse(const std::string& str) {
+        return Parser().parse(str.c_str());
+    }
+    inline Value parse(const char* input) {
+        return Parser().parse(input);
+    }
+    // Read a whole file and parse it; prints a message and exits when the file cannot be read.
+    inline Value parse_file(const char* filename) {
+        FILE* fp = fopen(filename, "r");
+        if(!fp) { std::cerr << "ERROR: cannot load file \"" << filename << "\"\n"; exit(1); }
+        fseek(fp, 0, SEEK_END);
+        long length = ftell(fp);
+        if(length < 0) { std::cerr << "ERROR: could not read content of \"" << filename << "\"\n"; exit(1); }
+        char* content = new char[length + 1];
+        fseek(fp, 0, SEEK_SET);
+        size_t read = fread(content, length, 1, fp);
+        if(read != 1) { std::cerr << "ERROR: could not read content of \"" << filename << "\"\n"; exit(1); }
+        fclose(fp);
+        content[length] = '\0';
+        Value v = parse(content);
+        delete[] content;
+        return v;
+    }
+
+}
diff --git a/maca_trans_parser/src/keras.h b/maca_trans_parser/src/keras.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd18939cd0ff994e59134fc4350afd5807509f15
--- /dev/null
+++ b/maca_trans_parser/src/keras.h
@@ -0,0 +1,208 @@
+#include <iostream>
+#include <cassert>
+
+#include "json.h"
+#include "matrix.h"
+
+class Node {
+    protected:
+        json::Value config;
+        std::vector<Node*> inbound;
+        std::string name;
+    public:
+        Node() { name = "NONE"; }
+        virtual ~Node() { }
+        Node(const json::Value& _config) : config(_config) { name = config["name"].to_string();}
+        void setup(std::map<std::string, Node*>& nodes) {
+            json::Value inbound_nodes = config["inbound_nodes"];
+            if(inbound_nodes.length() > 0) {
+                for(int j = 0; j < inbound_nodes[0].length(); j++) {
+                    std::string node_name = inbound_nodes[0][j][0].to_string();
+                    //std::cerr << node_name << "->" << name << "\n";
+                    if(nodes.find(node_name) != nodes.end()) {
+                        inbound.push_back(nodes[node_name]);
+                    } else {
+                        std::cerr << "ERROR: cannot find inbound layer \"" << node_name << "\" when setting up layer \"" << name << "\"\n";
+                        exit(1);
+                    }
+                }
+            }
+        }
+
+        virtual void set_input(const Matrix<float> & x) { }
+        virtual Matrix<float> get_output() {
+            assert(inbound.size() == 1);
+            assert(inbound[0] != NULL);
+            return forward(inbound[0]->get_output());
+        }
+        virtual Matrix<float> forward(const Matrix<float> & x) { return x; }
+};
+
+class Embedding : public Node {
+    protected:
+        Matrix<float> W;
+    public:
+        Embedding(const json::Value& _config, FILE* storage) : Node(_config) {
+            fseek(storage, config["weights"]["W"].to_int(), SEEK_SET);
+            W.load(storage);
+            //W.print("W");
+        }
+        virtual Matrix<float> forward(const Matrix<float> & x) {
+            Matrix<float> output(x.rows, x.cols * W.cols);
+            for(int i = 0; i < x.rows; i++) {
+                for(int j = 0; j < x.cols; j++) {
+                    int id = (int)x.at(i, j);
+                    if(id < 0 || id >= W.rows) {
+                        std::cerr << "WARNING: unexpected embedding id " << id << " for row " << i << " in layer " << name << ", mapping to 0\n";
+                        id = 0;
+                    }
+                    //assert(id >= 0 && id < W.rows);
+                    output[i].slice(j * W.cols, W.cols) = W[id];
+                }
+            }
+            return output;
+        }
+};
+
+class Dense : public Node {
+    protected:
+        Matrix<float> W, b;
+        Matrix<float> (*activation)(const Matrix<float>&);
+    public:
+        Dense(const json::Value& _config, FILE* storage) : Node(_config) {
+            fseek(storage, config["weights"]["W"].to_int(), SEEK_SET);
+            W.load(storage);
+            fseek(storage, config["weights"]["b"].to_int(), SEEK_SET);
+            b.load(storage);
+            std::string function = config["config"]["activation"].to_string();
+            if(function == "linear") activation = Matrix<float>::identity;
+            else if(function == "tanh") activation = Matrix<float>::tanh;
+            else if(function == "sigmoid") activation = Matrix<float>::sigmoid;
+            else if(function == "relu") activation = Matrix<float>::relu;
+            else if(function == "softmax") activation = Matrix<float>::softmax;
+            else {
+                std::cerr << "ERROR: unsupported activation function \"" << function << "\"\n";
+                exit(1);
+            }
+        }
+        virtual Matrix<float> forward(const Matrix<float> & x) {
+            return activation(x.dot(W) + b);
+        }
+};
+
+class Input : public Node {
+        Matrix<float> input;
+    public:
+        Input(const json::Value& _config) : Node(_config) { }
+        void set_input(const Matrix<float> &x) { input = x; }
+        Matrix<float> get_output() { return input; }
+};
+
+class Merge : public Node {
+    public:
+        Merge(const json::Value& _config) : Node(_config) { }
+        Matrix<float> get_output() {
+            int cols = 0;
+            int rows = 0;
+            std::vector<Matrix<float> > inputs(inbound.size());
+            for(size_t i = 0; i < inbound.size(); i++) {
+                inputs[i] = inbound[i]->get_output();
+                cols += inputs[i].cols;
+                if(rows < inputs[i].rows) rows = inputs[i].rows;
+            }
+            //std::cerr << "Merge: " << rows << "x" << cols << "\n";
+            Matrix<float> output = Matrix<float>::zeros(rows, cols); // zero-filled: inputs with fewer rows leave cells untouched
+            int offset = 0;
+            for(size_t i = 0; i < inputs.size(); i++) {
+                for(int j = 0; j < inputs[i].rows; j++) {
+                    output[j].slice(offset, inputs[i].cols) = inputs[i][j];
+                }
+                offset += inputs[i].cols;
+            }
+            return output;
+        }
+};
+
+class Identity : public Node {
+    public:
+        Identity(const json::Value& _config) : Node(_config) { }
+        Matrix<float> get_output() {
+            assert(inbound.size() == 1);
+            return inbound[0]->get_output();
+        }
+};
+
+class Model {
+    std::map<std::string, Node*> nodes;
+    std::vector<Node*> output_nodes;
+    std::vector<Node*> input_nodes;
+
+    public:
+    // NOTE(review): the destructor deletes every node but no copy constructor/assignment is defined; a real copy of a Model would double-delete.
+    ~Model() {
+        for(std::map<std::string, Node*>::iterator i = nodes.begin(); i != nodes.end(); i++) {
+            delete i->second;
+        }
+    }
+
+    int num_inputs() { return input_nodes.size(); }
+    int num_outputs() { return output_nodes.size(); }
+
+    std::vector<Matrix<float> > forward(std::vector<Matrix<float> > input) {
+        assert(input_nodes.size() == input.size());
+        for(size_t i = 0; i < input.size(); i++) {
+            input_nodes[i]->set_input(input[i]);
+        }
+        std::vector<Matrix<float> > output(output_nodes.size());
+        for(size_t i = 0; i < output_nodes.size(); i++) {
+            output[i] = output_nodes[i]->get_output();
+        }
+        return output;
+    }
+    Matrix<float> forward(const Matrix<float>& input) {
+        assert(input_nodes.size() == 1);
+        assert(output_nodes.size() == 1);
+        input_nodes[0]->set_input(input);
+        return output_nodes[0]->get_output();
+    }
+
+    static Model load(const char* json_filename, const char* storage_filename) {
+        Model model;
+        json::Value config = json::parse_file(json_filename);
+        FILE* storage = fopen(storage_filename, "rb");
+        if(!storage) { std::cerr << "ERROR: cannot load file \"" << storage_filename << "\"\n"; exit(1); }
+        for(int i = 0; i < config["config"]["layers"].length(); i++) {
+            json::Value layer = config["config"]["layers"][i];
+            std::string name = layer["name"].to_string();
+            std::string class_name = layer["class_name"].to_string();
+            // one Node subclass per supported Keras layer class; Dropout is a pass-through at inference time
+            if(class_name == "Dense") model.nodes[name] = new Dense(layer, storage);
+            else if(class_name == "Embedding") model.nodes[name] = new Embedding(layer, storage);
+            else if(class_name == "Merge") model.nodes[name] = new Merge(layer);
+            else if(class_name == "Dropout") model.nodes[name] = new Identity(layer);
+            else if(class_name == "InputLayer") model.nodes[name] = new Input(layer);
+            else {
+                std::cerr << "ERROR: unsupported layer class \"" << class_name << "\"\n";
+                exit(1);
+            }
+        }
+        for(std::map<std::string, Node*>::iterator i = model.nodes.begin(); i != model.nodes.end(); i++) {
+            i->second->setup(model.nodes);
+        }
+
+        for(int i = 0; i < config["config"]["input_layers"].length(); i++) {
+            std::string name = config["config"]["input_layers"][i][0].to_string();
+            if(model.nodes.find(name) == model.nodes.end()) { std::cerr << "ERROR: unknown input layer \"" << name << "\"\n"; exit(1); }
+            model.input_nodes.push_back(model.nodes[name]);
+        }
+        for(int i = 0; i < config["config"]["output_layers"].length(); i++) {
+            std::string name = config["config"]["output_layers"][i][0].to_string();
+            if(model.nodes.find(name) == model.nodes.end()) { std::cerr << "ERROR: unknown output layer \"" << name << "\"\n"; exit(1); }
+            model.output_nodes.push_back(model.nodes[name]);
+        }
+        fclose(storage);
+        return model;
+    }
+
+};
+
diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c
index 351305c873ab062f8990aeb0c6a25f04d785655e..f0c86e5e8eedc7fb7f208e497c7fe199cb8cd800 100644
--- a/maca_trans_parser/src/maca_trans_lemmatizer.c
+++ b/maca_trans_parser/src/maca_trans_lemmatizer.c
@@ -1,3 +1,4 @@
+
 #include<stdio.h>
 #include<stdlib.h>
 #include<string.h>
@@ -46,6 +47,7 @@ char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, in
 {
   char form[1000];
   char pos[1000];
+
   char lemma[1000];
   char morpho[1000];
   int num = 0;
@@ -79,7 +81,7 @@ char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, in
 
     if(num >= *lemma_array_size){
       *lemma_array_size = 2 * (*lemma_array_size) + 1;
-      lemma_array = realloc(lemma_array, (*lemma_array_size) * sizeof(char *));
+      lemma_array = (char **)realloc(lemma_array, (*lemma_array_size) * sizeof(char *));
       // initialize in order to be able to free correctly and the end
       for(int i=num; i<*lemma_array_size; ++i) {
         lemma_array[i] = NULL;
diff --git a/maca_trans_parser/src/maca_trans_morpho.c b/maca_trans_parser/src/maca_trans_morpho.c
new file mode 100644
index 0000000000000000000000000000000000000000..be9db3a56d2b0d91437eaeff3317c2e65a9d6cd3
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_morpho.c
@@ -0,0 +1,177 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"config2feat_vec.h"
+
+void decode_morpho_help_message(context *ctx);
+void decode_morpho_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_beam_help_message(ctx);
+  context_conll_help_message(ctx);
+  fprintf(stderr, "INPUT\n");
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_model_help_message(ctx);
+  context_vocabs_help_message(ctx);
+  context_features_model_help_message(ctx);
+  context_f2p_filename_help_message(ctx);
+}
+
+void decode_morpho_check_options(context *ctx){
+  if(ctx->help
+     /*!ctx->conll_filename*/
+     /* || !ctx->perc_model_filename
+     || !ctx->mcd_filename
+     || !ctx->vocabs_filename
+     || !ctx->features_model_filename*/
+     ){
+    decode_morpho_help_message(ctx);
+    exit(1);
+  }
+}
+
+void decode_morpho_set_linguistic_resources_filenames(context *ctx)
+{
+  char absolute_filename[500];
+
+  if(!ctx->perc_model_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", /* bounded: a long data path cannot overflow the buffer */
+             ctx->maca_data_path, DEFAULT_MODEL_MORPHO_FILENAME);
+    ctx->perc_model_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->vocabs_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s",
+             ctx->maca_data_path, DEFAULT_VOCABS_MORPHO_FILENAME);
+    ctx->vocabs_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->features_model_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s",
+             ctx->maca_data_path, DEFAULT_FEATURES_MODEL_MORPHO_FILENAME);
+    ctx->features_model_filename = strdup(absolute_filename);
+  }
+
+  if(ctx->verbose){
+    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
+    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
+    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
+    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
+  }
+}
+void print_word(word *w, mcd *mcd_struct, dico *dico_morph, int postag)
+{
+  char *buffer = NULL;
+  char *token = NULL;
+  int col_nb = 0;
+  if(mcd_get_feats_col(mcd_struct) == -1){ /* no FEATS column declared: append the predicted value to the raw input line */
+    printf("%s\t%s\n", w->input, dico_int2string(dico_morph, postag));
+  }
+  else{
+    buffer = strdup(w->input);
+    token = strtok(buffer, "\t");
+    col_nb = 0;
+    while(token){
+      if(col_nb != 0) printf("\t");
+      if(col_nb == mcd_get_feats_col(mcd_struct))
+        printf("%s", dico_int2string(dico_morph, postag));
+      else
+        word_print_col_n(stdout, w, col_nb);
+      col_nb++;
+      token = strtok(NULL, "\t");
+    }
+    if(col_nb <= mcd_get_feats_col(mcd_struct))
+      printf("\t%s", dico_int2string(dico_morph, postag));
+    printf("\n");
+    free(buffer);
+  }
+}
+
+int movement_morpho(config *c, int feats)
+{
+  word_set_feats(word_buffer_b0(c->bf), feats);
+  word_buffer_move_right(c->bf);
+
+  return 1;
+}
+
+void simple_decoder_morpho(context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
+  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  int feats;
+  float max;
+  word *b0;
+  dico *dico_feats = dico_vec_get_dico(ctx->vocabs, (char *)"FEATS");
+
+  c = config_new(f, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    b0 = word_buffer_b0(c->bf);
+    feats = word_get_feats(b0);
+
+    if(ctx->debug_mode){
+      fprintf(stderr, "***********************************\n");
+      config_print(stderr, c);
+    }
+
+    /* if feats is not specified in input it is predicted */
+    if(feats == -1){
+      /* config_print(stdout, c); */
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+
+      /* feat_vec_print(stdout, fv); */
+      feats = feature_table_argmax(fv, ft, &max);
+      /* printf("feats = %d\n", feats); */
+
+      if(ctx->debug_mode){
+        vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+        for(int i=0; i < 3; i++){
+          fprintf(stderr, "%d\t", i);
+          fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_feats, vcode_array[i].class_code), vcode_array[i].score);
+        }
+        free(vcode_array);
+      }
+    }
+
+    print_word(b0, ctx->mcd_struct, dico_feats, feats);
+
+    movement_morpho(c, feats);
+
+  }
+  /* config_print(stdout, c); */
+  feat_vec_free(fv);
+  feature_table_free(ft);
+  config_free(c);
+  if (ctx->input_filename) fclose(f);
+}
+
+
+int main(int argc, char *argv[])
+{
+  context *ctx = context_read_options(argc, argv);
+  decode_morpho_check_options(ctx);
+
+  decode_morpho_set_linguistic_resources_filenames(ctx);
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+
+  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  if(ctx->beam_width == 1)
+    simple_decoder_morpho(ctx);
+
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c b/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c
new file mode 100644
index 0000000000000000000000000000000000000000..a821863280fca6295a0a85b139ea11d532c547da
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_morpho_mcf2cff.c
@@ -0,0 +1,132 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"feat_fct.h"
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"config2feat_vec.h"
+
+
+int oracle_morpho(config *c)
+{
+  return word_get_feats(word_buffer_b0(config_get_buffer(c)));
+}
+
+
+int movement_morpho(config *c, int feats)
+{
+  word_set_feats(word_buffer_b0(c->bf), feats);
+  word_buffer_move_right(c->bf);
+
+  return 1;
+}
+
+void maca_trans_morpho_mcf2cff_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+  context_mcd_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_vocabs_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_vocabs_help_message(ctx);
+
+
+}
+
+void maca_trans_morpho_mcf2cff_check_options(context *ctx)
+{
+  if(!ctx->input_filename
+     || ctx->help
+     /* || !ctx->mcd_filename */
+     || !(ctx->cff_filename || ctx->fann_filename)
+     ){
+    maca_trans_morpho_mcf2cff_help_message(ctx);
+    exit(1);
+  }
+}
+
+void morpho_generate_training_file(FILE *output_file, context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *conll_file = myfopen(ctx->input_filename, "r");
+  int feats;
+  /* dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); */
+
+  c = config_new(conll_file, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
+    feats = oracle_morpho(c);
+
+    fprintf(output_file, "%d", feats);
+    feat_vec_print(output_file, fv);
+    movement_morpho(c, feats);
+  }
+  feat_vec_free(fv); /* release what this function allocated/opened, as the decoder does */
+  config_free(c);
+  fclose(conll_file);
+}
+
+int main(int argc, char *argv[])
+{
+  context *ctx;
+  FILE *output_file;
+
+  ctx = context_read_options(argc, argv);
+  maca_trans_morpho_mcf2cff_check_options(ctx);
+
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+
+
+  if(ctx->mode == TRAIN_MODE){
+    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
+    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
+  }
+  else if(ctx->mode == TEST_MODE){
+    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+  }
+
+  /* in train mode create feature dictionnary for perceptron */
+  if(ctx->mode == TRAIN_MODE)
+    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
+
+  /* in test mode read feature dictionnary for perceptron */
+  if(ctx->mode == TEST_MODE)
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  /* add the feature dictionnary to the dico vector */
+  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
+
+  /* open output file */
+  if(ctx->cff_filename)
+    output_file = myfopen(ctx->cff_filename, "w");
+  else
+    output_file = stdout;
+
+  morpho_generate_training_file(output_file, ctx);
+
+  if(ctx->mode == TRAIN_MODE){
+    /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
+    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
+
+  }
+
+  if(ctx->cff_filename)
+    fclose(output_file);
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/maca_trans_parser_nn.cc b/maca_trans_parser/src/maca_trans_parser_nn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61e9786086e049dfc48d6bb8c4270f02a7a82a26
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_parser_nn.cc
@@ -0,0 +1,300 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"movement_parser.h"
+#include"oracle_parser_arc_eager.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+#include"dico.h"
+#include "keras.h"
+#include"movement_parser_arc_eager.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+
+
+
+
+void maca_trans_parser_nn_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_debug_help_message(ctx);
+  fprintf(stderr, "INPUT\n");
+  context_input_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_vocabs_help_message(ctx);
+  context_features_model_help_message(ctx);
+  context_root_label_help_message(ctx);
+  context_json_help_message(ctx);
+  context_dnn_model_help_message(ctx);
+}
+
+void maca_trans_parser_nn_check_options(context *ctx){
+  if(ctx->help
+     /*!ctx->conll_filename*/
+     /* || !ctx->perc_model_filename
+     || !ctx->mcd_filename
+     || !ctx->vocabs_filename
+     || !ctx->features_model_filename*/
+     ){
+    maca_trans_parser_nn_help_message(ctx);
+    exit(1);
+  }
+}
+
+void set_linguistic_resources_filenames_parser(context *ctx)
+{
+  char absolute_filename[500];
+
+  if(!ctx->dnn_model_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", /* bounded: a long data path cannot overflow the buffer */
+             ctx->maca_data_path, DEFAULT_MODEL_PARSER_NN_FILENAME);
+    ctx->dnn_model_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->json_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s",
+             ctx->maca_data_path, DEFAULT_JSON_PARSER_NN_FILENAME);
+    ctx->json_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->vocabs_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s",
+             ctx->maca_data_path, DEFAULT_VOCABS_PARSER_NN_FILENAME);
+    ctx->vocabs_filename = strdup(absolute_filename);
+  }
+
+  if(!ctx->features_model_filename){
+    snprintf(absolute_filename, sizeof(absolute_filename), "%s%s",
+             ctx->maca_data_path, DEFAULT_FEATURES_MODEL_PARSER_NN_FILENAME);
+    ctx->features_model_filename = strdup(absolute_filename);
+  }
+
+  if(ctx->verbose){
+    fprintf(stderr, "dnn_model = %s\n", ctx->dnn_model_filename);
+    fprintf(stderr, "json = %s\n", ctx->json_filename);
+    fprintf(stderr, "vocabs %s\n", ctx->vocabs_filename);
+    fprintf(stderr, "mcd = %s\n", ctx->mcd_filename);
+    fprintf(stderr, "features_model = %s\n", ctx->features_model_filename);
+  }
+}
+
+
+
+void print_word_buffer(config *c, dico *dico_labels, mcd *mcd_struct)
+{
+  int i;
+  word *w;
+  char *label;
+  char *buffer = NULL;
+  char *token = NULL;
+  int col_nb = 0;
+
+
+  for(i=0; i < config_get_buffer(c)->nbelem; i++){
+    w = word_buffer_get_word_n(config_get_buffer(c), i);
+
+    if((mcd_get_gov_col(mcd_struct) == -1)
+       && (mcd_get_label_col(mcd_struct) == -1)
+       && (mcd_get_sent_seg_col(mcd_struct) == -1)){
+      printf("%s\t", word_get_input(w));
+      printf("%d\t", word_get_gov(w));
+      label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
+      if(label != NULL)
+        printf("%s\t", label) ;
+      else
+        printf("_\t");
+      if(word_get_sent_seg(w) == 1)
+        printf("1\n") ;
+      else
+        printf("0\n");
+    }
+    else{
+      buffer = strdup(w->input);
+      token = strtok(buffer, "\t");
+      col_nb = 0;
+      while(token){
+        if(col_nb != 0) printf("\t");
+        if(col_nb == mcd_get_gov_col(mcd_struct)){
+          printf("%d", word_get_gov(w));
+        }
+        else
+          if(col_nb == mcd_get_label_col(mcd_struct)){
+            label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
+            if(label != NULL)
+              printf("%s", label) ;
+            else
+              printf("_");
+          }
+          else
+            if(col_nb == mcd_get_sent_seg_col(mcd_struct)){
+              if(word_get_sent_seg(w) == 1)
+                printf("1") ;
+              else
+                printf("0");
+            }
+            else{
+              word_print_col_n(stdout, w, col_nb);
+            }
+        col_nb++;
+        token = strtok(NULL, "\t");
+      }
+      if((col_nb <= mcd_get_gov_col(mcd_struct)) || (mcd_get_gov_col(mcd_struct) == -1)){
+        printf("\t%d", word_get_gov(w));
+      }
+      if((col_nb <= mcd_get_label_col(mcd_struct)) || (mcd_get_label_col(mcd_struct) == -1)){
+        label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
+        if(label != NULL)
+          printf("\t%s", label) ;
+        else
+          printf("\t_");
+      }
+      if((col_nb <= mcd_get_sent_seg_col(mcd_struct)) || (mcd_get_sent_seg_col(mcd_struct) == -1)){
+        if(word_get_sent_seg(w) == 1)
+          printf("\t1") ;
+        else
+          printf("\t0");
+      }
+      printf("\n");
+      free(buffer);
+    }
+  }
+}
+
+std::vector<Matrix<float> > config2keras_vec(feat_model *fm, config *c)
+{
+  int i;
+  feat_desc *fd;
+  int feat_value;
+  std::vector<Matrix<float> > keras_vec(fm->nbelem, Matrix<float>(1, 1));
+  for(i=0; i < fm->nbelem; i++){
+    /* fm must be exclusively composed of simple features */
+    /* if this is not the case, the first feature of a complex feature is take into account */
+    fd = fm->array[i];
+    feat_value = fd->array[0]->fct(c);
+    keras_vec[i][0][0] = feat_value + 1;
+    // printf("feature %d = %d\n", i, (int)keras_vec[i][0][0]);
+  }
+  return keras_vec;
+}
+
+void simple_decoder_parser_arc_eager_nn(context *ctx, Model &model)
+{
+  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
+  // feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  int root_label;
+  int mvt_code;
+  int mvt_type;
+  int mvt_label;
+  config *c = NULL;
+  int result;
+  std::vector<Matrix<float> > keras_vec;
+  int sentence_nb = 0;
+  root_label = dico_string2int(ctx->dico_labels, ctx->root_label);
+  if(root_label == -1) root_label = 0;
+
+  c = config_new(f, ctx->mcd_struct, 5);
+  while(!config_is_terminal(c)){
+
+    if(ctx->debug_mode){
+      fprintf(stdout, "***********************************\n");
+      config_print(stdout, c);
+    }
+    /* forced EOS (the element on the top of the stack is eos, but the preceding movement is not MVT_PARSER_EOS */
+    /* which means that the top of the stack got its eos status from input */
+    /* force the parser to finish parsing the sentence (perform all pending reduce actions) and determine root of the sentence */
+
+    if((word_get_sent_seg(stack_top(config_get_stack(c))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(c))) != MVT_PARSER_EOS)){
+      word_set_sent_seg(stack_top(config_get_stack(c)), -1);
+      movement_parser_eos(c);
+      while(movement_parser_reduce(c));
+      while(movement_parser_root(c, root_label));
+      if(ctx->debug_mode) printf("force EOS\n");
+    }
+
+    /* normal behaviour, ask classifier what is the next movement to do and do it */
+    else{
+      keras_vec = config2keras_vec(ctx->features_model, c);
+      std::vector<Matrix<float> > y = model.forward(keras_vec);
+      Matrix<float> argmax = y[0].argmax();
+      mvt_code = argmax.at(0, 0);
+
+      // fprintf(stderr,"mvt code = %d\n", mvt_code);
+      mvt_type = movement_parser_type(mvt_code);
+      mvt_label = movement_parser_label(mvt_code);
+
+      result = 0;
+      switch(mvt_type){
+      case MVT_PARSER_LEFT :
+        result = movement_parser_left_arc(c, mvt_label);
+        break;
+      case MVT_PARSER_RIGHT:
+        result = movement_parser_right_arc(c, mvt_label);
+        break;
+      case MVT_PARSER_REDUCE:
+        result = movement_parser_reduce(c);
+        break;
+      case MVT_PARSER_ROOT:
+        result = movement_parser_root(c, root_label);
+        break;
+      case MVT_PARSER_EOS:
+        result = movement_parser_eos(c);
+        sentence_nb++;
+        if((sentence_nb % 2) == 0)
+          fprintf(stderr, "\rsentence %d", sentence_nb);
+        break;
+      case MVT_PARSER_SHIFT:
+        result = movement_parser_shift(c);
+      }
+
+      if(result == 0){
+        if(ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n");
+        result = movement_parser_shift(c);
+        if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! */
+          if(ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n");
+          while(!stack_is_empty(config_get_stack(c)))
+            movement_parser_root(c, root_label);
+        }
+      }
+    }
+
+  }
+  fprintf(stderr, "\n");
+  print_word_buffer(c, ctx->dico_labels, ctx->mcd_struct);
+
+  config_free(c);
+  if(ctx->input_filename)
+    fclose(f);
+}
+
+
+
+int main(int argc, char *argv[])
+{
+  context *ctx;
+
+  ctx = context_read_options(argc, argv);
+  maca_trans_parser_nn_check_options(ctx);
+
+  set_linguistic_resources_filenames_parser(ctx);
+  Model model = Model::load(ctx->json_filename, ctx->dnn_model_filename);
+
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+
+  ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
+
+  if(ctx->dico_labels == NULL){
+    fprintf(stderr, "cannot find label names\n");
+    return 1;
+  }
+  simple_decoder_parser_arc_eager_nn(ctx, model);
+
+  // context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/matrix.h b/maca_trans_parser/src/matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..67c839344bcbf2df29cb89524a58ac714ae4f1c1
--- /dev/null
+++ b/maca_trans_parser/src/matrix.h
@@ -0,0 +1,385 @@
+#pragma once
+
+//#define USE_CBLAS
+#include <cstdio>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#ifdef USE_CBLAS
+extern "C" {
+#include <cblas.h>
+}
+#endif
+
+#define error(...) fprintf(stderr, __VA_ARGS__);
+
+typedef enum {
+    BOTH, ROWS, COLS
+} Axis;
+
+template <class T>
+class MatrixRow {
+    private:
+        T* values;
+        int cols;
+    public:
+        MatrixRow(T* _values, int _cols) : values(_values), cols(_cols) { }
+        T& operator[](int y) {
+            return *(values + y);
+        }
+        MatrixRow& operator=(const MatrixRow& other) { // copies the elements, not the view
+            assert(cols == other.cols);
+            memcpy(values, other.values, cols * sizeof(T));
+            return *this;
+        }
+        MatrixRow slice(int offset, int length) const { // second argument is a length, not an end index
+            assert(offset + length <= cols);
+            return MatrixRow(values + offset, length);
+        }
+
+};
+
+template <class T>
+class Matrix {
+    public:
+        int rows;
+        int cols;
+    private:
+        T* values;
+    public:
+        Matrix() : rows(0), cols(0), values(NULL) { }
+        Matrix(int _rows, int _cols) : rows(_rows), cols(_cols) {
+            if(rows * cols > 0) {
+                values = new T[rows * cols];
+            } else {
+                values = NULL;
+            }
+        }
+        ~Matrix() {
+            if(values != NULL) delete[] values;
+        }
+        Matrix(const Matrix<T>& other) : rows(other.rows), cols(other.cols) {
+            if(rows * cols > 0) {
+                values = new T[rows * cols];
+                memcpy(values, other.values, sizeof(T) * rows * cols);
+            } else {
+                values = NULL;
+            }
+        }
+
+        void info(const char* name) const {
+            printf("%s: %dx%d\n", name, rows, cols);
+        }
+
+        void print(const char* name = NULL, const char* format = "%9f") const {
+            if(name != NULL) info(name);
+            for(int i = 0; i < rows; i++) {
+                for(int j = 0; j < cols; j++) {
+                    printf(format, at(i, j));
+                    printf(" ");
+                }
+                printf("\n");
+            }
+        }
+
+        static Matrix zeros(int rows, int cols) {
+            Matrix result(rows, cols);
+            if(result.values != NULL) memset(result.values, 0, sizeof(T) * rows * cols);
+            return result;
+        }
+
+        static Matrix ones(int rows, int cols) {
+            Matrix result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = 1;
+            return result;
+        }
+
+        static Matrix rand(int rows, int cols) {
+            Matrix result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = (2.0f * std::rand()) / RAND_MAX - 1.0f;
+            return result;
+        }
+
+        static Matrix zeros_like(const Matrix<T>& other) {
+            return zeros(other.rows, other.cols);
+        }
+
+        static Matrix ones_like(const Matrix<T>& other) {
+            return ones(other.rows, other.cols);
+        }
+
+        Matrix<T> broadcast(const Matrix<T>& other) const {
+            if(rows == other.rows && cols == other.cols) return *this;
+            assert(other.cols % cols == 0 && other.rows % rows == 0);
+            Matrix result(other.rows, other.cols);
+            for(int i = 0; i < other.rows; i++) {
+                for(int j = 0; j < other.cols; j++) {
+                    result.at(i, j) = at(i % rows, j % cols);
+                }
+            }
+            return result;
+        }
+
+        Matrix<T> slice(int start_row, int end_row) const {
+            return slice(start_row, end_row, 0, cols);
+        }
+        Matrix<T> slice(int start_row, int end_row, int start_col, int end_col) const {
+            Matrix<T> result(end_row - start_row, end_col - start_col);
+            for(int i = 0; i < end_row - start_row; i++) {
+                result[i] = (*this)[i + start_row].slice(start_col, end_col - start_col); // MatrixRow::slice takes (offset, length)
+            }
+            return result;
+        }
+
+        /*Matrix<T>& operator=(const T& value) {
+            for(int i = 0; i < rows * cols; i++) values[i] = value;
+        }*/
+
+        const Matrix<T>& operator=(const Matrix<T>& other) {
+            if(cols != other.cols || rows != other.rows) {
+                delete[] values;
+                cols = other.cols;
+                rows = other.rows;
+                values = new T[cols * rows];
+            }
+            memcpy(values, other.values, sizeof(T) * rows * cols);
+            return *this;
+        }
+
+        MatrixRow<T> operator[](int x) {
+            return MatrixRow<T>(values + x * cols, cols);
+        }
+        const MatrixRow<T> operator[](int x) const {
+            return MatrixRow<T>(values + x * cols, cols);
+        }
+        T& at(int x, int y) {
+            return values[x * cols + y];
+        }
+        const T& at(int x, int y) const {
+            return values[x * cols + y];
+        }
+        // matrix-scalar operations
+        Matrix<T> operator-(T a) const {
+            // `a` is a scalar: there is no shape to check
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = values[i] - a;
+            return result;
+        }
+        Matrix<T> operator+(T a) const {
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = a + values[i];
+            return result;
+        }
+        Matrix<T> operator*(T a) const {
+            // `a` is a scalar: there is no shape to check
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = a * values[i];
+            return result;
+        }
+        Matrix<T> operator/(T a) const {
+            // `a` is a scalar: there is no shape to check
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = values[i] / a;
+            return result;
+        }
+        // matrix-matrix operations
+        Matrix<T> operator-(const Matrix<T>& other) const {
+            Matrix<T> a = other.broadcast(*this);
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = values[i] - a.values[i];
+            return result;
+        }
+        Matrix<T> operator+(const Matrix<T>& other) const {
+            Matrix<T> a = other.broadcast(*this);
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = a.values[i] + values[i];
+            return result;
+        }
+        Matrix<T> operator*(const Matrix<T>& other) const {
+            Matrix<T> a = other.broadcast(*this);
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = a.values[i] * values[i];
+            return result;
+        }
+        Matrix<T> operator/(const Matrix<T>& other) const {
+            Matrix<T> a = other.broadcast(*this);
+            Matrix<T> result(rows, cols);
+            for(int i = 0; i < rows * cols; i++) result.values[i] = values[i] / a.values[i];
+            return result;
+        }
+
+        Matrix<T> dot(const Matrix<T>& a) const {
+            assert(a.rows == cols);
+            Matrix<T> result = zeros(rows, a.cols);
+#ifdef USE_CBLAS
+            cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, rows, a.cols, cols, 1, values, cols, a.values, a.cols, 1, result.values, result.cols);
+#else
+            for(int i = 0; i < rows; i++)
+                for(int j = 0; j < a.cols; j++)
+                    for(int k = 0; k < cols; k++)
+                        result.at(i, j) += at(i, k) * a.at(k, j);
+                        //result.values[i * result.cols + j] += values[i * cols + k] * a.values[k * a.cols + j];
+#endif
+            return result;
+        }
+        Matrix<T> transpose() const {
+            Matrix<T> result(cols, rows);
+            for(int i = 0; i < cols; i++)
+                for(int j = 0; j < rows; j++)
+                    result.values[i * result.cols + j] = values[j * cols + i]; // row-major stride of the result is result.cols
+            return result;
+        }
+
+        Matrix<T> sum(const Axis axis=BOTH) const {
+            if(axis == BOTH) {
+                Matrix<T> result = zeros(1, 1);
+                for(int i = 0; i < rows * cols; i++) result[0][0] += values[i];
+                return result;
+            } else if(axis == ROWS) {
+                Matrix<T> result = zeros(1, cols);
+                for(int i = 0; i < rows; i++)
+                    for(int j = 0; j < cols; j++)
+                        result.values[j] += values[i * cols + j];
+                return result;
+            } else if(axis == COLS) {
+                Matrix<T> result = zeros(rows, 1);
+                for(int i = 0; i < rows; i++)
+                    for(int j = 0; j < cols; j++)
+                        result.values[i] += values[i * cols + j];
+                return result;
+            }
+            std::cerr << "ERROR: unsupported axis for Matrix::sum\n";
+            exit(1);
+            return zeros(0, 0);
+        }
+        Matrix<T> max(const Axis axis=BOTH) const {
+            if(axis == BOTH) {
+                Matrix<T> result = zeros(1, 1);
+                for(int i = 0; i < rows * cols; i++) if(i == 0 || result.values[0] < values[i]) result.values[0] = values[i];
+                return result;
+            } else if(axis == ROWS) {
+                Matrix<T> result = zeros(1, cols);
+                for(int i = 0; i < rows; i++)
+                    for(int j = 0; j < cols; j++)
+                        if(i == 0 || result.values[j] < values[i * cols + j]) result.values[j] = values[i * cols + j]; // first row seeds each column
+                return result;
+            } else if(axis == COLS) {
+                Matrix<T> result = zeros(rows, 1);
+                for(int i = 0; i < rows; i++)
+                    for(int j = 0; j < cols; j++)
+                        if(j == 0 || result.values[i] < values[i * cols + j]) result.values[i] = values[i * cols + j]; // first column seeds each row
+                return result;
+            }
+            std::cerr << "ERROR: unsupported axis for Matrix::max\n";
+            exit(1);
+            return zeros(0, 0);
+        }
+        Matrix<T> min(const Axis axis=BOTH) const {
+            if(axis == BOTH) {
+                Matrix<T> result = zeros(1, 1);
+                for(int i = 0; i < rows * cols; i++) if(i == 0 || result.values[0] > values[i]) result.values[0] = values[i];
+                return result;
+            } else if(axis == ROWS) {
+                Matrix<T> result = zeros(1, cols);
+                for(int i = 0; i < rows; i++)
+                    for(int j = 0; j < cols; j++)
+                        if(i == 0 || result.values[j] > values[i * cols + j]) result.values[j] = values[i * cols + j]; // first row seeds each column
+                return result;
+            } else if(axis == COLS) {
+                Matrix<T> result = zeros(rows, 1);
+                for(int i = 0; i < rows; i++)
+                    for(int j = 0; j < cols; j++)
+                        if(j == 0 || result.values[i] > values[i * cols + j]) result.values[i] = values[i * cols + j]; // first column seeds each row
+                return result;
+            }
+            std::cerr << "ERROR: unsupported axis for Matrix::min\n";
+            exit(1);
+            return zeros(0, 0);
+        }
+        Matrix<T> argmax(const Axis axis=COLS) const {
+            if(axis == ROWS) {
+                Matrix<T> result = zeros(1, cols);
+                Matrix<T> max = zeros(1, cols);
+                for(int i = 0; i < rows; i++) {
+                    for(int j = 0; j < cols; j++) {
+                        if(i == 0 || max.at(0, j) < at(i, j)) {
+                            max.at(0, j) = at(i, j);
+                            result.at(0, j) = i;
+                        }
+                    }
+                }
+                return result;
+            } else if(axis == COLS) {
+                Matrix<T> result = zeros(rows, 1);
+                for(int i = 0; i < rows; i++) {
+                    double max = 0;
+                    for(int j = 0; j < cols; j++) {
+                        if(j == 0 || max < at(i, j)) {
+                            max = at(i, j);
+                            result.at(i, 0) = j;
+                        }
+                    }
+                }
+                return result;
+            }
+            std::cerr << "ERROR: unsupported axis for Matrix::argmax\n";
+            return zeros(0, 0);
+        }
+
+        void load(FILE* fp) {
+            if(fread(&rows, sizeof(int), 1, fp) != 1) error("loading number of rows from fp\n");
+            if(fread(&cols, sizeof(int), 1, fp) != 1) error("loading number of cols from fp\n");
+            delete[] values; values = new T[rows * cols]; // release any previous buffer before replacing it
+            if(fread(values, sizeof(T), rows * cols, fp) != (size_t) (rows * cols)) {
+                error("loading %dx%d matrix from fp\n", rows, cols);
+            }
+        }
+
+        void save(FILE* fp) {
+            fwrite(&rows, sizeof(int), 1, fp);
+            fwrite(&cols, sizeof(int), 1, fp);
+            if(fwrite(values, sizeof(T), rows * cols, fp) != (size_t) (rows * cols)) {
+                error("saving %dx%d matrix to fp\n", rows, cols);
+            }
+        }
+
+        Matrix<T> apply(T (*function)(T)) const {
+            Matrix<T> output(rows, cols);
for(int i = 0; i < rows * cols; i++) output.values[i] = function(values[i]); + return output; + } + + class Function { + public: + static T sigmoid(const T a) { return std::tanh(a * 0.5) * 0.5 + 0.5; } + static T hard_sigmoid(const T a) { T tmp = a * 0.2 + 0.5; return tmp < 0 ? 0 : tmp > 1 ? 1 : tmp; } + static T identity(const T a) { return a; } + static T tanh(const T a) { return std::tanh(a); } + static T exp(const T a) { return std::exp(a); } + static T log(const T a) { return std::log(a); } + static T relu(const T a) { return a > 0 ? a : 0; } + }; + + static Matrix<T> sigmoid(const Matrix<T>& x) { return x.apply(Function::sigmoid); } + static Matrix<T> hard_sigmoid(const Matrix<T>& x) { return x.apply(Function::hard_sigmoid); } + static Matrix<T> identity(const Matrix<T>& x) { return x.apply(Function::identity); } + static Matrix<T> tanh(const Matrix<T>& x) { return x.apply(Function::tanh); } + static Matrix<T> exp(const Matrix<T>& x) { return x.apply(Function::exp); } + static Matrix<T> log(const Matrix<T>& x) { return x.apply(Function::log); } + static Matrix<T> relu(const Matrix<T>& x) { return x.apply(Function::relu); } + static Matrix<T> softmax(const Matrix<T>& x) { + Matrix<T> r = Matrix::exp(x - x.max(COLS)); + return r / r.sum(COLS); + } + /*Matrix<T> operator=(Matrix<float> x) { + Matrix<T> result(x.rows, x.cols); + for(int i = 0; i < x.rows; i++) + for(int j = 0; j < y.cols; j++) + result.at(i, j) = (T) x.at(i, j); + return result; + }*/ +}; + diff --git a/maca_trans_parser/src/mvt.c b/maca_trans_parser/src/mvt.c index fd5c549fc2561777112c2d0714a9406d5847b119..dcedc2e895509bfd27c903a178d16e5574f66e51 100644 --- a/maca_trans_parser/src/mvt.c +++ b/maca_trans_parser/src/mvt.c @@ -5,7 +5,7 @@ mvt *mvt_new(int type, word *gov, word *dep) { - mvt *m = memalloc(sizeof(mvt)); + mvt *m = (mvt *)memalloc(sizeof(mvt)); m->type = type; m->gov = gov; m->dep = dep; diff --git a/maca_trans_parser/src/test_mvt_stack.c b/maca_trans_parser/src/test_mvt_stack.c index 
711a3a49ffba93a1461c18af5ee6acf36ac63dee..110fbd914e476c0937813f0c42181ff7772951dd 100644 --- a/maca_trans_parser/src/test_mvt_stack.c +++ b/maca_trans_parser/src/test_mvt_stack.c @@ -11,9 +11,9 @@ int main(int argc, char *argv[]) int i; for(i=0; i < 10; i++){ - w1 = word_new("1"); + w1 = word_new((char *)"1"); word_set_index(w1, i + 1); - w2 = word_new("1"); + w2 = word_new((char *)"1"); word_set_index(w2, i + 2); mvt_stack_push(ms, mvt_new(i, w1, w2)); } diff --git a/perceptron/lib/src/cf_file.c b/perceptron/lib/src/cf_file.c index 9afcf344fe2a8934f5dd0b71f52d968552a1c0ac..0114a3dce03c55e2329a0ad6f75f1a77a16bf424 100644 --- a/perceptron/lib/src/cf_file.c +++ b/perceptron/lib/src/cf_file.c @@ -10,7 +10,7 @@ int *cff_max_value_per_column(char *cff_filename, int n) char *token; int i; int col; - int *max_array = memalloc(n * sizeof(int)); + int *max_array = (int *)memalloc(n * sizeof(int)); for(i = 0; i < n; i++){ max_array[i] = 0; }