diff --git a/CMakeLists.txt b/CMakeLists.txt index 389bdf0e217738811da3e08daf8eaac4d5703f23..e1556292f72e1a6de8cbcbfe0393909ebe27bdaa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ add_subdirectory(maca_common) add_subdirectory(maca_tools) add_subdirectory(perceptron) #add_subdirectory(maca_lemmatizer) +add_subdirectory(maca_morpho) add_subdirectory(maca_tokenizer) add_subdirectory(maca_lexer) add_subdirectory(maca_trans_parser) diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index 881b98e80fff65bd5031a234cd16ebea0f9f14fc..392faf9d0f3c49bb9448754ed656d34f22ff05b8 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -8,7 +8,7 @@ #define MCD_INVALID_VALUE -1 -#define MCD_WF_NB 36 +#define MCD_WF_NB 47 #define MCD_WF_ID 0 #define MCD_WF_FORM 1 @@ -47,6 +47,81 @@ #define MCD_WF_Y 34 #define MCD_WF_Z 35 +#define MCD_WF_Aspect 36 +#define MCD_WF_Case 37 +#define MCD_WF_Clitic 38 +#define MCD_WF_Definite 39 +#define MCD_WF_Gender 40 +#define MCD_WF_Mood 41 +#define MCD_WF_NameType 42 +#define MCD_WF_NounType 43 +#define MCD_WF_Number 44 +#define MCD_WF_Person 45 +#define MCD_WF_Tense 46 + +/*Abbr +AdpType +AdvType +Animacy +Animacy[gram] +ConjType +Connegative +Degree +Derivation +Dialect +Echo +Evident +Foreign +Form +Gender[dat] +Gender[erg] +Gender[psor] +HebBinyan +HebExistential +HebSource +Hyph +InfForm + +Number[abs] +Number[dat] +Number[erg] +Number[psed] +Number[psor] +NumForm +NumType +NumValue +PartForm +PartType +Person[abs] +Person[dat] +Person[erg] +Person[psor] +Polarity +Polite +Polite[abs] +Polite[dat] +Polite[erg] +Position +Poss +Prefix +PrepCase +PrepForm +PronType +PunctSide +PunctType +Reflex +Strength +Style +Subcat +Typo +Variant +VerbForm +VerbType +Voice +Xtra*/ + + + #include "dico.h" #include "word_emb.h" #include "dico_vec.h" @@ -90,6 +165,9 @@ #define mcd_get_y_col(m) (m)->wf2col[MCD_WF_Y] #define mcd_get_z_col(m) (m)->wf2col[MCD_WF_Z] + + + #define mcd_set_form_col(m, v) (m)->wf[MCD_WF_FORM] = (v) diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 30074b7606988cfcefa4400b8f35acd958ea9807..e70677618306262579ae471d8b91b91b93ce1b64 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -16,6 +16,21 @@ typedef struct _word { int is_root; } word; + +#define word_get_s1(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[strlen((w)->form) - 1]) +#define word_get_s2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[strlen((w)->form) - 2]) +#define word_get_s3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[strlen((w)->form) - 3]) +#define word_get_s4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[strlen((w)->form) - 4]) +#define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5]) +#define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6]) + +#define word_get_p1(w) ((((w) == NULL) || ((w)->form == NULL) )? -1 : (w)->form[0]) +#define word_get_p2(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 1))? -1 : (w)->form[1]) +#define word_get_p3(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 2))? -1 : (w)->form[2]) +#define word_get_p4(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 3))? -1 : (w)->form[3]) +#define word_get_p5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 4))? -1 : (w)->form[4]) +#define word_get_p6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[5]) + #define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID]) #define word_get_form(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM]) #define word_get_lemma(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LEMMA]) diff --git a/maca_trans_parser/src/cff2fann.c b/maca_trans_parser/src/cff2fann.c index 86492988b16881cd2bb6d00cb9e61a75e928b11a..4a39ebc56b9f437a45376986758f270ce60321a2 100644 --- a/maca_trans_parser/src/cff2fann.c +++ b/maca_trans_parser/src/cff2fann.c @@ -7,6 +7,7 @@ #include"util.h" #include"cf_file.h" #include"feat_lib.h" +#include"feat_types.h" void cff2fann_help_message(context *ctx) @@ -29,9 +30,14 @@ void cff2fann_help_message(context *ctx) void cff2fann_check_options(context *ctx) { - if(!ctx->input_filename + if(ctx->cff_filename) fprintf(stderr, "cff filename = %s\n", ctx->cff_filename); + if(ctx->mcd_filename) fprintf(stderr, "mcd filename = %s\n", ctx->mcd_filename); + if(ctx->features_model_filename) fprintf(stderr, "fm filename = %s\n", ctx->features_model_filename); + + if(!ctx->cff_filename || ctx->help - /* || !ctx->mcd_filename */ + || !ctx->mcd_filename + || !ctx->features_model_filename /* || !(ctx->cff_filename || ctx->fann_filename) */ ){ cff2fann_help_message(ctx); @@ -46,6 +52,67 @@ void one_hot_print(FILE *f, int val, int dim) fprintf(f, "%d ", (i == val)? 1 : 0); } +void print_header(mcd *m, feat_model *fm) +{ + int i; + feat_desc *fd; + simple_feat_desc *sfd; + + printf("OUT"); + + for(i=0; i <fm->nbelem; i++){ + fd = fm->array[i]; + if(fd->nbelem > 1){ + printf("feature %d is a complex feature, skipping it\n", i); + } + else{ + sfd = fd->array[0]; + printf("\t%s", sfd->name); + } + } + + printf("\n"); + printf("OUT"); + for(i=0; i <fm->nbelem; i++){ + fd = fm->array[i]; + if(fd->nbelem > 1){ + printf("feature %d is a complex feature, skipping it\n", i); + } + else{ + sfd = fd->array[0]; + if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;} + if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;} + if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;} + if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;} + if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;} + if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;} + printf("\tUNK"); + } + } + printf("\n"); + /* + for(i=0; i < m->nb_col; i++){ + if(m->representation[i] == MCD_REPRESENTATION_EMB){ + printf("\tEMB"); + continue; + } + + if(m->representation[i] == MCD_REPRESENTATION_NULL){ + continue; + } + + if(m->representation[i] == MCD_REPRESENTATION_VOCAB){ + printf("\t%s", m->wf_str[i]); + continue; + } + + if(m->representation[i] == MCD_REPRESENTATION_INT){ + printf("\tINT"); + continue; + } + } + printf("\n");*/ +} void cff2fann(context *ctx) { @@ -54,7 +121,7 @@ void cff2fann(context *ctx) int col_nb; int feat_type; mcd *m = ctx->mcd_struct; - FILE *f = myfopen(ctx->input_filename, "r"); + FILE *f = myfopen(ctx->cff_filename, "r"); int val; dico *vocab; char feature_type[64]; @@ -63,44 +130,54 @@ void cff2fann(context *ctx) vocab = dico_vec_get_dico(ctx->vocabs, "d_perceptron_features"); - printf("%d %d\n", 1, ctx->features_model->nbelem); + /* printf("%d %d\n", 1, ctx->features_model->nbelem); */ + + print_header(m, ctx->features_model); while(fgets(buffer, 10000, f)){ - /* printf("%s", buffer); */ - /* printf("\n"); */ + /* printf("%s", buffer); */ + /* printf("\n"); */ token = strtok(buffer, "\t"); col_nb = 0; if (count % 100 == 0) fprintf(stderr, "%d\r", count); while(token){ - /* printf("col = %d token = %s max = %d\n", col_nb, token, max_array[col_nb]); */ + /* printf("col = %d token = %s max = %d\n", col_nb, token, max_array[col_nb]); */ val = atoi(token); if(col_nb == 0){ - one_hot_print(stdout, val, ctx->mvt_nb); - printf("\n"); + /* one_hot_print(stdout, val, ctx->mvt_nb); */ + /* printf("\n"); */ + printf("%d", val); } else { sscanf(dico_int2string(vocab, val), "%[^==]==%d", feature_type, &feature_valindex); + /* printf("feature_type = %s\n", feature_type); */ feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1); - /* printf("feat_type = %d\n", feat_type); */ + /* printf("feat_type = %d\n", feat_type); */ + /* printf("%d: ", col_nb); */ int mcd_col = m->wf2col[feat_type]; /* printf("representation = %d\n", m->representation[mcd_col]); */ if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){ /* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */ - word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); - printf("\n"); + /* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */ + /* printf("\n"); */ + printf("\t%d", feature_valindex); + } else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){ /* printf("it is a vocab\n"); */ - one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem); - printf("\n"); + /* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem); */ + /* printf("\n"); */ + printf("\t%d", feature_valindex); } else { - printf("%d\n", feature_valindex); + printf("\t%d", feature_valindex); } } col_nb++; token = strtok(NULL , "\t"); } + printf("\n"); count++; } + fclose(f); } int main(int argc, char *argv[]) @@ -116,7 +193,7 @@ int main(int argc, char *argv[]) ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); - look_for_number_of_features_and_classes(ctx->input_filename, &nb_feat, &nb_class); + look_for_number_of_features_and_classes(ctx->cff_filename, &nb_feat, &nb_class); ctx->mvt_nb = nb_class; mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, 1); diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c index fdb525e33391bb91f27236bcfe8b80c36ac69bf4..b65b2dd240d1bdff404e0b6c2ae54cb0d95be66b 100644 --- a/maca_trans_parser/src/feat_fct.c +++ b/maca_trans_parser/src/feat_fct.c @@ -154,6 +154,20 @@ int s3Z(void *c) {return word_get_Z(stack_s3(config_get_stack((config *) c)));} /* words in the buffer */ +int b0s1(void *c){return word_get_s1(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s2(void *c){return word_get_s2(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s3(void *c){return word_get_s3(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s4(void *c){return word_get_s4(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s5(void *c){return word_get_s5(word_buffer_b0(config_get_buffer((config *) c)));} +int b0s6(void *c){return word_get_s6(word_buffer_b0(config_get_buffer((config *) c)));} + +int b0p1(void *c){return word_get_p1(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p2(void *c){return word_get_p2(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p3(void *c){return word_get_p3(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p4(void *c){return word_get_p4(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p5(void *c){return word_get_p5(word_buffer_b0(config_get_buffer((config *) c)));} +int b0p6(void *c){return word_get_p6(word_buffer_b0(config_get_buffer((config *) c)));} + int b0g(void *c) {return (word_get_gov(word_buffer_b0(config_get_buffer((config *) c))) == WORD_INVALID_GOV) ? 0 : 1;} int b0sf(void *c) {return word_get_label(word_buffer_b0(config_get_buffer((config *) c)));} @@ -869,7 +883,8 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"s0X", s0X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"s0Y", s0Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"s0Z", s0Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"s0U1", s0U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"s0U1", s0U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s0U1", s0U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s0sgn", s0sgn); feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"s1g", s1g); @@ -1015,9 +1030,25 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"b0X", b0X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b0Y", b0Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b0Z", b0Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b0U1", b0U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b0U1", b0U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0U1", b0U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0sgn", b0sgn); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s1", b0s1); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s2", b0s2); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s3", b0s3); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s4", b0s4); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s5", b0s5); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0s6", b0s6); + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p1", b0p1); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p2", b0p2); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p3", b0p3); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p4", b0p4); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p5", b0p5); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0p6", b0p6); + + feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"bm1f", bm1f); feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"bm1l", bm1l); feat_lib_add(fl, FEAT_TYPE_CPOS, (char *)"bm1c", bm1c); @@ -1051,7 +1082,8 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"bm1X", bm1X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"bm1Y", bm1Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"bm1Z", bm1Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"bm1U1", bm1U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"bm1U1", bm1U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"bm1U1", bm1U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"bm1sgn", bm1sgn); feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"bm2f", bm2f); @@ -1159,7 +1191,8 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_X, (char *)"b1X", b1X); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b1Y", b1Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b1Z", b1Z); - feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b1U1", b1U1); + /* feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b1U1", b1U1); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b1U1", b1U1); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b1sgn", b1sgn); feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b2f", b2f); @@ -1249,22 +1282,35 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"ldep_b0p", ldep_b0p); feat_lib_add(fl, FEAT_TYPE_LABEL, (char *)"rdep_b0p", rdep_b0p); - feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_b0", ndep_b0); - feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_s0", ndep_s0); + /* feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_b0", ndep_b0); */ + /* feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"ndep_s0", ndep_s0); */ + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"ndep_b0", ndep_b0); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"ndep_s0", ndep_s0); /* distance features */ - feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"dist_s0_b0", dist_s0_b0); + /* feat_lib_add(fl, FEAT_TYPE_INT_7, (char *)"dist_s0_b0", dist_s0_b0); */ + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"dist_s0_b0", dist_s0_b0); /* configurational features */ - feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"sh", sh); - feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"bh", bh); + /* feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"sh", sh); */ + /* feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"bh", bh); */ + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"sh", sh); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"bh", bh); + /* feat_lib_add(fl, FEAT_TYPE_INT_8, (char *)"dh", dh); */ - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t1", t1); - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t2", t2); - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t3", t3); - feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t4", t4); + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t1", t1); */ + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t2", t2); */ + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t3", t3); */ + /* feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t4", t4); */ + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t1", t1); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t2", t2); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t3", t3); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"t4", t4); feat_lib_add(fl, FEAT_TYPE_INT, (char *)"mvt0", mvt0);