From f50e7d93c0512664a9b6d287f9f1871e22946643 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Thu, 11 Jan 2018 15:56:12 +0100 Subject: [PATCH] fixed few little bugs --- maca_common/include/word.h | 7 +- maca_common/include/word_buffer.h | 14 +-- maca_trans_parser/CMakeLists.txt | 7 +- maca_trans_parser/src/cff2fann.c | 103 +++++++++++------- maca_trans_parser/src/maca_trans_lemmatizer.c | 10 +- .../src/movement_parser_arc_eager.h | 6 +- 6 files changed, 86 insertions(+), 61 deletions(-) diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 88f32b6..7e8f30b 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -1,6 +1,6 @@ #ifndef __WORD__ #define __WORD__ - +#include<ctype.h> #include "mcd.h" #include "char16.h" @@ -28,6 +28,7 @@ typedef struct _word { #define word_get_s5(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 5))? -1 : (w)->form[strlen((w)->form) - 5]) #define word_get_s6(w) ((((w) == NULL) || ((w)->form == NULL) || (strlen((w)->form) < 6))? -1 : (w)->form[strlen((w)->form) - 6]) */ + #define word_get_s1(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 1))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 1]) #define word_get_s2(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 2))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 2]) #define word_get_s3(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 3))? -1 : (w)->form_char16[char16_strlen((w)->form_char16) - 3]) @@ -51,8 +52,8 @@ typedef struct _word { #define word_get_p6(w) ((((w) == NULL) || ((w)->form_char16 == NULL) || (char16_strlen((w)->form_char16) < 5))? -1 : (w)->form_char16[5]) #define word_get_id(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_ID]) -#define word_get_offset(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_OFFSET]) -#define word_get_length(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LENGTH]) +#define word_get_offset(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_OFFSET]) +#define word_get_length(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LENGTH]) #define word_get_form(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_FORM]) #define word_get_lemma(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_LEMMA]) #define word_get_cpos(w) (((w) == NULL) ? -1 : (w)->wf_array[MCD_WF_CPOS]) diff --git a/maca_common/include/word_buffer.h b/maca_common/include/word_buffer.h index eb995bd..136b560 100644 --- a/maca_common/include/word_buffer.h +++ b/maca_common/include/word_buffer.h @@ -32,13 +32,13 @@ #define word_buffer_is_empty(wb) (((wb)->nbelem == 0)? 1 : 0) typedef struct { - int size; /* size of the array used to store words */ - int nbelem; /* number of words in the buffer */ - int lookahead; /* number of words between the current word and the last word of the buffer */ - int current_index; /* position of the current word */ - word **array; /* array to store words */ - FILE *input_file; /* file to read the words from */ - mcd *mcd_struct; /* mcd describing the format of input_file */ + int size; /* size of the array used to store words */ + int nbelem; /* number of words in the buffer */ + int lookahead; /* number of words between the current word and the last word of the buffer */ + int current_index; /* position of the current word */ + word **array; /* array to store words */ + FILE *input_file; /* file to read the words from */ + mcd *mcd_struct; /* mcd describing the format of input_file */ } word_buffer; diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index a3cded9..fe60c66 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -210,9 +210,10 @@ install (TARGETS maca_trans_lemmatizer DESTINATION bin) #target_link_libraries(test_w2v transparse) #install (TARGETS test_w2v DESTINATION bin) -#add_executable(w2v_filter ./src/w2v_filter.c) -#target_link_libraries(w2v_filter transparse) -#install (TARGETS w2v_filter DESTINATION bin) +add_executable(w2v_filter ./src/w2v_filter.c) +target_link_libraries(w2v_filter transparse) +target_link_libraries(w2v_filter maca_common) +install (TARGETS w2v_filter DESTINATION bin) #add_executable(test_word_emb ./src/test_word_emb.c) #target_link_libraries(test_word_emb transparse) diff --git a/maca_trans_parser/src/cff2fann.c b/maca_trans_parser/src/cff2fann.c index 294d821..bcdf01f 100644 --- a/maca_trans_parser/src/cff2fann.c +++ b/maca_trans_parser/src/cff2fann.c @@ -52,6 +52,20 @@ void one_hot_print(FILE *f, int val, int dim) fprintf(f, "%d ", (i == val)? 1 : 0); } +void check_feature_model(feat_model *fm) +{ + int i; + feat_desc *fd; + + for(i=0; i <fm->nbelem; i++){ + fd = fm->array[i]; + if(fd->nbelem > 1){ + fprintf(stderr, "feature %d is a complex feature, aborting\n", i); + exit(1); + } + } +} + void print_header(mcd *m, feat_model *fm) { int i; @@ -62,33 +76,24 @@ void print_header(mcd *m, feat_model *fm) for(i=0; i <fm->nbelem; i++){ fd = fm->array[i]; - if(fd->nbelem > 1){ - printf("feature %d is a complex feature, skipping it\n", i); - } - else{ - sfd = fd->array[0]; - printf("\t%s", sfd->name); - } + sfd = fd->array[0]; + printf("\t%s", sfd->name); } printf("\n"); printf("OUT"); for(i=0; i <fm->nbelem; i++){ fd = fm->array[i]; - if(fd->nbelem > 1){ - printf("feature %d is a complex feature, skipping it\n", i); - } - else{ - sfd = fd->array[0]; - if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;} - if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;} - if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;} - if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;} - if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;} - if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;} - printf("\tUNK"); - } + sfd = fd->array[0]; + if(sfd->type == FEAT_TYPE_FORM){printf("\tFORM");continue;} + if(sfd->type == FEAT_TYPE_LEMMA){printf("\tLEMMA");continue;} + if(sfd->type == FEAT_TYPE_CPOS){printf("\tCPOS");continue;} + if(sfd->type == FEAT_TYPE_POS){printf("\tPOS");continue;} + if(sfd->type == FEAT_TYPE_LABEL){printf("\tLABEL");continue;} + if(sfd->type == FEAT_TYPE_INT){printf("\tINT");continue;} + printf("\tUNK"); } + printf("\n"); /* for(i=0; i < m->nb_col; i++){ @@ -127,6 +132,7 @@ void cff2fann(context *ctx) char feature_type[64]; int feature_valindex; int count = 0; + char *feat_str = NULL; vocab = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); @@ -142,34 +148,46 @@ void cff2fann(context *ctx) if (count % 100 == 0) fprintf(stderr, "%d\r", count); while(token){ - /* printf("col = %d token = %s max = %d\n", col_nb, token, max_array[col_nb]); */ + /* printf("col = %d token = %s\n", col_nb, token); */ val = atoi(token); if(col_nb == 0){ /* one_hot_print(stdout, val, ctx->mvt_nb); */ /* printf("\n"); */ printf("%d", val); } else { - sscanf(dico_int2string(vocab, val), "%[^==]==%d", feature_type, &feature_valindex); - /* printf("feature_type = %s\n", feature_type); */ - feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1); - /* printf("feat_type = %d\n", feat_type); */ - /* printf("%d: ", col_nb); */ - int mcd_col = m->wf2col[feat_type]; - /* printf("representation = %d\n", m->representation[mcd_col]); */ - if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){ - /* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */ - /* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */ - /* printf("\n"); */ - printf("\t%d", feature_valindex); - - } else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){ - /* printf("it is a vocab\n"); */ - /* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem); */ - /* printf("\n"); */ + feat_str = dico_int2string(vocab, val); + if(feat_str){ + /* printf("feat str = %s\n", feat_str); */ + sscanf(feat_str, "%[^==]==%d", feature_type, &feature_valindex); + /* printf("feature_type = %s\n", feature_type); */ + feat_type = feat_model_get_type_feat_n(ctx->features_model, col_nb - 1); + /* printf("feat_type = %d\n", feat_type); */ + /* printf("%d: ", col_nb); */ + int mcd_col = m->wf2col[feat_type]; + + /* printf("representation = %d\n", m->representation[mcd_col]); */ + if(m->representation[mcd_col] == MCD_REPRESENTATION_EMB){ + /* printf("it is an embedding val = %d, file = %s\n", val, m->filename[mcd_col]); */ + /* word_emb_print(stdout, m->word_emb_array[mcd_col], feature_valindex); */ + /* printf("\n"); */ + printf("\t%d", feature_valindex); + + } else if(m->representation[mcd_col] == MCD_REPRESENTATION_VOCAB){ + /* printf("it is a vocab\n"); */ + /* one_hot_print(stdout, feature_valindex, m->dico_array[mcd_col]->nbelem); */ + /* printf("\n"); */ + printf("\t%d", feature_valindex); + } else { + printf("\t%d", feature_valindex); + } + } + else{ + fprintf(stderr, "WARNING cannot find the description of feature : %d\n", val); + feature_valindex = -1; + printf("\t%d", feature_valindex); - } else { - printf("\t%d", feature_valindex); - } + + } } col_nb++; token = strtok(NULL , "\t"); @@ -193,6 +211,9 @@ int main(int argc, char *argv[]) ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); + + check_feature_model(ctx->features_model); + look_for_number_of_features_and_classes(ctx->cff_filename, &nb_feat, &nb_class); ctx->mvt_nb = nb_class; diff --git a/maca_trans_parser/src/maca_trans_lemmatizer.c b/maca_trans_parser/src/maca_trans_lemmatizer.c index 645be94..641ea6c 100644 --- a/maca_trans_parser/src/maca_trans_lemmatizer.c +++ b/maca_trans_parser/src/maca_trans_lemmatizer.c @@ -158,7 +158,8 @@ int main(int argc, char *argv[]) lemma_from_fplm = fplm_lookup_lemma(exceptions, form, pos, ctx->verbose); if(lemma_from_fplm){ // printf("lemma %s found in exceptions file\n", lemma_from_fplm); - print_word(b0, ctx->mcd_struct, lemma_from_fplm); + // print_word(b0, ctx->mcd_struct, to_lower_string(lemma_from_fplm)); + print_word(b0, ctx->mcd_struct, lemma_from_fplm); } // if lemma is not found in exception file, predict an l_rule else{ @@ -185,15 +186,16 @@ int main(int argc, char *argv[]) if(l_rule_is_applicable(form, l_rule)){ char *transformed_lemma = apply_l_rule(form, l_rule); // printf("transformed_lemma = %s\n", transformed_lemma); - // print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma)); - print_word(b0, ctx->mcd_struct, transformed_lemma); + // print_word(b0, ctx->mcd_struct, to_lower_string(transformed_lemma)); + print_word(b0, ctx->mcd_struct, transformed_lemma); free(transformed_lemma); break; } } /* no rule applied */ if(i == 10){ - print_word(b0, ctx->mcd_struct, form); + // print_word(b0, ctx->mcd_struct, to_lower_string(form)); + print_word(b0, ctx->mcd_struct, form); } free(vcode_array); } diff --git a/maca_trans_parser/src/movement_parser_arc_eager.h b/maca_trans_parser/src/movement_parser_arc_eager.h index 3a040d5..2b11bf0 100644 --- a/maca_trans_parser/src/movement_parser_arc_eager.h +++ b/maca_trans_parser/src/movement_parser_arc_eager.h @@ -7,9 +7,9 @@ #define MVT_PARSER_SHIFT 0 #define MVT_PARSER_REDUCE 1 #define MVT_PARSER_ROOT 2 -#define MVT_PARSER_EOS -1 -#define MVT_PARSER_LEFT 3 -#define MVT_PARSER_RIGHT 4 +#define MVT_PARSER_EOS 3 +#define MVT_PARSER_LEFT 4 +#define MVT_PARSER_RIGHT 5 /* even movements are left movements (except 0, which is shift and 2 which is root) */ #define movement_parser_left_code(label) (2 * (label) + 4) -- GitLab