diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c index 5c665a112a9733853578c851234138d99f0185ac..54cb8ee72da896eab9b28b57ae19bb20a5c581d6 100644 --- a/maca_trans_parser/src/feat_fct.c +++ b/maca_trans_parser/src/feat_fct.c @@ -261,6 +261,11 @@ int b2X(config *c) {return word_get_X(word_buffer_b2(config_get_buffer(c)));} int b2Y(config *c) {return word_get_Y(word_buffer_b2(config_get_buffer(c)));} int b2Z(config *c) {return word_get_Z(word_buffer_b2(config_get_buffer(c)));} +int b2U1(config *c) {return word_get_U1(word_buffer_b2(config_get_buffer(c)));} +int b2sgn(config *c) {return word_get_signature(word_buffer_b2(config_get_buffer(c)));} + + + int b3f(config *c) {return word_get_form(word_buffer_b3(config_get_buffer(c)));} int b3l(config *c) {return word_get_lemma(word_buffer_b3(config_get_buffer(c)));} int b3c(config *c) {return word_get_cpos(word_buffer_b3(config_get_buffer(c)));} @@ -576,14 +581,15 @@ int dist_s0_b0(config *c){ dist = word_get_index(word_buffer_b0(config_get_buffer(c))) - word_get_index(stack_top(config_get_stack(c))); - return (abs(dist) > 6)? 6 : dist; + return (abs(dist) > 6)? 6 : dist; } /* configurational features */ int sh(config *c) /* stack height */ { - return (config_get_stack(c)->size > 7)? 7 : config_get_stack(c)->size; + return (config_get_stack(c)->top > 7)? 7 : config_get_stack(c)->top; + /* return (stack_nbelem(config_get_stack(c)) > 0)? 
1 : 0; */ } int bh(config *c) /* buffer size */ diff --git a/maca_trans_parser/src/feat_fct.h b/maca_trans_parser/src/feat_fct.h index 41af327a4465b5cf6c19b97ab82d23bbbe44c261..e654b10cda73ab593d7fd65e2b32b11d5e012e14 100644 --- a/maca_trans_parser/src/feat_fct.h +++ b/maca_trans_parser/src/feat_fct.h @@ -271,6 +271,10 @@ int b2Z(config *c); int b2r(config *c); +int b2U1(config *c); +int b2sgn(config *c); + + int b3f(config *c); int b3l(config *c); int b3c(config *c); diff --git a/maca_trans_parser/src/feat_lib.c b/maca_trans_parser/src/feat_lib.c index a60a51103fb1f2ce52f15d9af69a291e18cd567e..6342d4e602ba3cc54ce39434f156082fcf2fea31 100644 --- a/maca_trans_parser/src/feat_lib.c +++ b/maca_trans_parser/src/feat_lib.c @@ -394,6 +394,9 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b2Y", b2Y); feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b2Z", b2Z); + feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b2U1", b2U1); + feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b2sgn", b2sgn); + feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b3f", b3f); feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"b3l", b3l); feat_lib_add(fl, FEAT_TYPE_CPOS, (char *)"b3c", b3c); diff --git a/maca_trans_parser/src/movement_parser_arc_eager.c b/maca_trans_parser/src/movement_parser_arc_eager.c index 564da2a0e7c849eb3e6710286782b888a4e6a29a..757760eabe412fc8bbdca29f977edf5cfae68ba4 100644 --- a/maca_trans_parser/src/movement_parser_arc_eager.c +++ b/maca_trans_parser/src/movement_parser_arc_eager.c @@ -8,14 +8,14 @@ void movement_print(FILE *f, int mvt_code, dico *dico_labels){ int mvt_type = movement_type(mvt_code); int mvt_label = movement_label(mvt_code); char *label; - if(mvt_type == MVT_SHIFT) {fprintf(f, "SHIFT\n"); return;} - if(mvt_type == MVT_REDUCE) {fprintf(f, "REDUCE\n"); return;} - if(mvt_type == MVT_ROOT) {fprintf(f, "ROOT\n"); return;} - if(mvt_type == MVT_EOS) {fprintf(f, "EOS\n"); return;} + if(mvt_type == MVT_SHIFT) {fprintf(f, "SHIFT"); return;} + if(mvt_type == MVT_REDUCE) {fprintf(f, 
"REDUCE"); return;} + if(mvt_type == MVT_ROOT) {fprintf(f, "ROOT"); return;} + if(mvt_type == MVT_EOS) {fprintf(f, "EOS"); return;} if(mvt_type == MVT_RIGHT) fprintf(f, "RIGHT"); else fprintf(f, "LEFT"); label = dico_int2string(dico_labels, mvt_label); - fprintf(f, " %s\n", label); + fprintf(f, " %s", label); } int movement_type(int mvt) @@ -46,6 +46,8 @@ int movement_eos(config *c, float score) /* word on the top of the stack is sent_seg */ word_set_sent_seg(stack_top(config_get_stack(c)), 1); + /* (config_get_stack(c))->top = 0; */ + config_add_mvt(c, MVT_EOS); return 1; } @@ -101,7 +103,10 @@ int movement_shift(config *c, int stream, float score) int movement_reduce(config *c, float score) { - if(stack_is_empty(config_get_stack(c))) return 0; + if(stack_nbelem(config_get_stack(c)) <= 1) return 0; + + /* if(stack_is_empty(config_get_stack(c))) return 0; */ + /* word on top of stack must have a governor */ if(word_get_gov(stack_top(config_get_stack(c))) == WORD_INVALID_GOV) return 0; stack_pop(config_get_stack(c)); @@ -116,7 +121,9 @@ int movement_root(config *c, float score, int root_code) word_set_gov(s0, 0); word_set_label(s0, root_code); s0->is_root = 1; + stack_pop(config_get_stack(c)); + config_add_mvt(c, MVT_ROOT); return 1; } diff --git a/maca_trans_parser/src/oracle_parser_arc_eager.c b/maca_trans_parser/src/oracle_parser_arc_eager.c index c8651868c4e99346303a1440f3e08958d472e4c3..93d662b23fb97f5f03ea3cbd5527c08746e4a574 100644 --- a/maca_trans_parser/src/oracle_parser_arc_eager.c +++ b/maca_trans_parser/src/oracle_parser_arc_eager.c @@ -9,52 +9,44 @@ int check_all_dependents_of_word_in_ref_are_in_hyp(config *c, word_buffer *ref, int dep; int gov_ref; int gov_hyp; - /* int sentence_change; */ + int sentence_change; -#if 1 - for(dep = word_index - 1; (dep > 0) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0); dep--){ +#if 0 + for(dep = word_index - 1; (dep >= 0) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0); dep--){ gov_ref = 
word_get_gov_index(word_buffer_get_word_n(ref, dep)); - if(gov_ref == word_index){ /* found a dependent of word in ref */ - /* look for a dependency in hyp such that its dependent is dep */ + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ + /* check that dep has the same governor in hyp */ gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); - if(gov_hyp != gov_ref) return 0; } } for(dep = word_index + 1; ((dep < word_buffer_get_nbelem(ref)) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0)); dep++){ gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); - if(gov_ref == word_index){ /* found a dependent of word in ref */ - /* look for a dependency in hyp such that its dependent is dep */ + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ + /* check that dep has the same governor in hyp */ gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); - if(gov_hyp != gov_ref) return 0; } } #endif -#if 0 - sentence_change = 0; - for(dep = word_index - 1; (dep > 0) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0); dep--){ - /* printf("dep = %d\n", dep); */ - if(word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 1) - sentence_change = 1; +#if 1 + for(dep = word_index - 1; (dep >= 0) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0); dep--){ gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); - if(gov_ref == word_index){ /* found a dependent of word in ref */ - /* look for a dependency in hyp such that its dependent is dep */ + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ + /* check that dep has the same governor in hyp */ gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); if(gov_hyp != gov_ref) return 0; } } sentence_change = 0; - for(dep = word_index + 1; - (dep < word_buffer_get_nbelem(ref)) && (sentence_change == 0); dep++){ - /* printf("dep = %d\n", dep); */ + for(dep = 
word_index + 1; (dep < word_buffer_get_nbelem(ref)) && (sentence_change == 0); dep++){ if(word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 1) sentence_change = 1; gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); - if(gov_ref == word_index){ /* found a dependent of word in ref */ + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ /* look for a dependency in hyp such that its dependent is dep */ gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); if(gov_hyp != gov_ref) return 0; @@ -100,10 +92,10 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label) } /* word on the top of the stack is an end of sentence marker */ - if((word_get_sent_seg(word_buffer_get_word_n(ref, s0_index)) == 1) + if((word_get_sent_seg(word_buffer_get_word_n(ref, s0_index)) == 1) && (word_get_sent_seg(word_buffer_get_word_n(config_get_buffer(c), s0_index)) != 1)){ return MVT_EOS; - } + } /* LEFT ARC b0 is the governor and s0 the dependent */ if(s0_gov_index == b0_index){ @@ -115,9 +107,9 @@ int oracle_parser_arc_eager(config *c, word_buffer *ref, int root_label) return movement_right_code(word_get_label(word_buffer_get_word_n(ref, b0_index))); } /* REDUCE */ - if((stack_height(config_get_stack(c)) > 2) - && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) - && (word_get_gov(stack_top(config_get_stack(c))) != WORD_INVALID_GOV)){ /* word on top of the stack has a goveror */ + if((stack_nbelem(config_get_stack(c)) > 1) + && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) /* word on top must have all its dependents */ + && (word_get_gov(stack_top(config_get_stack(c))) != WORD_INVALID_GOV)){ /* word on top of the stack has a governor */ return MVT_REDUCE; } } diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c index 03b1f7c8863017810eda9edfc1b5872882a6b2d8..a071ed358418eaf54d0807af77adde0a96e939bc 100644 --- 
a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c +++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c @@ -10,6 +10,29 @@ #include"feature_table.h" #include"dico.h" +void print_word_buffer(config *c, dico *dico_labels) +{ + int i; + word *dep; + char *label; + + for(i=0; i < config_get_buffer(c)->nbelem; i++){ + dep = word_buffer_get_word_n(config_get_buffer(c), i); + printf("%s\t", word_get_input(dep)); + printf("%d\t", word_get_gov(dep)); + label = (word_get_label(dep) == -1)? NULL : dico_int2string(dico_labels, word_get_label(dep)); + if(label != NULL) + printf("%s\t", label) ; + else + printf("_\t"); + if(word_get_sent_seg(dep) == 1) + printf("1\n") ; + else + printf("0\n"); + } +} + +#if 0 void print_word_buffer(config *c, dico *dico_labels) { int i; @@ -21,20 +44,20 @@ void print_word_buffer(config *c, dico *dico_labels) dep = word_buffer_get_word_n(config_get_buffer(c), i); if(word_get_gov(dep) == 0) root_position = i; printf("%s\t", word_get_input(dep)); - if(word_get_sent_seg(dep) == 1){ + /* if(word_get_sent_seg(dep) == 1){ printf("%d\teos\t1\n", root_position - i); - } - else{ + } + else{*/ printf("%d\t", word_get_gov(dep)); label = (word_get_label(dep) == -1)? 
NULL : dico_int2string(dico_labels, word_get_label(dep)); if(label != NULL) printf("%s\t0\n", label) ; else printf("_\t0\n"); - } + /* } */ } } - +#endif void simple_decoder_parser_arc_eager(context *ctx) { @@ -48,7 +71,12 @@ void simple_decoder_parser_arc_eager(context *ctx) feat_vec *fv = feat_vec_new(feature_types_nb); config *c = NULL; int result; + float entropy; + float delta; + int argmax1, argmax2; + float max1, max2; + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); if(root_label == -1) root_label = 0; @@ -59,10 +87,24 @@ void simple_decoder_parser_arc_eager(context *ctx) mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); + if(ctx->debug_mode){ fprintf(stdout, "***********************************\n"); config_print(stdout, c); - movement_print(stdout, mvt_code, ctx->dico_labels); + entropy = feature_table_entropy(fv, ft); + /* delta = feature_table_diff_scores(fv, ft); */ + feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2); + movement_print(stdout, argmax1, ctx->dico_labels); + printf(":\t%f\n", max1); + movement_print(stdout, argmax2, ctx->dico_labels); + printf(":\t%f\n", max2); + printf("delta = %f\n", max1 - max2); + + /* delta = feature_table_first_second(fv, ft); */ + /* printf("entropy = %f delta = %f\n", entropy, delta); */ + printf("entropy = %f\n",entropy); + + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ } result = 0; switch(mvt_type){ diff --git a/perceptron/lib/CMakeLists.txt b/perceptron/lib/CMakeLists.txt index a8e202fe53bb90840051343bb4551f96d5a73665..1077b57392f34572a942ebe84962c3f166d23455 100644 --- a/perceptron/lib/CMakeLists.txt +++ b/perceptron/lib/CMakeLists.txt @@ -8,3 +8,6 @@ set(SOURCES src/cf_file.c #compiling library include_directories(src) add_library(perceptron STATIC ${SOURCES}) + +find_library(M_LIB m) +target_link_libraries(perceptron ${M_LIB}) diff --git a/perceptron/lib/include/feature_table.h b/perceptron/lib/include/feature_table.h index 
ff2ed6cd49df2fcdd3dcc55fbe7d96beec41fc0d..ef1bbe66e1b47d616b138a47daacfd153b6dd174 100644
--- a/perceptron/lib/include/feature_table.h
+++ b/perceptron/lib/include/feature_table.h
@@ -22,6 +22,9 @@ feature_table *feature_table_new(int features_nb, int classes_nb);
 void feature_table_print(char *filename, feature_table *ft);
 void feature_table_print_verbose(char *filename, feature_table *ft, dico *dico_features, dico *dico_classes);
 int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max);
+float feature_table_entropy(feat_vec *fv, feature_table *ft);
+float feature_table_diff_scores(feat_vec *fv, feature_table *ft);
+float feature_table_argmax_1_2(feat_vec *fv, feature_table *ft, int *argmax1, float *max1, int *argmax2, float *max2);
 void feature_table_free(feature_table *ft);
 void feature_table_scores(feat_vec *fv, feature_table *ft, float *classes_score);
 vcode* feature_table_get_vcode_array(feat_vec *fv, feature_table* ft);
diff --git a/perceptron/lib/src/feature_table.c b/perceptron/lib/src/feature_table.c
index 2a7dd03819317d144e7a21a778c3400d231b4243..8fb1aed148d74d015fa7f407c0811ddcc6d20f95 100644
--- a/perceptron/lib/src/feature_table.c
+++ b/perceptron/lib/src/feature_table.c
@@ -8,6 +8,8 @@ feature_table *feature_table_load(char *filename, int verbose)
 {
   int i;
 
+
+
   feature_table *ft = NULL;
   int features_nb;
   int classes_nb;
@@ -137,6 +139,115 @@ void feature_table_free(feature_table *ft)
   free(ft);
 }
 
+/* Find the two highest entries of scores[0..n-1]; n must be at least 1.
+ * *best receives the lowest index holding the maximum, *second the lowest
+ * index holding the maximum of the remaining entries (-1 when n == 1). */
+static void feature_table_top_two(const float *scores, int n, int *best, int *second)
+{
+  int cla;
+
+  *best = 0;
+  *second = -1;
+  for(cla=1; cla < n; cla++){
+    if(scores[cla] > scores[*best]){
+      *second = *best; /* the previous winner becomes the runner-up */
+      *best = cla;
+    }
+    else if((*second < 0) || (scores[cla] > scores[*second]))
+      *second = cla;
+  }
+}
+
+/* Margin between the highest and the second highest class score computed
+ * by feature_table_scores() for fv; 0 when ft has fewer than two classes. */
+float feature_table_diff_scores(feat_vec *fv, feature_table *ft)
+{
+  int classes_nb = ft->classes_nb;
+  float *classes_score;
+  float diff;
+  int first, second;
+
+  if(classes_nb < 2) return 0;
+
+  classes_score = (float *)memalloc(classes_nb * sizeof(float));
+  feature_table_scores(fv, ft, classes_score);
+  feature_table_top_two(classes_score, classes_nb, &first, &second);
+  diff = classes_score[first] - classes_score[second];
+  free(classes_score); /* the score array is a scratch buffer */
+  return diff;
+}
+
+/* Store in (*argmax1, *max1) the best class and its score for fv, and in
+ * (*argmax2, *max2) the runner-up; ties go to the lowest class index.
+ * With a single class the runner-up duplicates the winner; with no class
+ * both indices are -1 and both scores 0. Returns *max1 - *max2. */
+float feature_table_argmax_1_2(feat_vec *fv, feature_table *ft, int *argmax1, float *max1, int *argmax2, float *max2)
+{
+  int classes_nb = ft->classes_nb;
+  float *classes_score;
+
+  if(classes_nb < 1){
+    *argmax1 = *argmax2 = -1;
+    *max1 = *max2 = 0;
+    return 0;
+  }
+
+  classes_score = (float *)memalloc(classes_nb * sizeof(float));
+  feature_table_scores(fv, ft, classes_score);
+  feature_table_top_two(classes_score, classes_nb, argmax1, argmax2);
+  if(*argmax2 < 0) *argmax2 = *argmax1; /* single class: no distinct runner-up */
+  *max1 = classes_score[*argmax1];
+  *max2 = classes_score[*argmax2];
+  free(classes_score);
+
+  return (*max1 - *max2);
+}
+
+/* Entropy (natural logarithm) of the distribution obtained by shifting the
+ * class scores of fv so that the lowest one is 0 and dividing by their sum.
+ * When all scores are equal the shifted sum is 0: the distribution is then
+ * treated as uniform and log(classes_nb) is returned instead of dividing by
+ * zero. Returns 0 when ft has no class.
+ * NOTE(review): log() needs <math.h>; confirm it is included in this file. */
+float feature_table_entropy(feat_vec *fv, feature_table *ft)
+{
+  int classes_nb = ft->classes_nb;
+  float *classes_score;
+  float min;
+  float sum = 0;
+  float entropy = 0;
+  float proba;
+  int cla;
+
+  if(classes_nb < 1) return 0;
+
+  classes_score = (float *)memalloc(classes_nb * sizeof(float));
+  feature_table_scores(fv, ft, classes_score);
+
+  min = classes_score[0];
+  for(cla=1; cla < classes_nb; cla++)
+    if(classes_score[cla] < min)
+      min = classes_score[cla];
+
+  for(cla=0; cla < classes_nb; cla++){
+    classes_score[cla] -= min;
+    sum += classes_score[cla];
+  }
+
+  if(sum > 0){
+    for(cla=0; cla < classes_nb; cla++){
+      proba = classes_score[cla] / sum;
+      if(proba > 0) /* 0 * log(0) is taken as 0 */
+        entropy -= proba * log(proba);
+    }
+  }
+  else
+    entropy = log((double)classes_nb);
+
+  free(classes_score);
+  return entropy;
+}
+
 int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max)
 {
   float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));