diff --git a/maca_tools/src/mcf2conll.c b/maca_tools/src/mcf2conll.c
index 7fb78ae84e5670574af824eaf6a4541f73052003..fa5f7cf7e3629e3b6b26971f2f24f7490af0f7bc 100644
--- a/maca_tools/src/mcf2conll.c
+++ b/maca_tools/src/mcf2conll.c
@@ -180,8 +180,12 @@ int main(int argc, char *argv[])
         fprintf(output_file, "_");
       fprintf(output_file, "\t");
 
-      if(gov_col)
-        fprintf(output_file, "%d\t", word_get_gov(w) + index);
+      if(gov_col){
+        if(word_get_gov(w) == 0)
+          fprintf(output_file, "0\t");
+        else
+          fprintf(output_file, "%d\t", word_get_gov(w) + index);
+      }
       else
         fprintf(output_file, "_\t");
 
diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt
index 495f31de62a8ebe90ac4f5c71d9c3f4a878c88ed..fb3e04dd772bfb808bcf522438c3cb49b89e54d3 100644
--- a/maca_trans_parser/CMakeLists.txt
+++ b/maca_trans_parser/CMakeLists.txt
@@ -53,6 +53,12 @@ target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse)
 target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common)
 install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin)
 
+add_executable(compare_traces ./src/compare_traces.c)
+target_link_libraries(compare_traces perceptron)
+target_link_libraries(compare_traces transparse)
+target_link_libraries(compare_traces maca_common)
+install (TARGETS compare_traces DESTINATION bin)
+
 add_executable(maca_trans_parser ./src/maca_trans_parser.c)
 target_link_libraries(maca_trans_parser perceptron)
 target_link_libraries(maca_trans_parser transparse)
diff --git a/maca_trans_parser/src/compare_traces.c b/maca_trans_parser/src/compare_traces.c
new file mode 100644
index 0000000000000000000000000000000000000000..a4d1875da2223b891457657293752502712b4f31
--- /dev/null
+++ b/maca_trans_parser/src/compare_traces.c
@@ -0,0 +1,213 @@
+/* compare_traces : align a reference trace and a hypothesis trace and */
+/* print, configuration by configuration, whether they are equal       */
+
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+
+/* sizes of the buffers used to read one line of a trace file */
+#define TRACE_LINE_SIZE 20000
+#define TRACE_STACK_SIZE 10000
+#define TRACE_MOVEMENT_SIZE 100
+
+/* one line of a trace file : index <tab> stack <tab> movement <tab> score */
+typedef struct {
+  int index;
+  char *stack;
+  char *movement;
+  float score;
+} configuration;
+
+typedef struct {
+  int size;               /* number of slots allocated in array */
+  int nbelem;             /* number of configurations stored in array */
+  configuration **array;  /* array to store configurations */
+} trace;
+
+
+/* two configurations are equal if they have the same index and the same */
+/* stack (movement and score are not compared)                           */
+
+int configuration_equal(configuration *c1, configuration *c2)
+{
+  if(c1->index != c2->index) return 0;
+  if(strcmp(c1->stack, c2->stack)) return 0;
+  return 1;
+}
+
+
+/* stack and movement are stored as is (not copied) */
+/* returns NULL if the configuration cannot be allocated */
+
+configuration *configuration_new(int index, char *stack, char *movement, float score)
+{
+  configuration *c = malloc(sizeof(configuration));
+  if(c == NULL) return NULL;
+  c->index = index;
+  c->stack = stack;
+  c->movement = movement;
+  c->score = score;
+  return c;
+}
+
+void configuration_print(FILE *f, configuration *c)
+{
+  fprintf(f, "%d\t%s\t%s\t%f\n", c->index, c->stack, c->movement, c->score);
+}
+
+
+/* allocate an empty trace, abort if memory cannot be allocated */
+
+trace *trace_new()
+{
+  trace *t = (trace *)malloc(sizeof(trace));
+  if(t == NULL){
+    fprintf(stderr, "cannot allocate memory, aborting\n");
+    exit(1);
+  }
+  t->size = 10;
+  t->nbelem = 0;
+  t->array = (configuration **)malloc(t->size * sizeof(configuration *));
+  if(t->array == NULL){
+    fprintf(stderr, "cannot allocate memory, aborting\n");
+    exit(1);
+  }
+  return t;
+}
+
+
+/* append c to t, return its position in t */
+/* return -1 (t is left unchanged) if c is NULL or if t cannot grow */
+
+int trace_add(trace *t, configuration *c)
+{
+  if(c == NULL) return -1;
+  if(t->nbelem == t->size -1){
+    int new_size = 2 * (t->size + 1);
+    configuration **new_array = (configuration **)realloc(t->array, new_size * sizeof(configuration *));
+    if(new_array == NULL) return -1;
+    t->array = new_array;
+    t->size = new_size;
+  }
+  t->array[t->nbelem] = c;
+  t->nbelem++;
+  return t->nbelem - 1;
+}
+
+void trace_print(FILE *f, trace *t)
+{
+  int i;
+  for(i=0; i < t->nbelem; i++)
+    configuration_print(f, t->array[i]);
+}
+
+
+/* read a trace from trace_filename (from stdin if trace_filename is NULL) */
+/* lines that do not hold the four expected fields are skipped */
+
+trace *trace_load(char *trace_filename)
+{
+  FILE *f;
+  int index;
+  float score;
+  char stack[TRACE_STACK_SIZE];
+  char movement[TRACE_MOVEMENT_SIZE];
+  char buffer[TRACE_LINE_SIZE];
+  trace *t;
+
+  if(trace_filename == NULL)
+    f = stdin;
+  else
+    f = fopen(trace_filename, "r");
+  if(f == NULL){
+    fprintf(stderr, "cannot open file %s aborting\n", trace_filename);
+    exit(1);
+  }
+
+  t = trace_new();
+  while(fgets(buffer, TRACE_LINE_SIZE, f)){
+    /* the field widths (TRACE_STACK_SIZE - 1 and TRACE_MOVEMENT_SIZE - 1) */
+    /* keep sscanf from writing past the end of stack and movement */
+    int r = sscanf(buffer, "%d\t%9999[^\t]\t%99[^\t]\t%f\n", &index, stack, movement, &score);
+    if(r == 4){
+      char *stack_copy = strdup(stack);
+      char *movement_copy = strdup(movement);
+      if(stack_copy == NULL || movement_copy == NULL ||
+         trace_add(t, configuration_new(index, stack_copy, movement_copy, score)) == -1){
+        fprintf(stderr, "cannot allocate memory, aborting\n");
+        exit(1);
+      }
+    }
+  }
+
+  if(trace_filename != NULL)
+    fclose(f);
+  return t;
+}
+
+
+/* walk through ref and hyp in parallel : for every pair of configurations */
+/* print a REF line, a HYP line and EQUAL or DIFFERENT */
+/* when the pair is different, the trace with the smaller index moves on, */
+/* both move on when the indexes are the same */
+
+void trace_compare(trace *ref, trace *hyp)
+{
+  int index_hyp = 0;
+  int index_ref = 0;
+  configuration *c_ref, *c_hyp;
+
+  /* stop at the end of either trace : array slots beyond nbelem are not set */
+  while(index_ref < ref->nbelem && index_hyp < hyp->nbelem){
+    c_ref = ref->array[index_ref];
+    c_hyp = hyp->array[index_hyp];
+    printf("REF ");
+    configuration_print(stdout, c_ref);
+    printf("HYP ");
+    configuration_print(stdout, c_hyp);
+
+    if(configuration_equal(c_ref, c_hyp)){
+      fprintf(stdout, "EQUAL\n");
+      index_hyp++;
+      index_ref++;
+    }
+    else{
+      fprintf(stdout, "DIFFERENT\n");
+      if(c_ref->index > c_hyp->index)
+        index_hyp++;
+      else if(c_ref->index < c_hyp->index)
+        index_ref++;
+      else{
+        index_hyp++;
+        index_ref++;
+      }
+    }
+  }
+}
+
+
+/* usage : compare_traces ref_trace [hyp_trace] */
+/* hyp is read from stdin when hyp_trace is absent */
+
+int main(int argc, char *argv[])
+{
+  char *ref_filename;
+  char *hyp_filename;
+  trace *t_ref;
+  trace *t_hyp;
+
+  if(argc < 2){
+    fprintf(stderr, "usage: compare_traces ref_trace [hyp_trace]\n");
+    exit(1);
+  }
+  ref_filename = argv[1];
+  hyp_filename = argv[2]; /* argv[argc] is NULL */
+
+  fprintf(stderr, "loading file %s\n", ref_filename);
+  t_ref = trace_load(ref_filename);
+  fprintf(stderr, "loading file %s\n", hyp_filename ? hyp_filename : "stdin");
+  t_hyp = trace_load(hyp_filename);
+
+  trace_compare(t_ref, t_hyp);
+  return 0;
+}
diff --git a/maca_trans_parser/src/config.c b/maca_trans_parser/src/config.c
index 02c392ecee9fe1d1e3e36c2f849cec6f749e156b..392299edc8070cd6b03f45028154fd4ca7b00ff6 100644
--- a/maca_trans_parser/src/config.c
+++ b/maca_trans_parser/src/config.c
@@ -111,16 +111,11 @@ void config_add_mvt(config *c, int mvt)
 
 void config_print(FILE *f, config *c)
 {
-  /* word *b0 = NULL; */
-  /* word *s0 = NULL; */
   if(c){
-    if(!stack_is_empty(c->st))
-    /* s0 = stack_elt_n(c->st, 0); */
-    /* b0 = word_buffer_b0(c->bf); */
-    /* if(s0) { printf("s0 = "); word_print2(stdout, s0);} */
-    /* if(b0) { printf("b0 = "); word_print2(stdout, b0);} */
-
-      stack_print(f, c->st);
+    if(stack_is_empty(c->st))
+      fprintf(f, "[ ]");
+    else
+      stack_print(f, c->st);
     fprintf(f, "\n");
     word_buffer_print_compact(f, c->bf);
   }
diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c
index 2a1941a876ccd47c459b9f0375bd40c482d516f6..e02095c8dc54cd47614e602b733f7ff38dd4609d 100644
--- a/maca_trans_parser/src/context.c
+++ b/maca_trans_parser/src/context.c
@@ -81,6 +81,8 @@ context *context_new(void)
   ctx->conll = 0;
   ctx->ifpls = 1;
 
+  ctx->trace_mode = 0;
+
   return ctx;
 }
 
@@ -149,6 +151,9 @@ void context_root_label_help_message(context *ctx){
 void context_f2p_filename_help_message(context *ctx){
   fprintf(stderr, "\t-P --f2p <file> : form to pos (f2p) filename\n");
 }
+void context_trace_mode_help_message(context *ctx){
+  fprintf(stderr, "\t-T --traces : activate trace mode (default is false)\n");
+}
 
 context *context_read_options(int argc, char *argv[])
 {
@@ -180,13 +185,14 @@ context *context_read_options(int argc, char *argv[])
       {"language", required_argument, 0, 'L'},
       {"maca_data_path", required_argument, 0, 'D'},
       {"root_label", required_argument, 0, 'R'},
-      {"f2p", required_argument, 0, 'P'}
+      {"f2p", required_argument, 0, 'P'},
+      {"traces", no_argument, 0, 'T'}
     };
   optind = 0;
   opterr = 0;
 
 
-  while ((c = getopt_long (argc, argv, "hvdcSm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:", long_options, &option_index)) != -1){
+  while ((c = getopt_long (argc, argv, "hvdcSTm:i:n:x:u:r:M:b:f:s:C:F:V:L:D:R:P:", long_options, &option_index)) != -1){
     switch (c)
       {
       case 'h':
@@ -201,6 +207,9 @@ context *context_read_options(int argc, char *argv[])
       case 'c':
         ctx->conll = 1;
         break;
+      case 'T':
+        ctx->trace_mode = 1;
+        break;
       case 'm':
         ctx->perc_model_filename = strdup(optarg);
         break;
diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h
index b349449ada8182a8754a33cffd68d260db87d231..013a09645c0813c91c844d4db8081592ca5a3511 100644
--- a/maca_trans_parser/src/context.h
+++ b/maca_trans_parser/src/context.h
@@ -58,6 +58,7 @@ typedef struct {
   form2pos *f2p;
   int conll;
   int ifpls;
+  int trace_mode;
 } context;
 
 context *context_new(void);
diff --git a/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c
index ea9d8cf9fded0975ed3d27969a0164cf56dfb680..b93cd538cb8b38416aa96fc7bc8337d0251f7551 100644
--- a/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c
+++ b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c
@@ -80,9 +80,20 @@ void generate_training_file_stream(FILE *output_file, context *ctx)
         config_print(stdout,c);
         movement_print(stdout, mvt_code, ctx->dico_labels);
       }
-      fprintf(output_file, "%d", mvt_code);
-      feat_vec_print(output_file, fv);
 
+      if(ctx->trace_mode){
+        fprintf(output_file, "%d\t", word_get_index(word_buffer_b0(config_get_buffer(c))));
+        stack_print(output_file, c->st);
+        fprintf(output_file, "\t");
+
+        movement_print(output_file, mvt_code, ctx->dico_labels);
+        fprintf(output_file, "\t1\n");
+      }
+      else{
+        fprintf(output_file, "%d", mvt_code);
+        feat_vec_print(output_file, fv);
+      }
+
       if(mvt_type == MVT_EOS){
         movement_eos(c, 0);
         sentence_nb++;
diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
index a071ed358418eaf54d0807af77adde0a96e939bc..ad65e2eb813ccb2373eb02a902c907f5cffb7205 100644
--- a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
+++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c
@@ -32,32 +32,6 @@ void print_word_buffer(config *c, dico *dico_labels)
   }
 }
 
-#if 0
-void print_word_buffer(config *c, dico *dico_labels)
-{
-  int i;
-  word *dep;
-  char *label;
-  int root_position = 0;
-
-  for(i=0; i < config_get_buffer(c)->nbelem; i++){
-    dep = word_buffer_get_word_n(config_get_buffer(c), i);
-    if(word_get_gov(dep) == 0) root_position = i;
-    printf("%s\t", word_get_input(dep));
-    /* if(word_get_sent_seg(dep) == 1){
-      printf("%d\teos\t1\n", root_position - i);
-    }
-    else{*/
-      printf("%d\t", word_get_gov(dep));
-      label = (word_get_label(dep) == -1)? NULL : dico_int2string(dico_labels, word_get_label(dep));
-      if(label != NULL)
-        printf("%s\t0\n", label) ;
-      else
-        printf("_\t0\n");
-      /* } */
-  }
-}
-#endif
 
 void simple_decoder_parser_arc_eager(context *ctx)
 {
@@ -75,7 +49,7 @@ void simple_decoder_parser_arc_eager(context *ctx)
   float delta;
   int argmax1, argmax2;
   float max1, max2;
-
+  int index;
 
   root_label = dico_string2int(ctx->dico_labels, ctx->root_label);
   if(root_label == -1) root_label = 0;
@@ -86,7 +60,20 @@ void simple_decoder_parser_arc_eager(context *ctx)
     mvt_code = feature_table_argmax(fv, ft, &max);
     mvt_type = movement_type(mvt_code);
     mvt_label = movement_label(mvt_code);
+
+    if(ctx->trace_mode){
+      index = word_get_index(word_buffer_b0(config_get_buffer(c)));
+      fprintf(stdout, "%d\t", index);
+      stack_print(stdout, c->st);
+      fprintf(stdout, "\t");
+
+      movement_print(stdout, mvt_code, ctx->dico_labels);
+      fprintf(stdout, "\t");
+      feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2);
+      printf("%f\n", max1 - max2);
+
 
+    }
 
     if(ctx->debug_mode){
       fprintf(stdout, "***********************************\n");
@@ -129,13 +116,14 @@ void simple_decoder_parser_arc_eager(context *ctx)
 
     if(result == 0){
       if(ctx->debug_mode){
-        fprintf(stdout, "WARNING : movement cannot be executed !\n");
+        fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n");
       }
       movement_shift(c, 1, max);
     }
   }
 
-  print_word_buffer(c, ctx->dico_labels);
+  if(!ctx->trace_mode)
+    print_word_buffer(c, ctx->dico_labels);
 
   config_free(c);
   feat_vec_free(fv);
diff --git a/perceptron/exec/CMakeLists.txt b/perceptron/exec/CMakeLists.txt
index ad4c79a4e79e6cfea0fea777521517242b628f9c..1a9f2501022b9399bd0b1631cf2d6deff831625f 100644
--- a/perceptron/exec/CMakeLists.txt
+++ b/perceptron/exec/CMakeLists.txt
@@ -5,10 +5,10 @@ target_link_libraries(perceptron_train perceptron)
 target_link_libraries(perceptron_train maca_common)
 install (TARGETS perceptron_train DESTINATION bin)
 
-#add_executable(maca_trans_parser_cff_cutoff cff_cutoff.c)
-#target_link_libraries(maca_trans_parser_cff_cutoff perceptron)
-#target_link_libraries(maca_trans_parser_cff_cutoff maca_common)
-#install (TARGETS maca_trans_parser_cff_cutoff DESTINATION bin)
+add_executable(cff_cutoff cff_cutoff.c)
+target_link_libraries(cff_cutoff perceptron)
+target_link_libraries(cff_cutoff maca_common)
+install (TARGETS cff_cutoff DESTINATION bin)
 
 
 add_executable(perceptron_eval perceptron_eval.c)
 target_link_libraries(perceptron_eval perceptron)
diff --git a/perceptron/exec/cff_cutoff.c b/perceptron/exec/cff_cutoff.c
index 916df6e35061bd319cf2f5fd0de3d8404cc32aad..029701b697e887715a99fdf1d47a67cbd908cabb 100644
--- a/perceptron/exec/cff_cutoff.c
+++ b/perceptron/exec/cff_cutoff.c
@@ -5,25 +5,117 @@
 #include<getopt.h>
 #include"feature_table.h"
 #include"dico.h"
+#include"util.h"
 #include"perceptron.h"
 #include"perceptron_context.h"
 #include"cf_file.h"
 
-void cff_cutoff_help_message(perceptron_context *ctx)
+/* options of the cff_cutoff program */
+typedef struct {
+  int help;
+  int verbose;
+  char *program_name;
+  char *cff_filename;
+  int cutoff;
+  char *vocabs_filename;
+  dico_vec *vocabs;
+  float hash_ratio;
+  dico *d_perceptron_features;
+
+} cff_cutoff_context;
+
+cff_cutoff_context *cff_cutoff_context_new(void)
+{
+  cff_cutoff_context *ctx = (cff_cutoff_context *)memalloc(sizeof(cff_cutoff_context));
+
+  /* help is read by cff_cutoff_check_options : give it a value here */
+  ctx->help = 0;
+  ctx->verbose = 0;
+  ctx->program_name = NULL;
+  ctx->vocabs_filename = NULL;
+  ctx->cff_filename = NULL;
+  ctx->cutoff = 1;
+  ctx->hash_ratio = 0.5;
+  ctx->vocabs = NULL;
+  ctx->d_perceptron_features = NULL;
+  return ctx;
+}
+
+/* free the context and the strings duplicated by cff_cutoff_read_options */
+void cff_cutoff_context_free(cff_cutoff_context *ctx)
+{
+  if(ctx->program_name) free(ctx->program_name);
+  if(ctx->cff_filename) free(ctx->cff_filename);
+  if(ctx->vocabs_filename) free(ctx->vocabs_filename);
+  free(ctx);
+}
+
+
+void cff_cutoff_help_message2(cff_cutoff_context *ctx)
+{
+  fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
+  fprintf(stderr, "Options:\n");
+  fprintf(stderr, "\t-h --help : print this message\n");
+  fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
+  fprintf(stderr, "\t-i --input <file> : cff file name\n");
+  fprintf(stderr, "\t-V --vocabs <file> : vocabs filename\n");
+  fprintf(stderr, "\t-c --cutoff <int> : threshold (features appearing less than the threshold are ignored)\n");
+}
+
+/* build a context from the command line (unknown options are ignored) */
+cff_cutoff_context *cff_cutoff_read_options(int argc, char *argv[])
 {
-  perceptron_context_help_message(ctx);
-  fprintf(stderr, "INPUT\n");
-  perceptron_context_cutoff_help_message(ctx);
+  int c;
+  int option_index = 0;
+  cff_cutoff_context *ctx = cff_cutoff_context_new();
+
+  ctx->program_name = strdup(argv[0]);
+
+  /* getopt_long finds the end of the array with an all zero entry */
+  static struct option long_options[] =
+    {
+      {"help", no_argument, 0, 'h'},
+      {"verbose", no_argument, 0, 'v'},
+      {"input", required_argument, 0, 'i'},
+      {"vocabs", required_argument, 0, 'V'},
+      {"cutoff", required_argument, 0, 'c'},
+      {0, 0, 0, 0}
+    };
+  optind = 0;
+  opterr = 0;
+
+
+  while ((c = getopt_long (argc, argv, "hvi:V:c:", long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'h':
+        ctx->help = 1;
+        break;
+      case 'v':
+        ctx->verbose = 1;
+        break;
+      case 'V':
+        ctx->vocabs_filename = strdup(optarg);
+        break;
+      case 'c':
+        ctx->cutoff = atoi(optarg);
+        break;
+      case 'i':
+        ctx->cff_filename = strdup(optarg);
+        break;
+      }
+  }
+  return ctx;
 }
 
 
-void cff_cutoff_check_options(context *ctx)
+void cff_cutoff_check_options(cff_cutoff_context *ctx)
 {
   if(ctx->help
      || !ctx->vocabs_filename
      || !ctx->cff_filename
      ){
-    cff_cutoff_help_message(ctx);
+    cff_cutoff_help_message2(ctx);
     exit(1);
   }
 }
@@ -44,9 +136,9 @@ int main(int argc, char *argv[])
   dico *old_d_feat;
   dico *new_d_feat;
 
-  context *ctx;
+  cff_cutoff_context *ctx;
 
-  ctx = context_read_options(argc, argv);
+  ctx = cff_cutoff_read_options(argc, argv);
   cff_cutoff_check_options(ctx);
 
   ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
@@ -59,7 +151,7 @@ int main(int argc, char *argv[])
   old2new = (int *)memalloc(n_feat * sizeof(int));
 
   for(i=0; i < n_feat; i++)
-    if(occ_table[i] < ctx->feature_cutoff)
+    if(occ_table[i] < ctx->cutoff)
       old2new[i] = -1;
     else
       old2new[i] = dico_add(new_d_feat, dico_int2string(old_d_feat, i));
@@ -86,25 +178,20 @@ int main(int argc, char *argv[])
 
 
   fprintf(stderr, "total number of features : %d\n", n_feat);
-  fprintf(stderr, "number of features removed : %d\n", feat_removed);
-  fprintf(stderr, "ratio : %.3f\n\n", (float)feat_removed / n_feat);
+  fprintf(stderr, "threshold : %d\n", ctx->cutoff);
+  fprintf(stderr, "after thresholding : %d\n", n_feat - feat_removed);
+  fprintf(stderr, "ratio : %.3f\n\n", (float)(n_feat - feat_removed) / n_feat);
 
   fprintf(stderr, "total number of feature occurrences : %d\n", f_occ);
-  fprintf(stderr, "feature occurrences removed : %d\n", occ_removed);
-  fprintf(stderr, "ratio : %.3f\n", (float)occ_removed / f_occ);
+  fprintf(stderr, "after thresholding : %d\n", f_occ - occ_removed);
+  fprintf(stderr, "ratio : %.3f\n", (float)(f_occ - occ_removed) / f_occ);
 
 
 
   dico_vec_replace_dico(ctx->vocabs, old_d_feat, new_d_feat);
   dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
-
-  /* dico_print(ctx->perceptron_features_filename, new_d_feat); */
-
-
   dico_free(new_d_feat);
   free(old2new);
-
-
-  context_free(ctx);
+  cff_cutoff_context_free(ctx);
   return 0;
 }
diff --git a/perceptron/lib/src/feature_table.c b/perceptron/lib/src/feature_table.c
index 8fb1aed148d74d015fa7f407c0811ddcc6d20f95..82c6be2c5d88bb2cb3e6f4525b0084f7870b37dc 100644
--- a/perceptron/lib/src/feature_table.c
+++ b/perceptron/lib/src/feature_table.c
@@ -230,6 +230,8 @@ float feature_table_entropy(feat_vec *fv, feature_table *ft)
   return entropy;
 }
 
+
+
 int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max)
 {
   float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));
@@ -266,6 +268,9 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max)
   return argmax;
 }
 
+/* fill an array (classes_score) with the scores of the different classes */
+/* for the feature vector fv */
+
 void feature_table_scores(feat_vec *fv, feature_table *ft, float *classes_score)
 {
   int cla;