diff --git a/maca_common/include/word.h b/maca_common/include/word.h index 14d3c05a0461313c47f163a434f58a6c6f0fe28c..c1eb982862c9c4991cfe902631d96ea9a5ff3fd2 100644 --- a/maca_common/include/word.h +++ b/maca_common/include/word.h @@ -53,7 +53,7 @@ #define word_set_gov(w, val) (w)->wf_array[MCD_WF_GOV] = (val) #define word_set_label(w, val) (w)->wf_array[MCD_WF_LABEL] = (val) #define word_set_stag(w, val) (w)->wf_array[MCD_WF_STAG] = (val) -#define word_set_word_seg(w) (w)->wf_array[MCD_WF_WORD_SEG] = (val) +#define word_set_sent_seg(w, val) (w)->wf_array[MCD_WF_SENT_SEG] = (val) #define word_set_A(w, val) (w)->wf_array[MCD_WF_A] = (val) #define word_set_B(w, val) (w)->wf_array[MCD_WF_B] = (val) #define word_set_C(w, val) (w)->wf_array[MCD_WF_C] = (val) diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index f7d18f5fd462b2fac4e450356e9ca1bdda4403dc..c5dcf448efae49799baaf604a09a780a70200dad 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -8,6 +8,7 @@ set(SOURCES src/context.c src/oracle_parser_arc_eager.c src/oracle_tagger.c src/simple_decoder_parser.c + src/simple_decoder_parser_arc_eager.c src/simple_decoder_forrest.c src/simple_decoder_tagger.c src/feat_lib.c @@ -46,6 +47,12 @@ target_link_libraries(maca_trans_parser_mcf2cff transparse) target_link_libraries(maca_trans_parser_mcf2cff maca_common) install (TARGETS maca_trans_parser_mcf2cff DESTINATION bin) +add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c) +target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron) +target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse) +target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common) +install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin) + add_executable(maca_trans_parser ./src/maca_trans_parser.c) target_link_libraries(maca_trans_parser perceptron) target_link_libraries(maca_trans_parser transparse) 
diff --git a/maca_trans_parser/src/maca_trans_parser.c b/maca_trans_parser/src/maca_trans_parser.c index ce7673d2ee4da49ab4d4ba2abac92b623faa77a5..f65c10dd4ee3ac23498ae6204d34bec9a4d8e427 100644 --- a/maca_trans_parser/src/maca_trans_parser.c +++ b/maca_trans_parser/src/maca_trans_parser.c @@ -6,6 +6,7 @@ #include"context.h" #include"movement_parser.h" #include"oracle_parser.h" +#include"oracle_parser_arc_eager.h" #include"feat_fct.h" #include"feature_table.h" #include"dico.h" diff --git a/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c new file mode 100644 index 0000000000000000000000000000000000000000..ae143cd277cd38eadedbd5aa2086de5b31c073a3 --- /dev/null +++ b/maca_trans_parser/src/maca_trans_parser_arc_eager_mcf2cff.c @@ -0,0 +1,179 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"movement_parser_arc_eager.h" +#include"oracle_parser_arc_eager.h" +#include"feat_fct.h" +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"word_emb.h" +#include"config2feat_vec.h" + +void maca_trans_parser_mcf2cff_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_vocabs_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_vocabs_help_message(ctx); + +} + +void maca_trans_parser_mcf2cff_check_options(context *ctx) +{ + if(!ctx->input_filename + || ctx->help + /* || !ctx->mcd_filename */ + /* || !(ctx->cff_filename || ctx->fann_filename) */ + ){ + maca_trans_parser_mcf2cff_help_message(ctx); + exit(1); + } +} + +void generate_training_file_stream(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + 
feat_vec *fv = feat_vec_new(feature_types_nb); + int sentence_nb = 0; + int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label); + word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct); + FILE *mcf_file = myfopen(ctx->input_filename, "r"); + int start_sentence_index = 0; + + /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */ + /* the idea is to ignore syntax in the mcf file that will be read */ + /* it is ugly !!! */ + + mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL); + + c = config_initial_no_dummy_word(mcf_file, mcd_struct_hyp, 5); + + while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){ + /*printf("************ REF ************\n"); + word_buffer_print(stdout, ref); + printf("*****************************\n");*/ + + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + + mvt_code = oracle_parser_arc_eager(c, ref, start_sentence_index, root_label); + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + /* config_print(stdout,c); */ + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ + + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + + if(mvt_type == MVT_EOS){ + movement_eos(c, 0); + sentence_nb++; + start_sentence_index = word_get_index(word_buffer_b0(config_get_buffer(c))) - 1; + + + /* config_print(stdout,c); */ + if(word_buffer_is_last(ref)) + break; + } + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + word_buffer_move_right(ref); + continue; + } + + if(mvt_type == MVT_REDUCE){ + movement_reduce(c, 0); + continue; + } + + if(mvt_type == MVT_ROOT){ + movement_root(c, 0, root_label); + continue; + } + + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 1, 0); + 
word_buffer_move_right(ref); + continue; + } + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + FILE *output_file; + + ctx = context_read_options(argc, argv); + maca_trans_parser_mcf2cff_check_options(ctx); + + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); + + if(ctx->mode == TRAIN_MODE){ + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); + } + else if(ctx->mode == TEST_MODE){ + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + } + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + + if(ctx->dico_labels == NULL){ + fprintf(stderr, "cannot find label names\n"); + return 1; + } + ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; + + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); + + + /* in train mode create feature dictionary for perceptron */ + if(ctx->mode == TRAIN_MODE) + ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); + + /* in test mode read feature dictionary for perceptron */ + if(ctx->mode == TEST_MODE) + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + /* add the feature dictionary to the dico vector */ + dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); + + /* open output file */ + output_file = (ctx->cff_filename) ? 
myfopen_no_exit(ctx->cff_filename, "w") : stdout; + + generate_training_file_stream(output_file, ctx); + + if(ctx->mode == TRAIN_MODE) + dico_vec_print(ctx->vocabs_filename, ctx->vocabs); + + if(ctx->cff_filename) + fclose(output_file); + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/maca_trans_parser_mcf2cff.c b/maca_trans_parser/src/maca_trans_parser_mcf2cff.c index cb3d6b76aa6dfe2cf8c64ef55d08aad25e7617b7..6676fbef7a0c593517ee143a6cad566334aae93f 100644 --- a/maca_trans_parser/src/maca_trans_parser_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_parser_mcf2cff.c @@ -3,8 +3,8 @@ #include<string.h> #include<unistd.h> #include<getopt.h> -#include"movement_parser_arc_eager.h" -#include"oracle_parser_arc_eager.h" +#include"movement_parser.h" +#include"oracle_parser.h" #include"feat_fct.h" #include"context.h" #include"feat_vec.h" @@ -12,7 +12,7 @@ #include"word_emb.h" #include"config2feat_vec.h" -void maca_trans_parser_mcf2cff_help_message(context *ctx) +void maca_trans_parser_conll2cff_help_message(context *ctx) { context_general_help_message(ctx); context_mode_help_message(ctx); @@ -30,14 +30,14 @@ void maca_trans_parser_mcf2cff_help_message(context *ctx) } -void maca_trans_parser_mcf2cff_check_options(context *ctx) +void maca_trans_parser_conll2cff_check_options(context *ctx) { if(!ctx->input_filename || ctx->help /* || !ctx->mcd_filename */ /* || !(ctx->cff_filename || ctx->fann_filename) */ ){ - maca_trans_parser_mcf2cff_help_message(ctx); + maca_trans_parser_conll2cff_help_message(ctx); exit(1); } } @@ -49,86 +49,122 @@ void generate_training_file_stream(FILE *output_file, context *ctx) char mvt_type; int mvt_label; feat_vec *fv = feat_vec_new(feature_types_nb); + sentence *ref = NULL; int sentence_nb = 0; + /* int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), (char *) ctx->root_label); */ int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label); - int eos_label = dico_string2int(ctx->dico_labels, 
"eos"); - word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct); - FILE *mcf_file = myfopen(ctx->input_filename, "r"); - int start_sentence_index = 0; - - /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */ - /* the idea is to ignore syntax in the mcf file that will be read */ - /* it is ugly !!! */ - - mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct); - mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV); - mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL); - - c = config_initial_no_dummy_word(mcf_file, mcd_struct_hyp, 5); + FILE *conll_file = myfopen(ctx->input_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); + + c = config_initial(conll_file, ctx->mcd_struct, 5); - while(!word_buffer_end(ref)){ - /*printf("************ REF ************\n"); - word_buffer_print(stdout, ref); - printf("*****************************\n");*/ - - config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - - /* feat_vec_print(stdout, fv); */ - - mvt_code = oracle_parser_arc_eager(c, ref, start_sentence_index, root_label); + while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ + /* sentence_print(stdout, ref, ctx->dico_labels); */ + while(1){ + /* config_print(stdout,c); */ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - mvt_type = movement_type(mvt_code); - mvt_label = movement_label(mvt_code); - - /* config_print(stdout,c); */ - /* movement_print(stdout, mvt_code, ctx->dico_labels); */ - - fprintf(output_file, "%d", mvt_code); - feat_vec_print(output_file, fv); - - if(mvt_type == MVT_EOS){ - /* printf("************BEFORE *****************\n"); */ - /* config_print(stdout,c); */ - - movement_eos(c, 0); + /* feat_vec_print(stdout, fv); */ + + mvt_code = oracle_parser(c, ref); + + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); - /* 
printf("************AFTER*****************\n"); */ - /* config_print(stdout,c); */ - start_sentence_index = word_get_index(word_buffer_b0(config_get_buffer(c))) - 1; - /* printf("%d\n", start_sentence_index); */ + /* printf("mvt code = %d\n", mvt_code); */ + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ + + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + + if(queue_is_empty(c->bf)) break; - if(word_buffer_is_last(ref)){ - /* printf("it is the end\n"); */ + if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */ + + /* create the root arc */ + movement_right_arc(c, mvt_label, 0); + + /* shift dummy word in stack */ + movement_shift(c, 1, 0); + + /* printf("sentence complete config : "); + config_print(stdout,c); */ + + /* empty depset */ + depset_free(c->ds); + c->ds = depset_new(); + sentence_free(ref); + sentence_nb++; + + c->current_index = queue_renumber_words(c->bf); + break; } + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 1, 0); + continue; + } } - - if(mvt_type == MVT_LEFT){ - movement_left_arc(c, mvt_label, 0); - continue; - } - - if(mvt_type == MVT_RIGHT){ - movement_right_arc(c, mvt_label, 0); - word_buffer_move_right(ref); - continue; - } - - if(mvt_type == MVT_REDUCE){ - movement_reduce(c, 0); - continue; - } - - if(mvt_type == MVT_ROOT){ - movement_root(c, 0, root_label); - continue; - } + } +} + +void generate_training_file_buffer(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + feat_vec *fv = feat_vec_new(feature_types_nb); + sentence *ref = NULL; + int sentence_nb = 0; + FILE *conll_file = myfopen(ctx->input_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); - if(mvt_type == MVT_SHIFT){ - movement_shift(c, 1, 0); - 
word_buffer_move_right(ref); - continue; + c = config_initial(conll_file, ctx->mcd_struct, 0); + + while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ + /* sentence_print(stdout, ref, NULL); */ + queue_read_sentence(c->bf, conll_file, ctx->mcd_struct); + while(!config_is_terminal(c)){ + /* config_print(stdout,c); */ + + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + + mvt_code = oracle_parser(c, ref); + + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ + + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 0, 0); + continue; + } } + config_free(c); + c = config_initial(conll_file, ctx->mcd_struct, 0); + sentence_nb++; } } @@ -138,7 +174,7 @@ int main(int argc, char *argv[]) FILE *output_file; ctx = context_read_options(argc, argv); - maca_trans_parser_mcf2cff_check_options(ctx); + maca_trans_parser_conll2cff_check_options(ctx); ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); @@ -182,18 +218,12 @@ int main(int argc, char *argv[]) else output_file = stdout; - - generate_training_file_stream(output_file, ctx); -#if 0 -======= if(ctx->stream_mode){ generate_training_file_stream(output_file, ctx); } else{ generate_training_file_buffer(output_file, ctx); } ->>>>>>> master -#endif if(ctx->mode == TRAIN_MODE){ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ dico_vec_print(ctx->vocabs_filename, ctx->vocabs); diff --git a/maca_trans_parser/src/movement_parser_arc_eager.c b/maca_trans_parser/src/movement_parser_arc_eager.c index 
df00df79c079bbc30fff53b65c95b26c97457c93..3941705e5e0ab38c70aac18847384dec3060f1e8 100644 --- a/maca_trans_parser/src/movement_parser_arc_eager.c +++ b/maca_trans_parser/src/movement_parser_arc_eager.c @@ -8,10 +8,11 @@ void movement_print(FILE *f, int mvt_code, dico *dico_labels){ int mvt_type = movement_type(mvt_code); int mvt_label = movement_label(mvt_code); char *label; - if(mvt_type == MVT_SHIFT) {fprintf(f, "SHIFT\n"); return;} + if(mvt_type == MVT_SHIFT) {fprintf(f, "SHIFT\n"); return;} if(mvt_type == MVT_REDUCE) {fprintf(f, "REDUCE\n"); return;} - if(mvt_type == MVT_ROOT) {fprintf(f, "ROOT\n"); return;} - if(mvt_type == MVT_RIGHT) fprintf(f, "RIGHT"); + if(mvt_type == MVT_ROOT) {fprintf(f, "ROOT\n"); return;} + if(mvt_type == MVT_EOS) {fprintf(f, "EOS\n"); return;} + if(mvt_type == MVT_RIGHT) fprintf(f, "RIGHT"); else fprintf(f, "LEFT"); label = dico_int2string(dico_labels, mvt_label); fprintf(f, " %s\n", label); @@ -19,20 +20,20 @@ void movement_print(FILE *f, int mvt_code, dico *dico_labels){ int movement_type(int mvt) { - if(mvt == 0) return MVT_SHIFT; /* 0 is the code of shift */ - if(mvt == 1) return MVT_REDUCE; /* 1 is the code of reduce */ - if(mvt == 2) return MVT_ROOT; /* 2 is the code of root */ - if(mvt == 3) return MVT_EOS; /* 2 is the code of root */ - if(mvt % 2 == 0) return MVT_LEFT; /* even movements are left movements */ - return MVT_RIGHT; /* odd movements are right movements */ + if(mvt == MVT_SHIFT) return MVT_SHIFT; /* 0 is the code of shift */ + if(mvt == MVT_REDUCE) return MVT_REDUCE; /* 1 is the code of reduce */ + if(mvt == MVT_ROOT) return MVT_ROOT; /* 2 is the code of root */ + if(mvt == MVT_EOS) return MVT_EOS; /* 3 is the code of eos */ + if(mvt % 2 == 0) return MVT_LEFT; /* even movements are left movements */ + return MVT_RIGHT; /* odd movements are right movements */ } int movement_label(int mvt) { - if(mvt == 0) return -1; /* 0 is the code of shift */ - if(mvt == 1) return -1; /* 1 is the code of reduce */ - if(mvt == 2) 
return -1; /* 2 is the code of root */ - if(mvt == 3) return -1; /* 3 is the code of eos */ + if(mvt == MVT_SHIFT) return -1; /* 0 is the code of shift */ + if(mvt == MVT_REDUCE) return -1; /* 1 is the code of reduce */ + if(mvt == MVT_ROOT) return -1; /* 2 is the code of root */ + if(mvt == MVT_EOS) return -1; /* 3 is the code of eos */ if(mvt % 2 == 0) /* even codes correspond to left movements */ return mvt / 2 - 2; return (mvt - 1) / 2 - 2; /* odd codes correspond to right movements */ @@ -40,11 +41,18 @@ int movement_label(int mvt) int movement_eos(config *c, float score) { + if(stack_is_empty(config_get_stack(c))) return 1; + + /* word on the top of the stack is sent_seg */ + word_set_sent_seg(stack_top(config_get_stack(c)), 1); + /* perform all pending reduce */ while(movement_reduce(c,0)); /* remove root from stack */ stack_pop(config_get_stack(c)); + + config_add_mvt(c, MVT_EOS); return 1; } @@ -82,7 +90,6 @@ int movement_right_arc(config *c, int label, float score) /* printf("create right arc %d -> %d dist = %d\n", word_get_index(gov), word_get_index(dep), dist); */ - /* create a new dependency */ word_set_gov(dep, dist); word_set_label(dep, label); @@ -118,8 +125,6 @@ int movement_root(config *c, float score, int root_code) word *b0 = word_buffer_b0(config_get_buffer(c)); word_set_gov(b0, 0); word_set_label(b0, root_code); - /* stack_push(config_get_stack(c), b0); */ - /* word_buffer_move_right(config_get_buffer(c)); */ config_add_mvt(c, MVT_ROOT); return 1; } diff --git a/maca_trans_parser/src/simple_decoder_parser.c b/maca_trans_parser/src/simple_decoder_parser.c index b6f8888666cec3860b39522a97577639540f6b0c..f803a8105a6ee189c14249abe2c010ee1ff6c61b 100644 --- a/maca_trans_parser/src/simple_decoder_parser.c +++ b/maca_trans_parser/src/simple_decoder_parser.c @@ -4,12 +4,55 @@ #include<unistd.h> #include<getopt.h> #include"context.h" -#include"movement_parser_arc_eager.h" +#include"movement_parser.h" +#include"oracle_parser.h" #include"feat_fct.h" 
#include"config2feat_vec.h" #include"feature_table.h" #include"dico.h" +void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_label) +{ + int mvt_code; + int mvt_type; + int mvt_label; + float max; + feat_vec *fv = feat_vec_new(feature_types_nb); + config *c = config_initial(f, ctx->mcd_struct, 0); + + /* read a sentence and put it in the buffer */ + while(queue_read_sentence(c->bf, f, ctx->mcd_struct) > 1){ + while(!config_is_terminal(c)){ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + mvt_code = feature_table_argmax(fv, ft, &max); + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + if(mvt_type == MVT_LEFT) + if(movement_left_arc(c, mvt_label, max)) + continue; + + if(mvt_type == MVT_RIGHT) + if(movement_right_arc(c, mvt_label, max)) + continue; + + movement_shift(c, 0, max); + } + + /* config_print(stdout, c); */ + + config_connect_subtrees(c, root_label); + depset_print2(stdout, c->ds, ctx->dico_labels); + + + /* config_free(c); */ + c = config_initial(f, ctx->mcd_struct, 0); + } + + feat_vec_free(fv); +} + + void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_label) { int mvt_code; @@ -18,68 +61,70 @@ void simple_decoder_stream(context *ctx, FILE *f, feature_table *ft, int root_la float max; feat_vec *fv = feat_vec_new(feature_types_nb); config *c = NULL; - word *dep; - - c = config_initial_no_dummy_word(f, ctx->mcd_struct, 5); - while(1){ + + c = config_initial(f, ctx->mcd_struct, 5); + while(!config_is_terminal(c)){ + /* config_print(stdout, c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + /* feat_vec_print(stdout, fv); */ mvt_code = feature_table_argmax(fv, ft, &max); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); - /* config_print(stdout, c); */ - /* movement_print(stdout, mvt_code, ctx->dico_labels); */ - - if(mvt_type == MVT_LEFT){ - dep = 
stack_s0(config_get_stack(c)); - if(movement_left_arc(c, mvt_label, max)){ - /* printf("%d\t", word_get_index(dep)); - printf("%s\t", word_get_input(dep)); - printf("%d\t", word_get_gov(dep)); - printf("%s\n", dico_int2string(ctx->dico_labels, word_get_label(dep)));*/ - continue; - } - } + /* printf("code predicted = %d\n", mvt_code); */ + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ - if(mvt_type == MVT_RIGHT){ - dep = word_buffer_b0(config_get_buffer(c)); - if(movement_right_arc(c, mvt_label, max)){ - /* printf("%d\t", word_get_index(dep)); - printf("%s\t", word_get_input(dep)); - printf("%d\t", word_get_gov(dep)); - printf("%s\n", dico_int2string(ctx->dico_labels, word_get_label(dep)));*/ - continue; - } + /* sentence is complete */ + if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ + /* if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ */ + /* if(mvt_label == root_label){ */ + /* printf("sentence complete\n"); */ + /*config_print(stdout, c); */ + + /* create the root arc */ + movement_right_arc(c, mvt_label, 0); + + /* shift dummy word in stack */ + movement_shift(c, 1, 0); + + /* config_print(stdout, c); */ + + /* config_connect_subtrees(c, root_label); */ + /* depset_print_new_index(stdout, c->ds, ctx->dico_labels); */ + + depset_print2(stdout, c->ds, ctx->dico_labels); + + /* pop the dummy word */ + stack_pop(c->st); + /* replace it with a fresh one */ + stack_push(c->st, word_create_dummy(ctx->mcd_struct)); + + /* empty depset */ + depset_free(c->ds); + c->ds = depset_new(); + + /* renumber the words that are left in the buffer */ + c->current_index = queue_renumber_words(c->bf); + continue; } + + if(mvt_type == MVT_LEFT) + if(movement_left_arc(c, mvt_label, max)) + continue; - if(mvt_type == MVT_REDUCE) - if(movement_reduce(c, max)) + if(mvt_type == MVT_RIGHT) + if(movement_right_arc(c, mvt_label, max)) continue; - - if(mvt_type == MVT_ROOT) - if(movement_root(c, max, root_label)) - continue; 
movement_shift(c, 1, max); - - if(word_buffer_is_last(config_get_buffer(c))) break; } + + /* config_print(stdout, c); */ + + /* config_connect_subtrees(c, root_label); */ + + depset_print2(stdout, c->ds, ctx->dico_labels); - for(int i=0; i < config_get_buffer(c)->nbelem; i++){ - dep = word_buffer_get_word_n(config_get_buffer(c), i); - printf("%s\t", word_get_input(dep)); - printf("%d\t", word_get_gov(dep)); - /* printf("label = %d\n", word_get_label(dep)); */ - char *label = (word_get_label(dep) == -1)? NULL : dico_int2string(ctx->dico_labels, word_get_label(dep)); - if(label != NULL) - printf("%s\t", label) ; - else - printf("_\t"); - if((label != NULL) && !strcmp(label, "eos")) - printf("1\n"); - else - printf("0\n"); - } /* config_free(c); */ feat_vec_free(fv); @@ -95,10 +140,14 @@ void simple_decoder(context *ctx) root_label = dico_string2int(ctx->dico_labels, ctx->root_label); if(root_label == -1) root_label = 0; - simple_decoder_stream(ctx, f, ft, root_label); + if(ctx->stream_mode) + simple_decoder_stream(ctx, f, ft, root_label); + else + simple_decoder_buffer(ctx, f, ft, root_label); feature_table_free(ft); if(ctx->input_filename) fclose(f); } + diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager.c b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c new file mode 100644 index 0000000000000000000000000000000000000000..1be8bbf27129f9e0d3e51352888fcaadb81d2429 --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager.c @@ -0,0 +1,109 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"movement_parser_arc_eager.h" +#include"feat_fct.h" +#include"config2feat_vec.h" +#include"feature_table.h" +#include"dico.h" + +void print_word_buffer(config *c, dico *dico_labels) +{ + int i; + word *dep; + char *label; + for(i=0; i < config_get_buffer(c)->nbelem; i++){ + dep = word_buffer_get_word_n(config_get_buffer(c), i); + printf("%s\t", 
word_get_input(dep)); + printf("%d\t", word_get_gov(dep)); + label = (word_get_label(dep) == -1)? NULL : dico_int2string(dico_labels, word_get_label(dep)); + if(label != NULL) + printf("%s\t", label) ; + else + printf("_\t"); + if((label != NULL) && !strcmp(label, "eos")) + printf("1\n"); + else + printf("0\n"); + } +} + + +void simple_decoder_parser_arc_eager(context *ctx) +{ + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int root_label; + int mvt_code; + int mvt_type; + int mvt_label; + float max; + feat_vec *fv = feat_vec_new(feature_types_nb); + config *c = NULL; + + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); + if(root_label == -1) root_label = 0; + + c = config_initial_no_dummy_word(f, ctx->mcd_struct, 5); + while(1){ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + mvt_code = feature_table_argmax(fv, ft, &max); + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + config_print(stdout, c); + movement_print(stdout, mvt_code, ctx->dico_labels); + + if(mvt_type == MVT_LEFT) + if(movement_left_arc(c, mvt_label, max)){ + if(word_buffer_is_last(config_get_buffer(c))) + break; + continue; + } + + if(mvt_type == MVT_RIGHT) + if(movement_right_arc(c, mvt_label, max)){ + if(word_buffer_is_last(config_get_buffer(c))) + break; + continue; + } + + if(mvt_type == MVT_REDUCE) + if(movement_reduce(c, max)){ + if(word_buffer_is_last(config_get_buffer(c))) + break; + continue; + } + + if(mvt_type == MVT_ROOT) + if(movement_root(c, max, root_label)){ + if(word_buffer_is_last(config_get_buffer(c))) + break; + continue; + } + + /* if(mvt_type == MVT_EOS) + if(movement_eos(c, max)){ + if(word_buffer_is_last(config_get_buffer(c))) + break; + continue; + } + */ + if(word_buffer_is_last(config_get_buffer(c))) + break; + + movement_shift(c, 1, max); + + } + + print_word_buffer(c, 
ctx->dico_labels); + + /* config_free(c); */ + feat_vec_free(fv); + feature_table_free(ft); + if(ctx->input_filename) + fclose(f); +} diff --git a/maca_trans_parser/src/simple_decoder_parser_arc_eager.h b/maca_trans_parser/src/simple_decoder_parser_arc_eager.h new file mode 100644 index 0000000000000000000000000000000000000000..dd22abba8993c8a9f8298511895bf9e7e0fbe62e --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_parser_arc_eager.h @@ -0,0 +1,7 @@ +#ifndef __SIMPLE_DECODER_PARSER_ARC_EAGER__ +#define __SIMPLE_DECODER_PARSER_ARC_EAGER__ +#include"context.h" + +void simple_decoder_parser_arc_eager(context *ctx); + +#endif