diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ebd0e07bed36d276dad909ced25fa57f20b43db..bd2ed1c48dec24215fc2d5d5afab5ca11ea4884e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,5 +12,6 @@ add_subdirectory(perceptron) #add_subdirectory(maca_lemmatizer) add_subdirectory(maca_trans_parser) add_subdirectory(maca_crf_tagger) +add_subdirectory(maca_graph_parser) #set(CMAKE_INSTALL_PREFIX ../) diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 2b199d3f393cfc5dae84fe48956876d757d9b626..a64e8f8fb8c9ae4e1c0bae0f1755e4d83862a4c4 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -15,6 +15,7 @@ set(SOURCES src/context.c src/simple_decoder_tagparser_arc_eager.c # src/simple_decoder_forrest.c src/simple_decoder_tagger.c + src/simple_decoder_tagger_bt.c src/feat_lib.c src/stack.c src/config2feat_vec.c @@ -41,6 +42,12 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse) target_link_libraries(maca_trans_tagger_mcf2cff maca_common) install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin) +add_executable(maca_trans_tagger_mcf2cff_bt ./src/maca_trans_tagger_mcf2cff_bt.c) +target_link_libraries(maca_trans_tagger_mcf2cff_bt perceptron) +target_link_libraries(maca_trans_tagger_mcf2cff_bt transparse) +target_link_libraries(maca_trans_tagger_mcf2cff_bt maca_common) +install (TARGETS maca_trans_tagger_mcf2cff_bt DESTINATION bin) + #add_executable(maca_trans_parser_mcf2fann ./src/maca_trans_parser_mcf2fann.c) #target_link_libraries(maca_trans_parser_mcf2fann perceptron) #target_link_libraries(maca_trans_parser_mcf2fann transparse) diff --git a/maca_trans_parser/src/config.c b/maca_trans_parser/src/config.c index d4db0fe73a86d1d1ac8955152bd55e2d9806480b..da423cd7adba8920cde8437ab604eed9e4fdba39 100644 --- a/maca_trans_parser/src/config.c +++ b/maca_trans_parser/src/config.c @@ -11,6 +11,8 @@ config *config_new(FILE *f, mcd *mcd_struct, int lookahead) c->st = stack_new(); c->bf = 
word_buffer_new(f, mcd_struct, lookahead);
   c->history = mvt_stack_new();
+  c->mvt_chosen = 0; /* rank of the class last applied from vcode_array; 0 until another rank is chosen */
+  c->vcode_array = NULL; /* no classifier output attached yet */
   return c;
 }
diff --git a/maca_trans_parser/src/config.h b/maca_trans_parser/src/config.h
index e2075925265642efe2612b14ec5de129812ced03..c2ad6b81d74b98cc1645455160dc0792948babf2 100644
--- a/maca_trans_parser/src/config.h
+++ b/maca_trans_parser/src/config.h
@@ -8,6 +8,7 @@
 #include"mcd.h"
 #include"word_buffer.h"
 #include"mvt_stack.h"
+#include"feature_table.h"
 #define config_get_stack(c) (c)->st
 #define config_get_buffer(c) (c)->bf
@@ -17,6 +18,8 @@ typedef struct {
   stack *st; /* the stack */
   word_buffer *bf; /* the buffer */
   mvt_stack *history; /* movement sequence that led to this configuration */
+  int mvt_chosen; /* rank, in vcode_array, of the class applied by the last forward()/choice_n() */
+  vcode *vcode_array; /* scored classes from feature_table_get_vcode_array(), or NULL; released with free() by its users */
 } config;
 config *config_new(FILE *f, mcd *mcd_struct, int lookahead);
diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c
index 2a24b2e5a90ba3b14315cb2e405f687a4be11c90..392fece07f8219134749ce759ad6e9de09e96ae3 100644
--- a/maca_trans_parser/src/feat_fct.c
+++ b/maca_trans_parser/src/feat_fct.c
@@ -156,6 +156,16 @@ int b0g(config *c) {return (word_get_gov(word_buffer_b0(config_get_buffer(c))) =
 int b0sf(config *c) {return word_get_label(word_buffer_b0(config_get_buffer(c)));}
+/* b0len: byte length, capped at 7, of the input text of the current buffer word b0; 0 when b0 is missing or has no input text. */
+/* NOTE(review): w->input looks like the whole tab-separated input record (print_word2 splits it on tabs), not only the form - confirm. */
+int b0len(config *c) {
+  int len = 0;
+  word *w = word_buffer_b0(config_get_buffer(c));
+  if(w && w->input) /* do not dereference a missing current word */
+    len = (int)strlen(w->input);
+  return (len > 7)? 7 : len;
+}
+
 int b0f(config *c) {return word_get_form(word_buffer_b0(config_get_buffer(c)));}
 int b0l(config *c) {return word_get_lemma(word_buffer_b0(config_get_buffer(c)));}
 int b0c(config *c) {return word_get_cpos(word_buffer_b0(config_get_buffer(c)));}
@@ -628,3 +638,50 @@ int t4(config *c) /* previous transition */
   mvt *m = mvt_stack_3(config_get_history(c));
   return (m == NULL)? 
-1 : mvt_get_type(m);
 }
+
+
+
+int mvt0(config *c) /* class code stored at rank 0 of c->vcode_array, or -1 when no array is attached */
+{
+  if(c->vcode_array == NULL) return -1;
+  return c->vcode_array[0].class_code;
+}
+
+int mvt1(config *c) /* class code at rank 1, or -1 when no array is attached; assumes the array has an entry 1 - TODO confirm */
+{
+  if(c->vcode_array == NULL) return -1;
+  return c->vcode_array[1].class_code;
+}
+
+int delta1(config *c) /* score of rank 0 minus score of rank 1, truncated to int and capped at 10; -1 when no array is attached */
+{
+  if(c->vcode_array == NULL) return -1;
+  int delta = (int) (c->vcode_array[0].score - c->vcode_array[1].score);
+  return (delta >= 10)? 10: delta;
+}
+
+int mvt2(config *c) /* class code at rank 2, or -1 when no array is attached; assumes the array has an entry 2 - TODO confirm */
+{
+  if(c->vcode_array == NULL) return -1;
+  return c->vcode_array[2].class_code;
+}
+
+int delta2(config *c) /* score of rank 0 minus score of rank 2, truncated to int and capped at 10; -1 when no array is attached */
+{
+  if(c->vcode_array == NULL) return -1;
+  int delta = (int) (c->vcode_array[0].score - c->vcode_array[2].score);
+  return (delta >= 10)? 10: delta;
+}
+
+int mvt3(config *c) /* class code at rank 3, or -1 when no array is attached; assumes the array has an entry 3 - TODO confirm */
+{
+  if(c->vcode_array == NULL) return -1;
+  return c->vcode_array[3].class_code;
+}
+
+int delta3(config *c) /* score of rank 0 minus score of rank 3, truncated to int and capped at 10; -1 when no array is attached */
+{
+  if(c->vcode_array == NULL) return -1;
+  int delta = (int) (c->vcode_array[0].score - c->vcode_array[3].score);
+  return (delta >= 10)? 10: delta;
+}
diff --git a/maca_trans_parser/src/feat_fct.h b/maca_trans_parser/src/feat_fct.h
index 457232b42e4d4f08cba39710d151faff03396e1e..6d8cca86fbe9df9d6fe7e1e55ec214c1758f5c60 100644
--- a/maca_trans_parser/src/feat_fct.h
+++ b/maca_trans_parser/src/feat_fct.h
@@ -157,6 +157,7 @@ int s3r(config *c);
 int b0g(config *c);
 int b0sf(config *c);
+int b0len(config *c);
 int b0f(config *c);
@@ -443,4 +444,14 @@ int t3(config *c);
 int t4(config *c);
+
+
+int mvt0(config *c); /* feature functions reading c->vcode_array (class code at a rank / score gap to rank 0) */
+int mvt1(config *c);
+int delta1(config *c);
+int mvt2(config *c);
+int delta2(config *c);
+int mvt3(config *c);
+int delta3(config *c);
+
 #endif
diff --git a/maca_trans_parser/src/feat_lib.c b/maca_trans_parser/src/feat_lib.c
index e76cc2ab249b0db71ad1d34553ab084f9400b857..83a0d748b7b23949637cc6f739c002dc28b1fcdb 100644
--- a/maca_trans_parser/src/feat_lib.c
+++ b/maca_trans_parser/src/feat_lib.c
@@ -180,6 +180,8 @@ feat_lib *feat_lib_build(void)
 feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b0g", b0g);
 feat_lib_add(fl, 
FEAT_TYPE_FORM, (char *)"b0sf", b0sf); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0len", b0len); + feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b0f", b0f); feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"b0l", b0l); feat_lib_add(fl, FEAT_TYPE_CPOS, (char *)"b0c", b0c); @@ -458,6 +460,16 @@ feat_lib *feat_lib_build(void) feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t3", t3); feat_lib_add(fl, FEAT_TYPE_TRANS, (char *)"t4", t4); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"mvt0", mvt0); + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"mvt1", mvt1); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"delta1", delta1); + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"mvt2", mvt2); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"delta2", delta2); + + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"mvt3", mvt3); + feat_lib_add(fl, FEAT_TYPE_INT, (char *)"delta3", delta3); return fl; } diff --git a/maca_trans_parser/src/maca_trans_tagger.c b/maca_trans_parser/src/maca_trans_tagger.c index 0bec1311ee5d70b0749ae260a9f80d83d46bf8fb..fb78f9cefa2e59934e95f5c84a953b9ae9112de0 100644 --- a/maca_trans_parser/src/maca_trans_tagger.c +++ b/maca_trans_parser/src/maca_trans_tagger.c @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); if(ctx->beam_width == 1) - simple_decoder_tagger(ctx); + simple_decoder_tagger2(ctx); context_free(ctx); return 0; diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c index 01342e6628ac3ffaf1fc2ff9c1d9911e9cf1e1c2..916863ba90807f16dc82b52cc9e6a183393b8e1a 100644 --- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c +++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff.c @@ -95,17 +95,16 @@ void generate_training_file(FILE *output_file, context *ctx) c = config_new(conll_file, ctx->mcd_struct, 5); - while(!config_is_terminal(c)){ - /* config_print(stdout,c); */ + while(!config_is_terminal(c)){ if(ctx->f2p) 
/*add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */ - add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); + add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); - postag = oracle_tagger(c, NULL); + postag = oracle_tagger(c); + fprintf(output_file, "%d", postag); feat_vec_print(output_file, fv); - int res = movement_tagger(c, postag); - if(res == 0) break; + movement_tagger(c, postag); } }
diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff_bt.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff_bt.c
new file mode 100644
index 0000000000000000000000000000000000000000..68eb95f7af388637b924ee40f99d300f4812c9e0
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff_bt.c
@@ -0,0 +1,221 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"movement_tagger.h"
+#include"oracle_tagger.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"word_emb.h"
+#include"config2feat_vec.h"
+
+/* Give a signature to every word of bf that has none yet, scanning from the last word
+   backwards and stopping at the first word that already has one. The form is looked up in
+   f2p as is, then lower-cased if that lookup returns -1. */
+void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
+{
+  int i;
+  word *w;
+  char lower_form[100];
+
+  for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){
+    w = word_buffer_get_word_n(bf, i);
+    if(word_get_signature(w) != -1) break;
+    w->signature = form2pos_get_signature(f2p, w->form);
+    if(w->signature == -1){
+      snprintf(lower_form, sizeof lower_form, "%s", w->form); /* bounded copy: a form of 100 bytes or more used to overflow lower_form */
+      to_lower_string(lower_form);
+      w->signature = form2pos_get_signature(f2p, lower_form);
+    }
+  }
+}
+
+/* Print the usage message of this program on stderr. */
+void maca_trans_parser_mcf2cff_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+  context_mcd_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_vocabs_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_vocabs_help_message(ctx);
+
+
+}
+
+/* Print the usage message and exit(1) when help is requested, or when the input file name or both output names (cff, fann) are missing. */
+void maca_trans_parser_mcf2cff_check_options(context *ctx)
+{
+  if(!ctx->input_filename
+     || ctx->help
+     /* || !ctx->mcd_filename */
+     || !(ctx->cff_filename || ctx->fann_filename)
+     ){
+    maca_trans_parser_mcf2cff_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* Write the training examples of the "correction" classifier to output_file.
+   Two configurations read the same input in parallel: config_oracle is advanced with the
+   tags found in the input, config_predicted with the first choice of a pre-trained local
+   tagger. For each word one example is printed with class 0 when that first choice has the
+   same tag name as the input tag, or with class k (k = 1 or 2) when the k-th ranked choice
+   has; nothing is printed for the word otherwise.
+   NOTE(review): the local model files are read from hard-coded absolute paths.
+   NOTE(review): fv, both configs, both input streams and the local model are never released. */
+void generate_training_file(FILE *output_file, context *ctx)
+{
+  config *config_oracle;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *conll_file = myfopen(ctx->input_filename, "r");
+  FILE *conll_file_predicted = myfopen(ctx->input_filename, "r");
+  int postag_oracle;
+  dico *dico_pos_oracle = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
+
+  feat_model *local_feat_model = feat_model_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.fm", ctx->verbose);
+  dico_vec *local_dico_vec = dico_vec_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.vocab", ctx->hash_ratio);
+  dico *dico_pos_local = dico_vec_get_dico(local_dico_vec, (char *)"POS");
+  feature_table *local_ft = feature_table_load("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.model", ctx->verbose);
+  dico *local_perceptron_features = dico_vec_get_dico(local_dico_vec, (char *)"d_perceptron_features");
+  config *config_predicted;
+  int postag_predicted;
+  int i;
+  char *postag_oracle_string;
+  char *postag_predicted_string;
+
+  config_predicted = config_new(conll_file_predicted, ctx->mcd_struct, 5);
+  config_oracle = config_new(conll_file, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(config_oracle)){
+    if(ctx->f2p){
+      add_signature_to_words_in_word_buffer(config_oracle->bf, ctx->f2p);
+      add_signature_to_words_in_word_buffer(config_predicted->bf, ctx->f2p);
+    }
+
+    postag_oracle = word_get_pos(word_buffer_b0(config_get_buffer(config_oracle)));
+    postag_oracle_string = dico_int2string(dico_pos_oracle, postag_oracle);
+
+    config2feat_vec_cff(local_feat_model, config_predicted, local_perceptron_features, fv, ctx->mode);
+
+    if(config_predicted->vcode_array)
+      free(config_predicted->vcode_array);
+    config_predicted->vcode_array = feature_table_get_vcode_array(fv, local_ft);
+
+    postag_predicted = config_predicted->vcode_array[0].class_code;
+    postag_predicted_string = dico_int2string(dico_pos_local, postag_predicted);
+
+    if(ctx->debug_mode){
+      if(strcmp(postag_oracle_string, postag_predicted_string)){
+        fprintf(stdout, "**************** DIFFERENTS ***********\n");
+        fprintf(stdout, "%s\n", word_get_input(word_buffer_b0(config_get_buffer(config_oracle))));
+      }
+    }
+
+    forward(config_predicted, postag_predicted);
+    forward(config_oracle, postag_oracle);
+
+    if(!strcmp(postag_oracle_string, postag_predicted_string)){
+      fprintf(output_file, "0");
+      config2feat_vec_cff(ctx->features_model, config_predicted, ctx->d_perceptron_features, fv, ctx->mode);
+      feat_vec_print(output_file, fv);
+    }
+
+    int choice = 1;
+    while(strcmp(postag_oracle_string, postag_predicted_string) && (choice < 3)){
+
+      if(ctx->debug_mode){
+        fprintf(stdout, "%d postag oracle = %s postag predicted = %s\n",
+                word_buffer_get_current_index(config_get_buffer(config_oracle)),
+                dico_int2string(dico_pos_oracle, postag_oracle),
+                dico_int2string(dico_pos_local, postag_predicted));
+
+        for(i=0; i < 5 && config_predicted->vcode_array[i].class_code != -1; i++) /* stop at the class_code == -1 end marker instead of reading past a short array */
+          fprintf(stdout, "%d\t%s\t%.4f\t%.4f\n", i,
+                  dico_int2string(dico_pos_local, config_predicted->vcode_array[i].class_code),
+                  config_predicted->vcode_array[i].score,
+                  config_predicted->vcode_array[i].score - config_predicted->vcode_array[0].score);
+        fprintf(stdout, "CHOICE %d\n", choice);
+      }
+      postag_predicted = config_predicted->vcode_array[choice].class_code;
+      postag_predicted_string = dico_int2string(dico_pos_local, postag_predicted);
+
+      if(!strcmp(postag_predicted_string, postag_oracle_string)){
+        if(ctx->debug_mode){
+          printf("GOOD CHOICE\n");
+        }
+        fprintf(output_file, "%d", choice);
+        config2feat_vec_cff(ctx->features_model, config_predicted, ctx->d_perceptron_features, fv, ctx->mode);
+        feat_vec_print(output_file, fv);
+        choice_n(config_predicted, choice);
+      }
+      choice++;
+    }
+  }
+}
+
+/* Entry point: read the options, build (train mode) or load (test mode) the vocabularies, write the training file. */
+int main(int argc, char *argv[])
+{
+  context *ctx;
+  FILE *output_file;
+
+  ctx = context_read_options(argc, argv);
+  maca_trans_parser_mcf2cff_check_options(ctx);
+
+  ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
+
+  if(ctx->mode == TRAIN_MODE){
+    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
+    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
+  }
+  else if(ctx->mode == TEST_MODE){
+    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+  }
+
+  feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);
+
+  /* in train mode create feature dictionary for perceptron */
+  if(ctx->mode == TRAIN_MODE)
+    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
+
+  /* in test mode read feature dictionary for perceptron */
+  if(ctx->mode == TEST_MODE)
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  /* add the feature dictionary to the dico vector */
+  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
+
+  /* open output file */
+  if(ctx->cff_filename)
+    output_file = myfopen(ctx->cff_filename, "w");
+  else
+    output_file = stdout;
+
+  generate_training_file(output_file, ctx);
+
+  if(ctx->mode == TRAIN_MODE){
+    /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
+    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
+
+  }
+
+  if(ctx->cff_filename)
+    fclose(output_file);
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/movement_tagger.c b/maca_trans_parser/src/movement_tagger.c
index 8e7d89660958c78bac6efb439442c830df3ed6bd..77214b3381134acbfc36f114cb69d70f85dd9389 100644
--- a/maca_trans_parser/src/movement_tagger.c
+++ b/maca_trans_parser/src/movement_tagger.c
@@ -11,3 +11,26 @@ int 
movement_tagger(config *c, int postag)
   return 1;
 }
+/* forward: tag the current word b0 with postag, move the buffer one word right and reset mvt_chosen to 0; returns 1 */
+int forward(config *c, int postag)
+{
+  word_set_pos(word_buffer_b0(c->bf), postag);
+  word_buffer_move_right(c->bf);
+  c->mvt_chosen = 0;
+  return 1;
+}
+/* choice_n: re-tag the previous word bm1 with the class stored at rank n of c->vcode_array and record n in mvt_chosen; returns 1, or 0 (nothing done) when no array is attached or n is negative */
+int choice_n(config *c, int n)
+{
+  if(c->vcode_array == NULL || n < 0) return 0;
+  word_set_pos(word_buffer_bm1(c->bf), c->vcode_array[n].class_code);
+  c->mvt_chosen = n;
+  return 1;
+}
+/* backward: reset the tag of the current word b0 to -1 and move the buffer one word left; returns 1 */
+int backward(config *c)
+{
+  word_set_pos(word_buffer_b0(c->bf), -1);
+  word_buffer_move_left(c->bf);
+  return 1;
+}
diff --git a/maca_trans_parser/src/movement_tagger.h b/maca_trans_parser/src/movement_tagger.h
index 1b7dfbe2ac21b4d68b6cb17fc4efd3d3941ec187..b93222d3cfc5d197ceca068d7352cdfdd481b0ff 100644
--- a/maca_trans_parser/src/movement_tagger.h
+++ b/maca_trans_parser/src/movement_tagger.h
@@ -5,4 +5,12 @@
 #include"feat_vec.h"
 int movement_tagger(config *c, int postag);
+
+int forward(config *c, int postag);
+int next_choice(config *c, int postag); /* NOTE(review): no definition of next_choice is added by this patch - confirm it exists or drop the prototype */
+int backward(config *c);
+int choice_n(config *c, int n);
+
+
+
 #endif
diff --git a/maca_trans_parser/src/oracle_tagger.c b/maca_trans_parser/src/oracle_tagger.c
index d948b278380e3332d9fde398c6d6a1d07fbe5d5f..bbdef3670a8a160ccc05f2ccd06532a581a8d578 100644
--- a/maca_trans_parser/src/oracle_tagger.c
+++ b/maca_trans_parser/src/oracle_tagger.c
@@ -1,17 +1,8 @@
 #include"oracle_tagger.h"
-int oracle_tagger(config *c, sentence *ref)
+/* oracle_tagger: part-of-speech code carried by the current buffer word b0, or -1 when the buffer is empty (guard kept from the previous version) */
+int oracle_tagger(config *c)
 {
-  word *b0; /* next word in the bufer */
-  /* int b0_index; */
-  int b0_pos;
-  if(!word_buffer_is_empty(c->bf)){
-    b0 = word_buffer_b0(c->bf);
-    b0_pos = word_get_pos(b0);
-    /* printf("b0_pos = %d\n", b0_pos); */
-    /* b0_index = word_get_index(b0); */
-    /* return word_get_pos(ref->words[b0_index]); */
-    return b0_pos;
-  }
-  return -1;
+  if(word_buffer_is_empty(c->bf)) return -1;
+  return word_get_pos(word_buffer_b0(config_get_buffer(c)));
 }
diff --git a/maca_trans_parser/src/oracle_tagger.h b/maca_trans_parser/src/oracle_tagger.h
index 360421c8159138f7bd5f139193a22d22622dc5af..28e3478336817b440cc52adcf94f36b6a21ede34 100644
--- 
a/maca_trans_parser/src/oracle_tagger.h
+++ b/maca_trans_parser/src/oracle_tagger.h
@@ -4,8 +4,7 @@
 #include<stdio.h>
 #include<stdlib.h>
 #include"config.h"
-#include"sentence.h"
-int oracle_tagger(config *c, sentence *ref);
+int oracle_tagger(config *c);
 #endif
diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c
index 9a50f63a81f68ff20b4d9617c7516349069ac6be..ea9ddd6b37545b8c87effa572dbd89b2821346b4 100644
--- a/maca_trans_parser/src/simple_decoder_tagger.c
+++ b/maca_trans_parser/src/simple_decoder_tagger.c
@@ -82,6 +82,7 @@ void print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag)
 }
 }
+#if 1 /* the existing decoder below stays compiled in */
 void simple_decoder_tagger(context *ctx)
 {
 config *c;
@@ -135,3 +136,83 @@ void simple_decoder_tagger(context *ctx)
 /* config_print(stdout, c); */
 config_free(c);
 }
+#endif
+
+#if 0 /* NOTE(review): everything up to the matching #endif is disabled and never compiled */
+
+void simple_decoder_tagger(context *ctx) /* disabled variant: tags b0, then steps one word back, re-runs the classifier there and prints that word with the new tag */
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
+  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  int postag;
+  float max;
+  dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
+
+  c = config_new(f, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    if(ctx->f2p)
+      /* add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */
+      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);
+
+    if(ctx->debug_mode){
+      fprintf(stderr, "***********************************\n");
+      fprintf(stderr, "b0 lex = %d\n", word_get_form(word_buffer_b0(config_get_buffer(c))));
+      config_print(stderr, c);
+    }
+
+    /* config_print(stdout, c); */
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+
+    /* feat_vec_print(stdout, fv); */
+    postag = feature_table_argmax(fv, ft, &max);
+    /* printf("postag = %d\n", postag); */
+
+    if(ctx->debug_mode){
+      vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+      for(int i=0; i < 3; i++){ /* prints three entries; assumes the array holds at least three - TODO confirm */
+        fprintf(stderr, "%d\t", i);
+        fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
+      }
+      free(vcode_array);
+    }
+
+    word_set_pos(word_buffer_b0(config_get_buffer(c)), postag);
+
+    if((word_buffer_b0(config_get_buffer(c)))->index > 0){ /* only when the current word is not at index 0 */
+      /* word_buffer_move_left(config_get_buffer(c)); */
+      word_buffer_move_left(config_get_buffer(c));
+      int postag_old = word_get_pos(word_buffer_b0(config_get_buffer(c)));
+
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+      int postag_new = feature_table_argmax(fv, ft, &max);
+
+      if(ctx->debug_mode){
+        fprintf(stderr, "***********************************\n");
+        fprintf(stderr, "b1p = %s\n", dico_int2string(dico_pos, b1p(c)));
+        fprintf(stderr, "bm1p = %s\n", dico_int2string(dico_pos, bm1p(c)));
+        fprintf(stderr, "b0 index = %d\n", word_get_index(word_buffer_b0(config_get_buffer(c))));
+        fprintf(stderr, "b0 lex = %d\n", word_get_form(word_buffer_b0(config_get_buffer(c))));
+        config_print(stderr, c);
+        vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+        for(int i=0; i < 3; i++){
+          fprintf(stderr, "%d\t", i);
+          fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
+        }
+        free(vcode_array);
+      }
+
+      if(postag_new != postag_old) /* this message is written even when debug_mode is off */
+        fprintf(stderr, "postag old = %s postag_new = %s\n", dico_int2string(dico_pos, postag_old), dico_int2string(dico_pos, postag_new));
+
+      word_set_pos(word_buffer_b0(c->bf), postag_new);
+      print_word(word_buffer_b0(c->bf), ctx->mcd_struct, dico_pos, postag_new);
+      word_buffer_move_right(config_get_buffer(c));
+    }
+    word_buffer_move_right(config_get_buffer(c));
+  }
+  config_free(c);
+}
+#endif
diff --git a/maca_trans_parser/src/simple_decoder_tagger_bt.c b/maca_trans_parser/src/simple_decoder_tagger_bt.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d28c28e1db0202dd21d51db0d96b81b8eb9bcda
--- /dev/null
+++ 
b/maca_trans_parser/src/simple_decoder_tagger_bt.c
@@ -0,0 +1,148 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include<ctype.h>
+
+#include"context.h"
+#include"movement_tagger.h"
+#include"feat_fct.h"
+#include"config2feat_vec.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"mcd.h"
+
+/* Give a signature to every word of bf that has none yet, scanning from the last word
+   backwards and stopping at the first word that already has one. The form is looked up in
+   f2p as is, then lower-cased if that lookup returns -1. */
+void add_signature_to_words_in_word_buffer2(word_buffer *bf, form2pos *f2p)
+{
+  int i;
+  word *w;
+  char lower_form[100];
+
+  for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){
+    w = word_buffer_get_word_n(bf, i);
+    if(word_get_signature(w) != -1) break;
+    w->signature = form2pos_get_signature(f2p, w->form);
+    if(w->signature == -1){
+      snprintf(lower_form, sizeof lower_form, "%s", w->form); /* bounded copy: a form of 100 bytes or more used to overflow lower_form */
+      to_lower_string(lower_form);
+      w->signature = form2pos_get_signature(f2p, lower_form);
+    }
+  }
+}
+
+/* Print word w on stdout as tab-separated columns, writing the name of postag (looked up in
+   dico_pos) in the part-of-speech column given by mcd_struct. The tag is appended as a last
+   column when mcd_struct has no such column or the input has fewer columns than that. */
+void print_word2(word *w, mcd *mcd_struct, dico *dico_pos, int postag)
+{
+  char *buffer = NULL;
+  char *token = NULL;
+  int col_nb = 0;
+  if(mcd_get_pos_col(mcd_struct) == -1){
+    printf("%s\t%s\n", w->input, dico_int2string(dico_pos, postag));
+  }
+  else{
+    buffer = strdup(w->input);
+    token = strtok(buffer, "\t");
+    col_nb = 0;
+    while(token){
+      if(col_nb != 0) printf("\t");
+      if(col_nb == mcd_get_pos_col(mcd_struct))
+        printf("%s", dico_int2string(dico_pos, postag));
+      else
+        word_print_col_n(stdout, w, col_nb);
+      col_nb++;
+      token = strtok(NULL, "\t");
+    }
+    if(col_nb <= mcd_get_pos_col(mcd_struct))
+      printf("\t%s", dico_int2string(dico_pos, postag));
+    printf("\n");
+    free(buffer);
+  }
+}
+
+/* Two-pass tagger. For every word a pre-trained local model ranks the tags and its first
+   choice is applied with forward(); the model loaded from ctx then predicts which rank
+   should be used (class 0 keeps the first choice) and, when it is not 0, the word is
+   re-tagged with choice_n(). The resulting tag is printed with print_word2().
+   NOTE(review): the local model files are read from hard-coded absolute paths.
+   NOTE(review): feat_vec_new(2) differs from the feat_vec_new(feature_types_nb) used by the
+   other decoders in this patch - confirm the intended size. */
+void simple_decoder_tagger2(context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(2);
+  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
+  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+
+  int postag;
+
+  feat_model *local_feat_model = feat_model_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.fm", ctx->verbose);
+  dico_vec *local_dico_vec = dico_vec_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.vocab", ctx->hash_ratio);
+  dico *local_dico_pos = dico_vec_get_dico(local_dico_vec, (char *)"POS");
+  dico *local_perceptron_features = dico_vec_get_dico(local_dico_vec, (char *)"d_perceptron_features");
+  feature_table *local_ft = feature_table_load("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.model", ctx->verbose);
+
+  c = config_new(f, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(c)){
+    if(ctx->f2p)
+      /* add_signature_to_words_in_word_buffer2(c->bf, ctx->f2p, dico_pos); */
+      add_signature_to_words_in_word_buffer2(c->bf, ctx->f2p);
+
+    /* postag = word_get_pos(word_buffer_b0(c->bf)); */
+    if(ctx->debug_mode){
+      fprintf(stderr, "***********************************\n");
+      config_print(stderr, c);
+    }
+
+    /* if postag is not specified in input it is predicted */
+    /* if(postag == -1){ */
+
+    /* apply local model */
+    config2feat_vec_cff(local_feat_model, c, local_perceptron_features, fv, LOOKUP_MODE);
+
+    if(c->vcode_array) free(c->vcode_array);
+    c->vcode_array = feature_table_get_vcode_array(fv, local_ft);
+
+    postag = c->vcode_array[0].class_code;
+
+    if(ctx->debug_mode){
+      fprintf(stderr, "apply local model\n");
+      for(int i=0; i < 5 && c->vcode_array[i].class_code != -1; i++) /* stop at the class_code == -1 end marker */
+        fprintf(stderr, "%d\t%s\t%.4f\n", i, dico_int2string(local_dico_pos, c->vcode_array[i].class_code), c->vcode_array[i].score);
+    }
+
+    forward(c, postag);
+
+    /* apply global model */
+
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+
+    vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
+
+    if(ctx->debug_mode){
+      fprintf(stderr, "apply global model\n");
+      for(int i=0; i < 3 && vcode_array[i].class_code != -1; i++) /* stop at the class_code == -1 end marker */
+        fprintf(stderr, "%d\t%d\t%.4f\n", i, vcode_array[i].class_code, vcode_array[i].score);
+    }
+
+    int choice = vcode_array[0].class_code;
+    if(choice != 0){
+      postag = c->vcode_array[choice].class_code;
+      choice_n(c, choice);
+    }
+    free(vcode_array);
+    /* } */
+    print_word2(word_buffer_bm1(c->bf), ctx->mcd_struct, local_dico_pos, postag);
+
+  }
+  /* config_print(stdout, c); */
+  free(c->vcode_array); c->vcode_array = NULL; /* release the last scores; reset to NULL so a later free of the field stays harmless */
+  config_free(c);
+}
+
diff --git a/perceptron/lib/src/feature_table.c b/perceptron/lib/src/feature_table.c
index 99ac711a528f01a306a22bc921e9e5f3f2c5b0c2..5c2384aa457685b2c68faa96577e8e29139a9152 100644
--- a/perceptron/lib/src/feature_table.c
+++ b/perceptron/lib/src/feature_table.c
@@ -325,8 +325,10 @@ vcode* feature_table_get_vcode_array(feat_vec *fv, feature_table* ft)
 table[cla].score = 0;
 table[cla].class_code = cla;
 for(feat = 0; feat < fv->nb;feat++){
+  if(fv->t[feat] >= ft->features_nb) continue;
 if(fv->t[feat] == -1) continue;
-  table[cla].score += ft->table[fv->t[feat]][cla];
+  table[cla].score +=
+    ft->table[fv->t[feat]][cla];
 }
 }
 table[classes_nb].class_code = -1;