diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index fb3e04dd772bfb808bcf522438c3cb49b89e54d3..a372ba31ae251d2398c936fba8c172d6c4fe3aa3 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -1,14 +1,17 @@ set(SOURCES src/context.c src/feat_desc.c - src/movement_parser_arc_eager.c +# src/movement_parser_arc_eager.c + src/movement_tagparser_arc_eager.c src/movement_tagger.c src/feat_fct.c src/global_feat_vec.c # src/oracle_parser.c - src/oracle_parser_arc_eager.c +# src/oracle_parser_arc_eager.c + src/oracle_tagparser_arc_eager.c src/oracle_tagger.c # src/simple_decoder_parser.c src/simple_decoder_parser_arc_eager.c + src/simple_decoder_tagparser_arc_eager.c src/simple_decoder_forrest.c src/simple_decoder_tagger.c src/feat_lib.c @@ -47,11 +50,17 @@ install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin) #target_link_libraries(maca_trans_parser_mcf2cff maca_common) #install (TARGETS maca_trans_parser_mcf2cff DESTINATION bin) -add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c) -target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron) -target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse) -target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common) -install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin) +#add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c) +#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron) +#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse) +#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common) +#install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin) + +add_executable(maca_trans_tagparser_arc_eager_mcf2cff ./src/maca_trans_tagparser_arc_eager_mcf2cff.c) +target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff perceptron) 
+target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff transparse) +target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff maca_common) +install (TARGETS maca_trans_tagparser_arc_eager_mcf2cff DESTINATION bin) add_executable(compare_traces ./src/compare_traces.c) target_link_libraries(compare_traces perceptron) @@ -65,6 +74,12 @@ target_link_libraries(maca_trans_parser transparse) target_link_libraries(maca_trans_parser maca_common) install (TARGETS maca_trans_parser DESTINATION bin) +add_executable(maca_trans_tagparser ./src/maca_trans_tagparser.c) +target_link_libraries(maca_trans_tagparser perceptron) +target_link_libraries(maca_trans_tagparser transparse) +target_link_libraries(maca_trans_tagparser maca_common) +install (TARGETS maca_trans_tagparser DESTINATION bin) + add_executable(maca_trans_parser_forrest ./src/decode_forrest.c) target_link_libraries(maca_trans_parser_forrest perceptron) target_link_libraries(maca_trans_parser_forrest transparse) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index e02095c8dc54cd47614e602b733f7ff38dd4609d..e1beddd40aef1cd7c60804349712cf6a4e184904 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -65,6 +65,7 @@ context *context_new(void) ctx->features_model = NULL; ctx->vocabs = NULL; ctx->dico_labels = NULL; + ctx->dico_postags = NULL; ctx->f2p = NULL; ctx->iteration_nb = 4; @@ -163,7 +164,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[21] = + static struct option long_options[22] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 013a09645c0813c91c844d4db8081592ca5a3511..afdcd11c44e4d4bd402eb297131daa965e43e23e 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -16,6 +16,12 @@ #define DEFAULT_MODEL_TAGGER_FILENAME 
"maca_trans_tagger.model" #define DEFAULT_F2P_FILENAME "fP" +#define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd" +#define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm" +#define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab" +#define DEFAULT_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.model" +#define DEFAULT_F2P_FILENAME "fP" + #include "dico_vec.h" #include "feat_model.h" #include "mcd.h" @@ -52,6 +58,7 @@ typedef struct { int stream_mode; dico *d_perceptron_features; dico *dico_labels; + dico *dico_postags; char *maca_data_path; char *language; char *root_label; diff --git a/maca_trans_parser/src/maca_trans_tagparser.c b/maca_trans_parser/src/maca_trans_tagparser.c new file mode 100644 index 0000000000000000000000000000000000000000..6f990743965dbd5192e90a14e0ed4dcf8fc154c9 --- /dev/null +++ b/maca_trans_parser/src/maca_trans_tagparser.c @@ -0,0 +1,132 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"feat_fct.h" +#include"feature_table.h" +#include"dico.h" +#include"beam.h" +#include"simple_decoder_tagparser_arc_eager.h" +#include"config2feat_vec.h" + +void decode_help_message(context *ctx) +{ + context_general_help_message(ctx); + /* context_beam_help_message(ctx); */ + /* context_conll_help_message(ctx); */ + fprintf(stderr, "INPUT\n"); + context_input_help_message(ctx); + context_mcd_help_message(ctx); + context_model_help_message(ctx); + context_vocabs_help_message(ctx); + context_features_model_help_message(ctx); + context_root_label_help_message(ctx); +} + +void decode_check_options(context *ctx){ + if(ctx->help + /*!ctx->conll_filename*/ + /* || !ctx->perc_model_filename + || !ctx->mcd_filename + || !ctx->vocabs_filename + || !ctx->features_model_filename*/ + ){ + decode_help_message(ctx); + exit(1); + } +} + +void set_linguistic_resources_filenames_tagparser(context *ctx) +{ + char absolute_path[500]; + char 
absolute_filename[500]; + + absolute_path[0] = '\0'; + + if(ctx->maca_data_path) + strcat(absolute_path, ctx->maca_data_path); + else + strcat(absolute_path, getenv("MACAON_DIR")); + + strcat(absolute_path, "/"); + strcat(absolute_path, ctx->language); + strcat(absolute_path, "/bin/"); + + + if(!ctx->perc_model_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_MODEL_TAGPARSER_FILENAME); + ctx->perc_model_filename = strdup(absolute_filename); + } + + if(!ctx->vocabs_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_VOCABS_TAGPARSER_FILENAME); + ctx->vocabs_filename = strdup(absolute_filename); + } + + /* if(!ctx->mcd_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME); + ctx->mcd_filename = strdup(absolute_filename); + }*/ + + if(!ctx->features_model_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME); + ctx->features_model_filename = strdup(absolute_filename); + } + + if(!ctx->f2p_filename){ + strcpy(absolute_filename, absolute_path); + strcat(absolute_filename, DEFAULT_F2P_FILENAME); + ctx->f2p_filename = strdup(absolute_filename); + ctx->f2p = form2pos_read(ctx->f2p_filename); + } + + + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + + ctx = context_read_options(argc, argv); + decode_check_options(ctx); + + set_linguistic_resources_filenames_tagparser(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); + ctx->vocabs = 
dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + + /* dico_vec_print(NULL, ctx->vocabs); */ + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + ctx->dico_postags = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); + + if(ctx->dico_labels == NULL){ + fprintf(stderr, "cannot find label names\n"); + return 1; + } + + ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3; + + /* load models */ + + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + simple_decoder_tagparser_arc_eager(ctx); + + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c new file mode 100644 index 0000000000000000000000000000000000000000..aeff65821e36595871bbe29fb881eaf8c21eeafe --- /dev/null +++ b/maca_trans_parser/src/maca_trans_tagparser_arc_eager_mcf2cff.c @@ -0,0 +1,199 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"movement_tagparser_arc_eager.h" +#include"oracle_tagparser_arc_eager.h" +#include"feat_fct.h" +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"word_emb.h" +#include"config2feat_vec.h" + +void maca_trans_parser_mcf2cff_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_vocabs_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_vocabs_help_message(ctx); + +} + +void maca_trans_parser_mcf2cff_check_options(context *ctx) +{ + if(!ctx->input_filename + || ctx->help + /* || !ctx->mcd_filename */ + /* || !(ctx->cff_filename || ctx->fann_filename) */ + ){ + 
maca_trans_parser_mcf2cff_help_message(ctx); + exit(1); + } +} + +void generate_training_file_stream(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + feat_vec *fv = feat_vec_new(feature_types_nb); + int sentence_nb = 0; + int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label); + word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct); + FILE *mcf_file = myfopen(ctx->input_filename, "r"); + dico *dico_postag = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); + /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */ + /* the idea is to ignore syntax in the mcf file that will be read */ + /* it is ugly !!! */ + + mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_POS); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL); + mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_SENT_SEG); + + c = config_initial(mcf_file, mcd_struct_hyp, 5); + + while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){ + /*printf("************ REF ************\n"); + word_buffer_print(stdout, ref); + printf("*****************************\n");*/ + + if(ctx->f2p) + add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); + + + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + + mvt_code = oracle_tagparser_arc_eager(c, ref, root_label); + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + if(ctx->debug_mode){ + config_print(stdout,c); + movement_print(stdout, mvt_code, ctx->dico_labels, dico_postag); + fprintf(stdout, "\n"); + } + + if(ctx->trace_mode){ + fprintf(output_file, "%d\t", word_get_index(word_buffer_b0(config_get_buffer(c)))); + stack_print(output_file, c->st); + fprintf(output_file, "\t"); + + movement_print(output_file, mvt_code, ctx->dico_labels, dico_postag); + fprintf(output_file, "\t1\n"); + } + 
else{ + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + } + + if(mvt_type == MVT_EOS){ + movement_eos(c, 0); + sentence_nb++; + if(word_buffer_is_last(ref)) + break; + } + + if(mvt_type == MVT_POSTAG){ + movement_add_pos(c, 0, mvt_label); + continue; + } + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + word_buffer_move_right(ref); + continue; + } + + if(mvt_type == MVT_REDUCE){ + movement_reduce(c, 0); + continue; + } + + if(mvt_type == MVT_ROOT){ + movement_root(c, 0, root_label); + continue; + } + + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 1, 0); + word_buffer_move_right(ref); + continue; + } + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + FILE *output_file; + + ctx = context_read_options(argc, argv); + maca_trans_parser_mcf2cff_check_options(ctx); + + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); + + if(ctx->mode == TRAIN_MODE){ + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); + } + else if(ctx->mode == TEST_MODE){ + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + } + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + + + if(ctx->dico_labels == NULL){ + fprintf(stderr, "cannot find label names\n"); + return 1; + } + ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3; + + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); + + + /* in train mode create feature dictionnary for perceptron */ + if(ctx->mode == TRAIN_MODE) + ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); + + /* in test mode read feature dictionnary for perceptron */ + if(ctx->mode == TEST_MODE) + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char 
*)"d_perceptron_features"); + + /* add the feature dictionnary to the dico vector */ + dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); + + /* open output file */ + output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout; + + generate_training_file_stream(output_file, ctx); + + if(ctx->mode == TRAIN_MODE) + dico_vec_print(ctx->vocabs_filename, ctx->vocabs); + + if(ctx->cff_filename) + fclose(output_file); + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/movement_tagparser_arc_eager.c b/maca_trans_parser/src/movement_tagparser_arc_eager.c new file mode 100644 index 0000000000000000000000000000000000000000..bdf7e4c6636cb7d3f01c7185791f0f26eef5f13f --- /dev/null +++ b/maca_trans_parser/src/movement_tagparser_arc_eager.c @@ -0,0 +1,157 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include"util.h" +#include"movement_tagparser_arc_eager.h" + +void movement_print(FILE *f, int mvt_code, dico *dico_labels, dico *dico_postag) +{ + + int mvt_type = movement_type(mvt_code); + int mvt_label = movement_label(mvt_code); + char *label; + + if(mvt_type == MVT_SHIFT) {fprintf(f, "SHIFT"); return;} + if(mvt_type == MVT_REDUCE) {fprintf(f, "REDUCE"); return;} + if(mvt_type == MVT_ROOT) {fprintf(f, "ROOT"); return;} + if(mvt_type == MVT_EOS) {fprintf(f, "EOS"); return;} + if(mvt_type == MVT_POSTAG){ + fprintf(f, "POSTAG"); + label = dico_int2string(dico_postag, mvt_label); + fprintf(f, " %s", label); + return; + } + if(mvt_type == MVT_RIGHT) fprintf(f, "RIGHT"); + else fprintf(f, "LEFT"); + label = dico_int2string(dico_labels, mvt_label); + fprintf(f, " %s", label); +} + +int movement_type(int mvt) +{ + if(mvt == MVT_SHIFT) return MVT_SHIFT; /* 0 */ + if(mvt == MVT_REDUCE) return MVT_REDUCE; /* 1 */ + if(mvt == MVT_ROOT) return MVT_ROOT; /* 2 */ + if(mvt == MVT_EOS) return MVT_EOS; /* 3 */ + if(mvt % 3 == 0) return MVT_RIGHT; /* 6, 9, 12 ... */ + if(mvt % 3 == 1) return MVT_POSTAG; /* 4, 7, 10 ... 
*/ + /*if(mvt % 3 == 2)*/ return MVT_LEFT; /* 5, 8, 11 ... */ +} + +int movement_label(int mvt) +{ + if(mvt == MVT_SHIFT) return -1; + if(mvt == MVT_REDUCE) return -1; + if(mvt == MVT_ROOT) return -1; + if(mvt == MVT_EOS) return -1; + if(mvt % 3 == 1) /* pos movement */ + return (mvt - 4) / 3; + if(mvt % 3 == 2) /* left movement */ + return (mvt - 5) / 3; + /* if(mvt % 3 == 0)*/ /* right movement */ + return (mvt - 6) / 3; +} + +int movement_add_pos(config *c, float score, int pos) +{ + if(word_buffer_b0(config_get_buffer(c)) == NULL) return 0; + if(word_get_pos(word_buffer_b0(config_get_buffer(c))) != -1) return 0; + word_set_pos(word_buffer_b0(config_get_buffer(c)), pos); + + /* stack_push(config_get_stack(c), word_buffer_b0(config_get_buffer(c))); + word_buffer_move_right(config_get_buffer(c));*/ + config_add_mvt(c, movement_postag(pos)); + + return 1; +} + +int movement_eos(config *c, float score) +{ + if(stack_is_empty(config_get_stack(c))) return 0; + if(word_get_sent_seg(stack_top(config_get_stack(c))) == 1) return 0; + + + /* word on the top of the stack is sent_seg */ + word_set_sent_seg(stack_top(config_get_stack(c)), 1); + + /* (config_get_stack(c))->top = 0; */ + + config_add_mvt(c, MVT_EOS); + return 1; +} + +int movement_left_arc(config *c, int label, float score) +{ + if(stack_is_empty(config_get_stack(c))) return 0; + /* if(word_buffer_is_empty(config_get_buffer(c))) return 0; */ + + /* word on top of the stack should not have a governor */ + if(word_get_gov(stack_top(config_get_stack(c))) != WORD_INVALID_GOV) return 0; + + word *gov = word_buffer_b0(config_get_buffer(c)); + word *dep = stack_top(config_get_stack(c)); + int dist = (word_get_index(gov)) - (word_get_index(dep)); + + /* create a new dependency */ + word_set_gov(dep, dist); + word_set_label(dep, label); + + stack_pop(config_get_stack(c)); + config_add_mvt(c, movement_left_code(label)); + return 1; +} + +int movement_right_arc(config *c, int label, float score) +{ + 
if(stack_is_empty(config_get_stack(c))) return 0; + + word *gov = stack_top(config_get_stack(c)); + word *dep = word_buffer_b0(config_get_buffer(c)); + int dist = (word_get_index(gov)) - (word_get_index(dep)); + + /* create a new dependency */ + word_set_gov(dep, dist); + word_set_label(dep, label); + + stack_push(config_get_stack(c), word_buffer_b0(config_get_buffer(c))); + word_buffer_move_right(config_get_buffer(c)); + + config_add_mvt(c, movement_right_code(label)); + return 1; +} + +int movement_shift(config *c, int stream, float score) +{ + if(word_buffer_is_empty(config_get_buffer(c))) return 0; + stack_push(config_get_stack(c), word_buffer_b0(config_get_buffer(c))); + word_buffer_move_right(config_get_buffer(c)); + config_add_mvt(c, MVT_SHIFT); + return 1; +} + +int movement_reduce(config *c, float score) +{ + if(stack_nbelem(config_get_stack(c)) <= 1) return 0; + + /* if(stack_is_empty(config_get_stack(c))) return 0; */ + + /* word on top of stack must have a governor */ + if(word_get_gov(stack_top(config_get_stack(c))) == WORD_INVALID_GOV) return 0; + stack_pop(config_get_stack(c)); + config_add_mvt(c, MVT_REDUCE); + return 1; +} + +int movement_root(config *c, float score, int root_code) +{ + word *s0 = stack_top(config_get_stack(c)); + if(s0 == NULL) return 0; + word_set_gov(s0, 0); + word_set_label(s0, root_code); + s0->is_root = 1; + + stack_pop(config_get_stack(c)); + + config_add_mvt(c, MVT_ROOT); + return 1; +} diff --git a/maca_trans_parser/src/movement_tagparser_arc_eager.h b/maca_trans_parser/src/movement_tagparser_arc_eager.h new file mode 100644 index 0000000000000000000000000000000000000000..880e14c73cd46b8eff5b7583a557690b99fd99e6 --- /dev/null +++ b/maca_trans_parser/src/movement_tagparser_arc_eager.h @@ -0,0 +1,35 @@ +#ifndef __MOVEMENT_TAGPARSER_ARC_EAGER__ +#define __MOVEMENT_TAGPARSER_ARC_EAGER__ + +#include"config.h" +#include"feat_vec.h" + + +#define MVT_SHIFT 0 +#define MVT_REDUCE 1 +#define MVT_ROOT 2 +#define MVT_EOS 3 +#define 
MVT_LEFT 4 +#define MVT_RIGHT 5 +#define MVT_POSTAG 6 + +#define movement_postag(postag) (3 * (postag) + 4) + +/* left-arc movements have codes 5, 8, 11, ... (code % 3 == 2); codes 0 to 3 are shift, reduce, root and end_of_sentence */ +#define movement_left_code(label) (3 * (label) + 5) + +/* right-arc movements have codes 6, 9, 12, ... (code % 3 == 0); postag movements have codes 4, 7, 10, ... (code % 3 == 1) */ +#define movement_right_code(label) (3 * (label) + 6) + +int movement_type(int mvt); +int movement_label(int mvt); + +int movement_left_arc(config *c, int label, float score); +int movement_right_arc(config *c, int label, float score); +int movement_shift(config *c, int stream, float score); +int movement_reduce(config *c, float score); +int movement_root(config *c, float score, int root_code); +int movement_eos(config *c, float score); +int movement_add_pos(config *c, float score, int postag); +void movement_print(FILE *f, int mvt_code, dico *dico_labels, dico *dico_postag); +#endif diff --git a/maca_trans_parser/src/oracle_tagparser_arc_eager.c b/maca_trans_parser/src/oracle_tagparser_arc_eager.c new file mode 100644 index 0000000000000000000000000000000000000000..8d3a152420e540d7d3817d42071a3c9cf6c685c8 --- /dev/null +++ b/maca_trans_parser/src/oracle_tagparser_arc_eager.c @@ -0,0 +1,134 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include"word_buffer.h" +#include"movement_tagparser_arc_eager.h" + +int check_all_dependents_of_word_in_ref_are_in_hyp(config *c, word_buffer *ref, int word_index) +{ + int dep; + int gov_ref; + int gov_hyp; + int sentence_change; + +#if 0 + for(dep = word_index - 1; (dep >= 0) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0); dep--){ + gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ + /* check that dep has the same governor in hyp */ + gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); + if(gov_hyp != gov_ref) return 0; + } + } + + for(dep = 
word_index + 1; ((dep < word_buffer_get_nbelem(ref)) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0)); dep++){ + gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ + /* check that dep has the same governor in hyp */ + gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); + if(gov_hyp != gov_ref) return 0; + } + } +#endif + +#if 1 + for(dep = word_index - 1; (dep >= 0) && (word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 0); dep--){ + gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ + /* check that dep has the same governor in hyp */ + gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); + if(gov_hyp != gov_ref) return 0; + } + } + + sentence_change = 0; + for(dep = word_index + 1; (dep < word_buffer_get_nbelem(ref)) && (sentence_change == 0); dep++){ + if(word_get_sent_seg(word_buffer_get_word_n(ref, dep)) == 1) + sentence_change = 1; + gov_ref = word_get_gov_index(word_buffer_get_word_n(ref, dep)); + if(gov_ref == word_index){ /* dep is a dependent of word in ref */ + /* look for a dependency in hyp such that its dependent is dep */ + gov_hyp = word_get_gov_index(word_buffer_get_word_n(config_get_buffer(c), dep)); + if(gov_hyp != gov_ref) return 0; + } + } + +#endif + + return 1; +} + +int oracle_tagparser_arc_eager(config *c, word_buffer *ref, int root_label) +{ + word *s0; /* word on top of stack */ + word *b0; /* next word in the bufer */ + int s0_index, b0_index; + int s0_gov_index, b0_gov_index; + int s0_label; + /* int s0_label_in_hyp; */ + + b0 = word_buffer_b0(config_get_buffer(c)); + b0_index = word_get_index(b0); + b0_gov_index = word_get_gov_index(word_buffer_get_word_n(ref, b0_index)); + + + /* give a pos to b0 if it does not have one */ + if(word_get_pos(b0) == -1){ + /* word_set_pos(b0, 
word_get_pos(word_buffer_get_word_n(ref, b0_index))); */ + /* return movement_postag(word_get_pos(b0)); */ + + return movement_postag(word_get_pos(word_buffer_get_word_n(ref, b0_index))); + } + + + /* if(!stack_is_empty(config_get_stack(c)) && !word_buffer_is_empty(config_get_buffer(c))){ */ + if(!stack_is_empty(config_get_stack(c))){ + + + s0 = stack_top(config_get_stack(c)); + s0_index = word_get_index(s0); + s0_gov_index = word_get_gov_index(word_buffer_get_word_n(ref, s0_index)); + s0_label = word_get_label(word_buffer_get_word_n(ref, s0_index)); + /* s0_label_in_hyp = word_get_label(word_buffer_get_word_n(config_get_buffer(c), s0_index)); */ + + /* printf("s0_index = %d b0_index = %d\n", s0_index, b0_index); + printf("dans ref gov de s0 (%d) = %d\n", s0_index, s0_gov_index); + printf("dans ref gov de b0 (%d) = %d\n", b0_index, b0_gov_index);*/ + + + + + /* s0 is the root of the sentence */ + if((s0_label == root_label) + && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) + ){ + return MVT_ROOT; + } + + /* word on the top of the stack is an end of sentence marker */ + if((word_get_sent_seg(word_buffer_get_word_n(ref, s0_index)) == 1) + && (word_get_sent_seg(word_buffer_get_word_n(config_get_buffer(c), s0_index)) != 1)){ + return MVT_EOS; + } + + /* LEFT ARC b0 is the governor and s0 the dependent */ + if(s0_gov_index == b0_index){ + return movement_left_code(word_get_label(word_buffer_get_word_n(ref, s0_index))); + } + + /* RIGHT ARC s0 is the governor and b0 the dependent */ + if(b0_gov_index == s0_index){ + return movement_right_code(word_get_label(word_buffer_get_word_n(ref, b0_index))); + } + /* REDUCE */ + if((stack_nbelem(config_get_stack(c)) > 1) + && check_all_dependents_of_word_in_ref_are_in_hyp(c, ref, s0_index) /* word on top must have all its dependents */ + && (word_get_gov(stack_top(config_get_stack(c))) != WORD_INVALID_GOV)){ /* word on top of the stack has a governor */ + return MVT_REDUCE; + } + } + + /* SHIFT */ + return 
MVT_SHIFT; + +} diff --git a/maca_trans_parser/src/oracle_tagparser_arc_eager.h b/maca_trans_parser/src/oracle_tagparser_arc_eager.h new file mode 100644 index 0000000000000000000000000000000000000000..585f848c0bd248e236be5681953b2e8aeef7b5ef --- /dev/null +++ b/maca_trans_parser/src/oracle_tagparser_arc_eager.h @@ -0,0 +1,11 @@ +#ifndef __ORACLE_TAGPARSER_ARC_EAGER__ +#define __ORACLE_TAGPARSER_ARC_EAGER__ + + +#include"config.h" +#include"word_buffer.h" + + +int oracle_tagparser_arc_eager(config *c, word_buffer *ref, int root_label); + +#endif diff --git a/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c new file mode 100644 index 0000000000000000000000000000000000000000..469913e08921192b15386ac2c67fa78f0b3f930f --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.c @@ -0,0 +1,160 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"movement_tagparser_arc_eager.h" +#include"feat_fct.h" +#include"config2feat_vec.h" +#include"feature_table.h" +#include"dico.h" + +void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p) +{ + int i; + word *w; + + for(i = word_buffer_get_nbelem(bf) - 1; i >=0 ; i--){ + w = word_buffer_get_word_n(bf, i); + if(word_get_signature(w) != -1) break; + w->signature = form2pos_get_signature(f2p, w->form); + } +} + + + +void print_word_buffer_tagparser(config *c, dico *dico_labels, dico *dico_pos) +{ + int i; + word *dep; + char *label; + char *pos; + + for(i=0; i < config_get_buffer(c)->nbelem; i++){ + dep = word_buffer_get_word_n(config_get_buffer(c), i); + printf("%s\t", word_get_input(dep)); + pos = (word_get_pos(dep) == -1)? NULL : dico_int2string(dico_pos, word_get_pos(dep)); + if(pos != NULL) + printf("%s\t_\t", pos) ; + else + printf("_\t_\t"); + + printf("%d\t", word_get_gov(dep)); + label = (word_get_label(dep) == -1)? 
NULL : dico_int2string(dico_labels, word_get_label(dep)); + if(label != NULL) + printf("%s\t", label) ; + else + printf("_\t"); + if(word_get_sent_seg(dep) == 1) + printf("1\n") ; + else + printf("0\n"); + } +} + + +void simple_decoder_tagparser_arc_eager(context *ctx) +{ + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int root_label; + int mvt_code; + int mvt_type; + int mvt_label; + float max; + feat_vec *fv = feat_vec_new(feature_types_nb); + config *c = NULL; + int result; + float entropy; + float delta; + int argmax1, argmax2; + float max1, max2; + int index; + + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); + if(root_label == -1) root_label = 0; + + c = config_initial(f, ctx->mcd_struct, 5); + while(!config_is_terminal(c)){ + if(ctx->f2p) + add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); + + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + mvt_code = feature_table_argmax(fv, ft, &max); + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + if(ctx->trace_mode){ + index = word_get_index(word_buffer_b0(config_get_buffer(c))); + fprintf(stdout, "%d\t", index); + + stack_print(stdout, c->st); + fprintf(stdout, "\t"); + + movement_print(stdout, mvt_code, ctx->dico_labels, ctx->dico_postags); + fprintf(stdout, "\t"); + feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2); + printf("%f\n", max1 - max2); + + } + + if(ctx->debug_mode){ + fprintf(stdout, "***********************************\n"); + config_print(stdout, c); + entropy = feature_table_entropy(fv, ft); + /* delta = feature_table_diff_scores(fv, ft); */ + feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2); + movement_print(stdout, argmax1, ctx->dico_labels, ctx->dico_postags); + printf(":\t%f\n", max1); + movement_print(stdout, argmax2, ctx->dico_labels, 
ctx->dico_postags); + printf(":\t%f\n", max2); + printf("delta = %f\n", max1 - max2); + + /* delta = feature_table_first_second(fv, ft); */ + /* printf("entropy = %f delta = %f\n", entropy, delta); */ + printf("entropy = %f\n",entropy); + + /* movement_print(stdout, mvt_code, ctx->dico_labels); */ + } + result = 0; + switch(mvt_type){ + case MVT_POSTAG : + result = movement_add_pos(c, max, mvt_label); + break; + case MVT_LEFT : + result = movement_left_arc(c, mvt_label, max); + break; + case MVT_RIGHT: + result = movement_right_arc(c, mvt_label, max); + break; + case MVT_REDUCE: + result = movement_reduce(c, max); + break; + case MVT_ROOT: + result = movement_root(c, max, root_label); + break; + case MVT_EOS: + result = movement_eos(c, max); + break; + case MVT_SHIFT: + result = movement_shift(c, 1, max); + } + + if(result == 0){ + if(ctx->debug_mode){ + fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n"); + } + movement_shift(c, 1, max); + } + } + + if(!ctx->trace_mode) + print_word_buffer_tagparser(c, ctx->dico_labels, ctx->dico_postags); + + config_free(c); + feat_vec_free(fv); + feature_table_free(ft); + if(ctx->input_filename) + fclose(f); +} diff --git a/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.h b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.h new file mode 100644 index 0000000000000000000000000000000000000000..03fdb14d286ac8f4fbcd77e07e337b138e8ba90a --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_tagparser_arc_eager.h @@ -0,0 +1,7 @@ +#ifndef __SIMPLE_DECODER_TAGPARSER_ARC_EAGER__ +#define __SIMPLE_DECODER_TAGPARSER_ARC_EAGER__ +#include"context.h" + +void simple_decoder_tagparser_arc_eager(context *ctx); + +#endif