diff --git a/maca_trans_parser/src/maca_trans_attach_punct.c b/maca_trans_parser/src/maca_trans_attach_punct.c new file mode 100644 index 0000000000000000000000000000000000000000..f528c993848bbe99b0e0cc6f86e9fb9f129482dc --- /dev/null +++ b/maca_trans_parser/src/maca_trans_attach_punct.c @@ -0,0 +1,209 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"word_emb.h" +#include"config2feat_vec.h" + +void maca_trans_attach_punct_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_vocabs_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_vocabs_help_message(ctx); + + + context_root_label_help_message(ctx); + context_punct_label_help_message(ctx); + +} + +void maca_trans_attach_punct_check_options(context *ctx) +{ + if(!ctx->input_filename + || ctx->help + /* || !ctx->mcd_filename */ + /* || !(ctx->cff_filename || ctx->fann_filename) */ + ){ + maca_trans_attach_punct_help_message(ctx); + exit(1); + } +} + +int look_left_for_target(word_buffer *wb, int target) +{ + int position = word_buffer_get_current_index(wb) - 1; + int gov; + int step = 0; + do{ + step--; + if(position == target) return step; + gov = word_get_gov(word_buffer_get_word_n(wb, position)); + position += gov; + // printf("position = %d gov = %d\n", position, gov); + }while((position >= 0) && (gov < 0)); + return 0; +} + +int look_right_for_target(word_buffer *wb, int target) +{ + int position = word_buffer_get_current_index(wb) + 1; + int gov; + int step = 0; + do{ + step++; + if(position == target) return step; + gov = word_get_gov(word_buffer_get_word_n(wb, position)); + position += gov; + // printf("position = %d gov = %d\n", position, gov); + }while((position < word_buffer_get_nbelem(wb)) && (gov > 0)); + return 0; +} + + + +int get_left_attachement_site(word_buffer *wb) +{ + int position = word_buffer_get_current_index(wb) - 1; + word *w; + int gov = 0; + do{ + position += gov; + w = word_buffer_get_word_n(wb, position); + gov = word_get_gov(w); + // printf("position = %d gov = %d\n", position, gov); + }while((position >= 0) && (gov < 0)); + return position; +} + +int get_right_attachement_site(word_buffer *wb) +{ + int position = word_buffer_get_current_index(wb) + 1; + word *w; + int gov = 0; + do{ + position += gov; + w = word_buffer_get_word_n(wb, position); + gov = word_get_gov(w); + // printf("position = %d gov = %d\n", position, gov); + }while((position < word_buffer_get_nbelem(wb)) && (gov > 0)); + return position; +} + +void generate_training_file(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + feat_vec *fv = feat_vec_new(feature_types_nb); + int sentence_nb = 0; + int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label); + int punct_label = dico_string2int(ctx->dico_labels, (char *) ctx->punct_label); + word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct); + FILE *mcf_file = myfopen(ctx->input_filename, "r"); + word *b0; + int left_attachement_site; + int right_attachement_site; + int target; + int cla; + int highest_left, highest_right; + // c = config_new(mcf_file, mcd_struct_hyp, 5); + + while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){ + b0 = word_buffer_b0(ref); + if(word_get_label(b0) == punct_label){ + left_attachement_site = get_left_attachement_site(ref); + right_attachement_site = get_right_attachement_site(ref); + highest_left = (word_get_index(b0) + word_get_gov(b0) == left_attachement_site)? 1 : 0; + highest_right = (word_get_index(b0) + word_get_gov(b0) == right_attachement_site)? 1 : 0; + + if(highest_left) printf("class = HL\n"); + else if(highest_right) printf("class = HR\n"); + else{ + target = word_get_index(b0) + word_get_gov(b0); + if(word_get_gov(b0) < 0){ + cla = look_left_for_target(ref, target); + + printf("class = %d", cla); + // if(highest_left) printf("*"); + printf("\n"); + } + + if(word_get_gov(b0) > 0){ + cla = look_right_for_target(ref, target); + printf("class = %d", cla); + // if(highest_right) printf("*"); + printf("\n"); + } + } + + } + word_buffer_move_right(ref); + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + FILE *output_file; + + ctx = context_read_options(argc, argv); + maca_trans_attach_punct_check_options(ctx); + + // ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); + + if(ctx->mode == TRAIN_MODE){ + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); + } + else if(ctx->mode == TEST_MODE){ + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + } + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + + if(ctx->dico_labels == NULL){ + fprintf(stderr, "cannot find label names\n"); + return 1; + } + + /* in train mode create feature dictionnary for perceptron */ + if(ctx->mode == TRAIN_MODE) + ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); + + /* in test mode read feature dictionnary for perceptron */ + if(ctx->mode == TEST_MODE) + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + /* add the feature dictionnary to the dico vector */ + // dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); + + /* open output file */ + output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout; + + generate_training_file(output_file, ctx); + + /* if(ctx->mode == TRAIN_MODE) + dico_vec_print(ctx->vocabs_filename, ctx->vocabs);*/ + + if(ctx->cff_filename) + fclose(output_file); + context_free(ctx); + return 0; +} +