#include<stdio.h> #include<stdlib.h> #include<string.h> #include<unistd.h> #include<getopt.h> #include"movement_parser.h" #include"oracle_parser.h" #include"feat_fct.h" #include"context.h" #include"feat_vec.h" #include"dico_vec.h" #include"word_emb.h" #include"config2feat_vec.h" void maca_trans_parser_conll2cff_help_message(context *ctx) { context_general_help_message(ctx); context_mode_help_message(ctx); context_sent_nb_help_message(ctx); fprintf(stderr, "INPUT\n"); context_conll_help_message(ctx); fprintf(stderr, "IN TEST MODE\n"); context_vocabs_help_message(ctx); fprintf(stderr, "OUTPUT\n"); context_cff_help_message(ctx); fprintf(stderr, "IN TRAIN MODE\n"); context_vocabs_help_message(ctx); } void maca_trans_parser_conll2cff_check_options(context *ctx) { if(!ctx->input_filename || ctx->help /* || !ctx->mcd_filename */ /* || !(ctx->cff_filename || ctx->fann_filename) */ ){ maca_trans_parser_conll2cff_help_message(ctx); exit(1); } } void generate_training_file_stream(FILE *output_file, context *ctx) { config *c; int mvt_code; char mvt_type; int mvt_label; feat_vec *fv = feat_vec_new(feature_types_nb); sentence *ref = NULL; int sentence_nb = 0; /* int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), (char *) ctx->root_label); */ int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label); FILE *conll_file = myfopen(ctx->input_filename, "r"); FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); c = config_initial(conll_file, ctx->mcd_struct, 5); while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ /* sentence_print(stdout, ref, ctx->dico_labels); */ while(1){ /* config_print(stdout,c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); /* feat_vec_print(stdout, fv); */ mvt_code = oracle_parser(c, ref); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); /* printf("mvt code = %d\n", mvt_code); */ /* movement_print(stdout, mvt_code, ctx->dico_labels); */ fprintf(output_file, "%d", mvt_code); feat_vec_print(output_file, fv); if(queue_is_empty(c->bf)) break; if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */ /* create the root arc */ movement_right_arc(c, mvt_label, 0); /* shift dummy word in stack */ movement_shift(c, 1, 0); /* printf("sentence complete config : "); config_print(stdout,c); */ /* empty depset */ depset_free(c->ds); c->ds = depset_new(); sentence_free(ref); sentence_nb++; c->current_index = queue_renumber_words(c->bf); break; } if(mvt_type == MVT_LEFT){ movement_left_arc(c, mvt_label, 0); continue; } if(mvt_type == MVT_RIGHT){ movement_right_arc(c, mvt_label, 0); continue; } if(mvt_type == MVT_SHIFT){ movement_shift(c, 1, 0); continue; } } } } void generate_training_file_buffer(FILE *output_file, context *ctx) { config *c; int mvt_code; char mvt_type; int mvt_label; feat_vec *fv = feat_vec_new(feature_types_nb); sentence *ref = NULL; int sentence_nb = 0; FILE *conll_file = myfopen(ctx->input_filename, "r"); FILE *conll_file_ref = myfopen(ctx->input_filename, "r"); c = config_initial(conll_file, ctx->mcd_struct, 0); while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ /* sentence_print(stdout, ref, NULL); */ queue_read_sentence(c->bf, conll_file, ctx->mcd_struct); while(!config_is_terminal(c)){ /* config_print(stdout,c); */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); mvt_code = oracle_parser(c, ref); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ fprintf(output_file, "%d", mvt_code); feat_vec_print(output_file, fv); if(mvt_type == MVT_LEFT){ movement_left_arc(c, mvt_label, 0); continue; } if(mvt_type == MVT_RIGHT){ movement_right_arc(c, mvt_label, 0); continue; } if(mvt_type == MVT_SHIFT){ movement_shift(c, 0, 0); continue; } } config_free(c); c = config_initial(conll_file, ctx->mcd_struct, 0); sentence_nb++; } } int main(int argc, char *argv[]) { context *ctx; FILE *output_file; ctx = context_read_options(argc, argv); maca_trans_parser_conll2cff_check_options(ctx); ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); } else if(ctx->mode == TEST_MODE){ ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); } /* dico_vec_print(NULL, ctx->vocabs); */ ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); if(ctx->dico_labels == NULL){ fprintf(stderr, "cannot find label names\n"); return 1; } ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); /* in train mode create feature dictionnary for perceptron */ if(ctx->mode == TRAIN_MODE) ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); /* in test mode read feature dictionnary for perceptron */ if(ctx->mode == TEST_MODE) ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); /* add the feature dictionnary to the dico vector */ dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); /* open output file */ if(ctx->cff_filename) output_file = myfopen(ctx->cff_filename, "w"); else output_file = stdout; if(ctx->stream_mode) generate_training_file_stream(output_file, ctx); else generate_training_file_buffer(output_file, ctx); if(ctx->mode == TRAIN_MODE){ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ dico_vec_print(ctx->vocabs_filename, ctx->vocabs); } if(ctx->cff_filename) fclose(output_file); context_free(ctx); return 0; }