From 18b96fe3d975220dfd5a6d86c0a8fa082a957ab0 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Wed, 6 Jul 2016 16:49:51 -0400 Subject: [PATCH] added file maca_trans_parser_conll2cff.c transform_treebank is now obsolete --- maca_common/include/mcd.h | 3 + maca_lemmatizer/src/context.c | 2 +- maca_trans_parser/CMakeLists.txt | 2 +- maca_trans_parser/src/context.c | 39 +-- maca_trans_parser/src/context.h | 1 + maca_trans_parser/src/decode.c | 3 +- .../src/maca_trans_parser_conll2cff.c | 225 ++++++++++++++++++ .../src/maca_trans_parser_conll2fann.c | 2 +- maca_trans_parser/src/sentence.c | 2 +- maca_trans_parser/src/sentence.h | 1 - maca_trans_parser/src/simple_decoder.c | 7 +- maca_trans_parser/src/transform_treebank.c | 2 +- 12 files changed, 265 insertions(+), 24 deletions(-) create mode 100644 maca_trans_parser/src/maca_trans_parser_conll2cff.c diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index e759789..df53f7c 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -15,6 +15,9 @@ #include "word_emb.h" #include "dico_vec.h" +#define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL] + + typedef struct { int nb_col; int type2col[FEAT_TYPE_NB]; diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c index 9fa49c6..1bc694b 100644 --- a/maca_lemmatizer/src/context.c +++ b/maca_lemmatizer/src/context.c @@ -81,7 +81,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[8] = + static struct option long_options[10] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 32c84d5..acda80b 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -32,7 +32,7 @@ target_link_libraries(maca_trans_parser_conll2fann transparse) target_link_libraries(maca_trans_parser_conll2fann 
maca_common) install (TARGETS maca_trans_parser_conll2fann DESTINATION bin) -add_executable(maca_trans_parser_conll2cff ./src/transform_treebank.c) +add_executable(maca_trans_parser_conll2cff ./src/maca_trans_parser_conll2cff.c) target_link_libraries(maca_trans_parser_conll2cff transparse) target_link_libraries(maca_trans_parser_conll2cff maca_common) install (TARGETS maca_trans_parser_conll2cff DESTINATION bin) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 6558bc9..780b647 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -12,19 +12,20 @@ void context_set_linguistic_resources_filenames(context *ctx); void context_free(context *ctx) { - if(ctx->program_name) free(ctx->program_name); - if(ctx->conll_filename) free(ctx->conll_filename); - if(ctx->perc_model_filename) free(ctx->perc_model_filename); - if(ctx->dnn_model_filename) free(ctx->dnn_model_filename); - if(ctx->dico_features_filename) free(ctx->dico_features_filename); - if(ctx->dico_classes_filename) free(ctx->dico_classes_filename); - if(ctx->cff_filename) free(ctx->cff_filename); - if(ctx->fann_filename) free(ctx->fann_filename); - if(ctx->mcd_filename) free(ctx->mcd_filename); - if(ctx->stag_desc_filename) free(ctx->stag_desc_filename); + if(ctx->program_name) free(ctx->program_name); + if(ctx->conll_filename) free(ctx->conll_filename); + if(ctx->perc_model_filename) free(ctx->perc_model_filename); + if(ctx->dnn_model_filename) free(ctx->dnn_model_filename); + if(ctx->dico_features_filename) free(ctx->dico_features_filename); + if(ctx->dico_classes_filename) free(ctx->dico_classes_filename); + if(ctx->cff_filename) free(ctx->cff_filename); + if(ctx->fann_filename) free(ctx->fann_filename); + if(ctx->mcd_filename) free(ctx->mcd_filename); + if(ctx->stag_desc_filename) free(ctx->stag_desc_filename); if(ctx->features_model_filename) free(ctx->features_model_filename); - if(ctx->maca_data_path) free(ctx->maca_data_path); - 
if(ctx->language) free(ctx->language); + if(ctx->maca_data_path) free(ctx->maca_data_path); + if(ctx->language) free(ctx->language); + if(ctx->root_label) free(ctx->root_label); if(ctx->d_perceptron_features) dico_free(ctx->d_perceptron_features); @@ -59,6 +60,7 @@ context *context_new(void) c->maca_data_path = NULL; c->language = strdup("fr"); + c->root_label = strdup("root"); c->d_perceptron_features = NULL; c->mcd_struct = NULL; c->features_model = NULL; @@ -154,6 +156,9 @@ void context_maca_data_path_help_message(context *ctx){ fprintf(stderr, "\t-Y --maca_data_path : path to the maca_data directory\n"); } +void context_root_label_help_message(context *ctx){ + fprintf(stderr, "\t-R --root_label : name of the root label (default is \"root\")\n"); +} context *context_read_options(int argc, char *argv[]) { @@ -163,7 +168,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[25] = + static struct option long_options[26] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -189,12 +194,13 @@ context *context_read_options(int argc, char *argv[]) {"vocabs", required_argument, 0, 'V'}, {"stream", required_argument, 0, 'T'}, {"language", required_argument, 0, 'X'}, - {"maca_data_path", required_argument, 0, 'Y'} + {"maca_data_path", required_argument, 0, 'Y'}, + {"root_label", required_argument, 0, 'R'} }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:R:", long_options, &option_index)) != -1){ switch (c) { case 'd': @@ -270,6 +276,9 @@ context *context_read_options(int argc, char *argv[]) case 'Y': ctx->maca_data_path = strdup(optarg); break; + case 'R': + ctx->root_label = strdup(optarg); + break; } } diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 
222f8ad..2b0ed94 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -47,6 +47,7 @@ typedef struct { dico *dico_labels; char *maca_data_path; char *language; + char *root_label; } context; context *context_new(void); diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c index 6dcf4e3..fcd0e66 100644 --- a/maca_trans_parser/src/decode.c +++ b/maca_trans_parser/src/decode.c @@ -56,7 +56,6 @@ int main(int argc, char *argv[]) ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); - ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); if(ctx->dico_labels == NULL){ @@ -65,7 +64,7 @@ int main(int argc, char *argv[]) } ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; - root_label = dico_string2int(ctx->dico_labels, (char *)"root"); + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); if(root_label == -1) root_label = 0; diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c new file mode 100644 index 0000000..8193933 --- /dev/null +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff.c @@ -0,0 +1,225 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"movement.h" +#include"oracle.h" +#include"feat_fct.h" +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"corpus.h" +#include"word_emb.h" +#include"config2feat_vec.h" + +void maca_trans_parser_conll2cff_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_alphabet_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_alphabet_help_message(ctx); + +} + +void 
maca_trans_parser_conll2cff_check_options(context *ctx) +{ + if(!ctx->conll_filename + || ctx->help + /* || !ctx->mcd_filename */ + || !(ctx->cff_filename || ctx->fann_filename) + ){ + maca_trans_parser_conll2cff_help_message(ctx); + exit(1); + } +} + +void generate_training_file_stream(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + feat_vec *fv = feat_vec_new(feature_types_nb); + sentence *ref = NULL; + int sentence_nb = 0; + int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), ctx->root_label); + FILE *conll_file = myfopen(ctx->conll_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + + c = config_initial(conll_file, ctx->mcd_struct, 10, 5); + + while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ + /* sentence_print(stdout, ref, mcd_get_dico_label(ctx->mcd_struct)); */ + while(1){ + /* config_print(stdout,c); */ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + + mvt_code = oracle(c, ref); + + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ + + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + + if(queue_is_empty(c->bf)) break; + + if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */ + + /* create the root arc */ + movement_right_arc(c, mvt_label, 0); + + /* shift dummy word in stack */ + movement_shift(c, 1, 0); + + /* printf("sentence complete config : "); + config_print(stdout,c); */ + + /* empty depset */ + depset_free(c->ds); + c->ds = depset_new(); + sentence_free(ref); + sentence_nb++; + break; + } + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 1, 0); + 
continue; + } + } + } +} + +void generate_training_file_buffer(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + feat_vec *fv = feat_vec_new(feature_types_nb); + sentence *ref = NULL; + int sentence_nb = 0; + FILE *conll_file = myfopen(ctx->conll_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + + c = config_initial(conll_file, ctx->mcd_struct, 1000, 0); + + while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ + /* sentence_print(stdout, ref, NULL); */ + queue_read_sentence(c->bf, conll_file, ctx->mcd_struct); + while(!config_is_terminal(c)){ + /* config_print(stdout,c); */ + + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + + mvt_code = oracle(c, ref); + + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ + + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 0, 0); + continue; + } + } + config_free(c); + c = config_initial(conll_file, ctx->mcd_struct, 1000, 0); + sentence_nb++; + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + FILE *output_file; + + ctx = context_read_options(argc, argv); + maca_trans_parser_conll2cff_check_options(ctx); + + if(ctx->mode == TRAIN_MODE){ + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); + } + else if(ctx->mode == TEST_MODE){ + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); + } + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + if(ctx->dico_labels == NULL){ + 
fprintf(stderr, "cannot find label names\n"); + return 1; + } + ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; + + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); + + + /* in train mode create feature dictionnary for perceptron */ + if(ctx->mode == TRAIN_MODE) + ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); + + /* in test mode read feature dictionnary for perceptron */ + if(ctx->mode == TEST_MODE) + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + /* add the feature dictionnary to the dico vector */ + dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); + + /* open output file */ + if(ctx->cff_filename) + output_file = myfopen(ctx->cff_filename, "w"); + else + output_file = stdout; + + if(ctx->stream_mode) + generate_training_file_stream(output_file, ctx); + else + generate_training_file_buffer(output_file, ctx); + + if(ctx->mode == TRAIN_MODE){ + /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ + dico_vec_print(ctx->vocabs_filename, ctx->vocabs); + + } + + if(ctx->cff_filename) + fclose(output_file); + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/maca_trans_parser_conll2fann.c b/maca_trans_parser/src/maca_trans_parser_conll2fann.c index e61f8b4..d116b9a 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2fann.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2fann.c @@ -176,4 +176,4 @@ int main(int argc, char *argv[]) context_free(ctx); return 0; } -k + diff --git a/maca_trans_parser/src/sentence.c b/maca_trans_parser/src/sentence.c index 95d8977..63cdb8b 100644 --- a/maca_trans_parser/src/sentence.c +++ b/maca_trans_parser/src/sentence.c @@ -64,7 +64,7 @@ void sentence_free(sentence *s) sentence *sentence_read(FILE *f, mcd *mcd_struct) { - sentence *s = sentence_init(mcd_struct, f); + sentence *s = sentence_init(mcd_struct, f); char buffer[1000]; word *w = NULL; diff --git 
a/maca_trans_parser/src/sentence.h b/maca_trans_parser/src/sentence.h index 7210ddb..6a80509 100644 --- a/maca_trans_parser/src/sentence.h +++ b/maca_trans_parser/src/sentence.h @@ -3,7 +3,6 @@ #include"word.h" #include"util.h" -#include"depset.h" #include"mcd.h" typedef struct { diff --git a/maca_trans_parser/src/simple_decoder.c b/maca_trans_parser/src/simple_decoder.c index 052962a..fe39b22 100644 --- a/maca_trans_parser/src/simple_decoder.c +++ b/maca_trans_parser/src/simple_decoder.c @@ -76,10 +76,14 @@ void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico * c = config_initial(f, mcd_struct, 10, 5); while(!config_is_terminal(c)){ + config_print(stdout, c); config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE); + /* feat_vec_print_string(fv, dico_features); */ mvt_code = feature_table_argmax(fv, ft, &max); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); + + /* printf("code predicted = %d\n", mvt_code); */ if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */ @@ -94,7 +98,8 @@ void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico * /* config_print(stdout, c); */ config_connect_subtrees(c, root_label); - depset_print_new_index(stdout, c->ds, dico_labels); + /* depset_print_new_index(stdout, c->ds, dico_labels);*/ + depset_print2(stdout, c->ds, dico_labels); /* pop the dummy word */ stack_pop(c->st); diff --git a/maca_trans_parser/src/transform_treebank.c b/maca_trans_parser/src/transform_treebank.c index fff6b53..5c3006c 100644 --- a/maca_trans_parser/src/transform_treebank.c +++ b/maca_trans_parser/src/transform_treebank.c @@ -70,7 +70,7 @@ int generate_training_file_stream(FILE *output_file, context *ctx) sentence *ref = NULL; int nb_trans = 0; int sentence_nb = 0; - int root_label = dico_string2int(ctx->mcd_struct->dico_array[FEAT_TYPE_LABEL], (char *)"root"); + int root_label = 
dico_string2int(ctx->mcd_struct->dico_array[FEAT_TYPE_LABEL], ctx->root_label); FILE *conll_file = myfopen(ctx->conll_filename, "r"); FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); word *b0; -- GitLab