diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h index e759789f3667de115689dc6275f032161f3d3784..df53f7c8759c59f6a52c25047a8cf955b1e00fc3 100644 --- a/maca_common/include/mcd.h +++ b/maca_common/include/mcd.h @@ -15,6 +15,9 @@ #include "word_emb.h" #include "dico_vec.h" +#define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL] /* expands to the FEAT_TYPE_LABEL entry of m->dico_array; NOTE(review): the expansion has no outer parentheses, presumably safe because only postfix operators are used - confirm */ + + typedef struct { int nb_col; int type2col[FEAT_TYPE_NB]; diff --git a/maca_lemmatizer/src/context.c b/maca_lemmatizer/src/context.c index 9fa49c670e0959fc0e4e12616b97b260c5391be2..1bc694bfa3c9d0275168545fc5fe9041556426b1 100644 --- a/maca_lemmatizer/src/context.c +++ b/maca_lemmatizer/src/context.c @@ -81,7 +81,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[8] = + static struct option long_options[10] = { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 32c84d5d714f153f4a1f51e3a14767ad66b1f379..acda80b3fbf1359f660360511406062bcc49fd69 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -32,7 +32,7 @@ target_link_libraries(maca_trans_parser_conll2fann transparse) target_link_libraries(maca_trans_parser_conll2fann maca_common) install (TARGETS maca_trans_parser_conll2fann DESTINATION bin) -add_executable(maca_trans_parser_conll2cff ./src/transform_treebank.c) +add_executable(maca_trans_parser_conll2cff ./src/maca_trans_parser_conll2cff.c) target_link_libraries(maca_trans_parser_conll2cff transparse) target_link_libraries(maca_trans_parser_conll2cff maca_common) install (TARGETS maca_trans_parser_conll2cff DESTINATION bin) diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c index 6558bc94af0d64d17fe080303be580c9fc338f75..780b64785c9db6ca218a1efe0f1810664afc4db2 100644 --- a/maca_trans_parser/src/context.c +++ b/maca_trans_parser/src/context.c @@ -12,19 +12,20 @@ void
context_set_linguistic_resources_filenames(context *ctx); void context_free(context *ctx) { - if(ctx->program_name) free(ctx->program_name); - if(ctx->conll_filename) free(ctx->conll_filename); - if(ctx->perc_model_filename) free(ctx->perc_model_filename); - if(ctx->dnn_model_filename) free(ctx->dnn_model_filename); - if(ctx->dico_features_filename) free(ctx->dico_features_filename); - if(ctx->dico_classes_filename) free(ctx->dico_classes_filename); - if(ctx->cff_filename) free(ctx->cff_filename); - if(ctx->fann_filename) free(ctx->fann_filename); - if(ctx->mcd_filename) free(ctx->mcd_filename); - if(ctx->stag_desc_filename) free(ctx->stag_desc_filename); + if(ctx->program_name) free(ctx->program_name); + if(ctx->conll_filename) free(ctx->conll_filename); + if(ctx->perc_model_filename) free(ctx->perc_model_filename); + if(ctx->dnn_model_filename) free(ctx->dnn_model_filename); + if(ctx->dico_features_filename) free(ctx->dico_features_filename); + if(ctx->dico_classes_filename) free(ctx->dico_classes_filename); + if(ctx->cff_filename) free(ctx->cff_filename); + if(ctx->fann_filename) free(ctx->fann_filename); + if(ctx->mcd_filename) free(ctx->mcd_filename); + if(ctx->stag_desc_filename) free(ctx->stag_desc_filename); if(ctx->features_model_filename) free(ctx->features_model_filename); - if(ctx->maca_data_path) free(ctx->maca_data_path); - if(ctx->language) free(ctx->language); + if(ctx->maca_data_path) free(ctx->maca_data_path); + if(ctx->language) free(ctx->language); + if(ctx->root_label) free(ctx->root_label); if(ctx->d_perceptron_features) dico_free(ctx->d_perceptron_features); @@ -59,6 +60,7 @@ context *context_new(void) c->maca_data_path = NULL; c->language = strdup("fr"); + c->root_label = strdup("root"); c->d_perceptron_features = NULL; c->mcd_struct = NULL; c->features_model = NULL; @@ -154,6 +156,9 @@ void context_maca_data_path_help_message(context *ctx){ fprintf(stderr, "\t-Y --maca_data_path : path to the maca_data directory\n"); } +void
context_root_label_help_message(context *ctx){ /* print the usage line for the -R / --root_label option on stderr; ctx is not used */ + fprintf(stderr, "\t-R --root_label : name of the root label (default is \"root\")\n"); +} context *context_read_options(int argc, char *argv[]) { @@ -163,7 +168,7 @@ context *context_read_options(int argc, char *argv[]) ctx->program_name = strdup(argv[0]); - static struct option long_options[25] = + static struct option long_options[26] = /* NOTE(review): getopt_long needs an all-zero last element, so the size must stay one above the number of initializers - the full list is not visible here, confirm */ { {"help", no_argument, 0, 'h'}, {"verbose", no_argument, 0, 'v'}, @@ -189,12 +194,13 @@ context *context_read_options(int argc, char *argv[]) {"vocabs", required_argument, 0, 'V'}, {"stream", required_argument, 0, 'T'}, {"language", required_argument, 0, 'X'}, - {"maca_data_path", required_argument, 0, 'Y'} + {"maca_data_path", required_argument, 0, 'Y'}, + {"root_label", required_argument, 0, 'R'} }; optind = 0; opterr = 0; - while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:", long_options, &option_index)) != -1){ + while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:R:", long_options, &option_index)) != -1){ switch (c) { case 'd': @@ -270,6 +276,9 @@ context *context_read_options(int argc, char *argv[]) case 'Y': ctx->maca_data_path = strdup(optarg); break; + case 'R': + ctx->root_label = strdup(optarg); /* NOTE(review): overwrites the pointer without freeing it; context_new stores a strdup-ed default in root_label - confirm whether that default leaks here */ + break; } } diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 222f8ad31a978ad60623f56e99fbd57545ecf46d..2b0ed94a6cae35a67b2c767c81734c4ab04a177a 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -47,6 +47,7 @@ typedef struct { dico *dico_labels; char *maca_data_path; char *language; + char *root_label; } context; context *context_new(void); diff --git a/maca_trans_parser/src/decode.c b/maca_trans_parser/src/decode.c index 6dcf4e3229d7fe871cd3b1cdad17a09de7d3ba19..fcd0e66ba6f49fdee01ba14717ec70647b66dd01 100644 --- a/maca_trans_parser/src/decode.c +++ b/maca_trans_parser/src/decode.c @@ -56,7 +56,6 @@ int main(int argc, char *argv[]) ctx->vocabs
= dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); - ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); if(ctx->dico_labels == NULL){ @@ -65,7 +64,7 @@ int main(int argc, char *argv[]) } ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; - root_label = dico_string2int(ctx->dico_labels, (char *)"root"); + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); if(root_label == -1) root_label = 0; diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff.c b/maca_trans_parser/src/maca_trans_parser_conll2cff.c new file mode 100644 index 0000000000000000000000000000000000000000..81939332a90d226839fca522a5d7f2135e7a47d1 --- /dev/null +++ b/maca_trans_parser/src/maca_trans_parser_conll2cff.c @@ -0,0 +1,225 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"movement.h" +#include"oracle.h" +#include"feat_fct.h" +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"corpus.h" +#include"word_emb.h" +#include"config2feat_vec.h" + /* print the usage text of this program: the shared context_*_help_message lines grouped under INPUT and OUTPUT headings written to stderr */ +void maca_trans_parser_conll2cff_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_alphabet_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_alphabet_help_message(ctx); + +} + /* print the usage text and exit(1) when no conll file was given, when help was requested, or when neither cff_filename nor fann_filename is set */ +void maca_trans_parser_conll2cff_check_options(context *ctx) +{ + if(!ctx->conll_filename + || ctx->help + /* || !ctx->mcd_filename */ + || !(ctx->cff_filename || ctx->fann_filename) + ){ + maca_trans_parser_conll2cff_help_message(ctx); + exit(1); + } +} + /* stream variant: opens ctx->conll_filename twice (one handle feeds the configuration, the other reads the reference sentences) and, for every configuration met while following the oracle, writes the movement code followed by the feature vector to output_file; stops after ctx->sent_nb sentences. NOTE(review): fv, c and both input files are not released before returning - confirm this is intended */ +void generate_training_file_stream(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + feat_vec *fv = feat_vec_new(feature_types_nb);
+ sentence *ref = NULL; + int sentence_nb = 0; + int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), ctx->root_label); /* NOTE(review): the decode.c hunk maps a -1 result to 0, there is no such fallback here - confirm */ + FILE *conll_file = myfopen(ctx->conll_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + + c = config_initial(conll_file, ctx->mcd_struct, 10, 5); + + while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ + /* sentence_print(stdout, ref, mcd_get_dico_label(ctx->mcd_struct)); */ + while(1){ + /* config_print(stdout,c); */ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + + mvt_code = oracle(c, ref); + + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ + + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + + if(queue_is_empty(c->bf)) break; /* NOTE(review): leaving the loop here skips sentence_free(ref) and sentence_nb++ - confirm */ + + if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */ + + /* create the root arc */ + movement_right_arc(c, mvt_label, 0); + + /* shift dummy word in stack */ + movement_shift(c, 1, 0); + + /* printf("sentence complete config : "); + config_print(stdout,c); */ + + /* empty depset */ + depset_free(c->ds); + c->ds = depset_new(); + sentence_free(ref); + sentence_nb++; + break; + } + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 1, 0); + continue; + } + } + } +} + /* buffer variant: for each reference sentence, queue_read_sentence fills the buffer of the configuration, then oracle movements are applied until config_is_terminal(c), writing the movement code followed by the feature vector to output_file at every step; the configuration is freed and rebuilt after each sentence; stops after ctx->sent_nb sentences */ +void generate_training_file_buffer(FILE *output_file, context *ctx) +{ + config *c; + int mvt_code; + char mvt_type; + int mvt_label; + feat_vec *fv = feat_vec_new(feature_types_nb); + sentence *ref = NULL; + int sentence_nb = 0; + FILE *conll_file = myfopen(ctx->conll_filename, "r"); + FILE *conll_file_ref = myfopen(ctx->conll_filename, "r"); + + c = config_initial(conll_file, ctx->mcd_struct, 1000, 0); +
+ while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){ + /* sentence_print(stdout, ref, NULL); */ + queue_read_sentence(c->bf, conll_file, ctx->mcd_struct); + while(!config_is_terminal(c)){ + /* config_print(stdout,c); */ + + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + + mvt_code = oracle(c, ref); + + mvt_type = movement_type(mvt_code); + mvt_label = movement_label(mvt_code); + + /* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */ + + fprintf(output_file, "%d", mvt_code); + feat_vec_print(output_file, fv); + + if(mvt_type == MVT_LEFT){ + movement_left_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_RIGHT){ + movement_right_arc(c, mvt_label, 0); + continue; + } + if(mvt_type == MVT_SHIFT){ + movement_shift(c, 0, 0); + continue; + } + } + config_free(c); + c = config_initial(conll_file, ctx->mcd_struct, 1000, 0); + sentence_nb++; /* NOTE(review): ref is never passed to sentence_free in this loop, unlike the stream variant - confirm */ + } +} + /* entry point: parse and check the options, build the vocabularies from the corpus (train mode) or read them from ctx->vocabs_filename (test mode), derive mvt_nb from the LABEL dictionary, generate the output in stream or buffer mode, and in train mode hand the vocabularies to dico_vec_print; returns 1 when the LABEL dictionary is missing, 0 otherwise */ +int main(int argc, char *argv[]) +{ + context *ctx; + FILE *output_file; + + ctx = context_read_options(argc, argv); + maca_trans_parser_conll2cff_check_options(ctx); + + if(ctx->mode == TRAIN_MODE){ + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); + } + else if(ctx->mode == TEST_MODE){ + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs); + } + + ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL"); + if(ctx->dico_labels == NULL){ + fprintf(stderr, "cannot find label names\n"); + return 1; + } + ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1; + + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); + + + /* in train mode create feature dictionary for perceptron */ + if(ctx->mode == TRAIN_MODE) + ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); + + /* in test mode read feature dictionary for perceptron */ +
if(ctx->mode == TEST_MODE) + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + /* add the feature dictionary to the dico vector */ + dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); /* NOTE(review): in test mode this dictionary was just fetched from ctx->vocabs, so it is added to the same vector again - confirm dico_vec_add tolerates that */ + + /* open output file */ + if(ctx->cff_filename) + output_file = myfopen(ctx->cff_filename, "w"); + else + output_file = stdout; /* NOTE(review): check_options exits unless cff_filename or fann_filename is set, so this branch is only reached when just fann_filename was given - confirm intent */ + + if(ctx->stream_mode) + generate_training_file_stream(output_file, ctx); + else + generate_training_file_buffer(output_file, ctx); + + if(ctx->mode == TRAIN_MODE){ + /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ + dico_vec_print(ctx->vocabs_filename, ctx->vocabs); + + } + + if(ctx->cff_filename) + fclose(output_file); + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/maca_trans_parser_conll2fann.c b/maca_trans_parser/src/maca_trans_parser_conll2fann.c index e61f8b4baef63239845b15ba6d43c3634dad2c06..d116b9a6e333476d9aa744190ef47231a34f7cd8 100644 --- a/maca_trans_parser/src/maca_trans_parser_conll2fann.c +++ b/maca_trans_parser/src/maca_trans_parser_conll2fann.c @@ -176,4 +176,4 @@ int main(int argc, char *argv[]) context_free(ctx); return 0; } -k + diff --git a/maca_trans_parser/src/sentence.c b/maca_trans_parser/src/sentence.c index 95d8977e6e69c730f041f19340908344f378fcbe..63cdb8b8a76caebd127b24530d2321ecf46671ab 100644 --- a/maca_trans_parser/src/sentence.c +++ b/maca_trans_parser/src/sentence.c @@ -64,7 +64,7 @@ void sentence_free(sentence *s) sentence *sentence_read(FILE *f, mcd *mcd_struct) { - sentence *s = sentence_init(mcd_struct, f); + sentence *s = sentence_init(mcd_struct, f); char buffer[1000]; word *w = NULL; diff --git a/maca_trans_parser/src/sentence.h b/maca_trans_parser/src/sentence.h index 7210ddbd2c106f2c8052b986895bc8a41446e33b..6a80509dab72ff1627001a81158936d2cc180841 100644 --- a/maca_trans_parser/src/sentence.h +++ b/maca_trans_parser/src/sentence.h @@ -3,7 +3,6 @@ #include"word.h" #include"util.h" -#include"depset.h"
#include"mcd.h" typedef struct { diff --git a/maca_trans_parser/src/simple_decoder.c b/maca_trans_parser/src/simple_decoder.c index 052962a2cc96ba43cd4bf2c139bd17ce19c800fd..fe39b223bbe575e3ac4f561704ee4f0fb0e261fa 100644 --- a/maca_trans_parser/src/simple_decoder.c +++ b/maca_trans_parser/src/simple_decoder.c @@ -76,10 +76,14 @@ void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico * c = config_initial(f, mcd_struct, 10, 5); while(!config_is_terminal(c)){ + config_print(stdout, c); config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE); + /* feat_vec_print_string(fv, dico_features); */ mvt_code = feature_table_argmax(fv, ft, &max); mvt_type = movement_type(mvt_code); mvt_label = movement_label(mvt_code); + + /* printf("code predicted = %d\n", mvt_code); */ if((stack_height(c->st)==1) && (mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */ @@ -94,7 +98,8 @@ void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico * /* config_print(stdout, c); */ config_connect_subtrees(c, root_label); - depset_print_new_index(stdout, c->ds, dico_labels); + /* depset_print_new_index(stdout, c->ds, dico_labels);*/ + depset_print2(stdout, c->ds, dico_labels); /* pop the dummy word */ stack_pop(c->st); diff --git a/maca_trans_parser/src/transform_treebank.c b/maca_trans_parser/src/transform_treebank.c index fff6b53b2d2950e7779f385e3266d539f4ff48ed..5c3006c04b7d3af0923aa80ad4d9f2d0fe0c6be1 100644 --- a/maca_trans_parser/src/transform_treebank.c +++ b/maca_trans_parser/src/transform_treebank.c @@ -70,7 +70,7 @@ int generate_training_file_stream(FILE *output_file, context *ctx) sentence *ref = NULL; int nb_trans = 0; int sentence_nb = 0; - int root_label = dico_string2int(ctx->mcd_struct->dico_array[FEAT_TYPE_LABEL], (char *)"root"); + int root_label = dico_string2int(ctx->mcd_struct->dico_array[FEAT_TYPE_LABEL], ctx->root_label); FILE *conll_file = myfopen(ctx->conll_filename, "r"); FILE *conll_file_ref = 
myfopen(ctx->conll_filename, "r"); word *b0;