diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt
index 1ffc335cc55b5988b509537b5e7527d6e3bbb46e..96b1df214ab9cc865dc0a023cfabf59bc4ca8f89 100644
--- a/maca_trans_parser/CMakeLists.txt
+++ b/maca_trans_parser/CMakeLists.txt
@@ -50,6 +50,12 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse)
 target_link_libraries(maca_trans_tagger_mcf2cff maca_common)
 install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin)
 
+add_executable(maca_error_predictor_tagger ./src/maca_error_predictor_tagger.c)
+target_link_libraries(maca_error_predictor_tagger perceptron)
+target_link_libraries(maca_error_predictor_tagger transparse)
+target_link_libraries(maca_error_predictor_tagger maca_common)
+install (TARGETS maca_error_predictor_tagger DESTINATION bin)
+
 add_executable(maca_trans_morpho_mcf2cff ./src/maca_trans_morpho_mcf2cff.c)
 target_link_libraries(maca_trans_morpho_mcf2cff perceptron)
 target_link_libraries(maca_trans_morpho_mcf2cff transparse)
diff --git a/maca_trans_parser/src/maca_error_predictor_tagger.c b/maca_trans_parser/src/maca_error_predictor_tagger.c
new file mode 100644
index 0000000000000000000000000000000000000000..f0ce5148463743607d876639df2a712a6a911e2a
--- /dev/null
+++ b/maca_trans_parser/src/maca_error_predictor_tagger.c
@@ -0,0 +1,273 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include<ctype.h>
+#include"movement_tagger.h"
+#include"oracle_tagger.h"
+#include"feat_fct.h"
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"word_emb.h"
+#include"config2feat_vec.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"mcd.h"
+
+/*
+ * Print on stdout the first two tab-separated columns of w->input followed
+ * by the string that dico_pos associates with postag, as one tab-separated
+ * line. A missing column is printed as an empty string.
+ * mcd_struct is not used; the parameter is kept for the callers.
+ */
+void print_word_simple(word *w, mcd *mcd_struct, dico *dico_pos, int postag)
+{
+  char *buffer;
+  char *first_col;
+  char *second_col;
+
+  (void)mcd_struct;
+
+  /* strtok() writes into the string it splits, so work on a private copy */
+  buffer = strdup(w->input);
+  if(buffer == NULL){
+    fprintf(stderr, "print_word_simple: out of memory\n");
+    exit(1);
+  }
+  first_col = strtok(buffer, "\t");
+  second_col = strtok(NULL, "\t");
+
+  /* strtok() returns NULL when a column is absent and NULL must not reach %s */
+  printf("%s\t%s\t%s\n",
+         first_col ? first_col : "",
+         second_col ? second_col : "",
+         dico_int2string(dico_pos, postag));
+
+  free(buffer);
+}
+
+/*
+ * Walk the word buffer from its last element down to the first and store in
+ * each word the signature that f2p gives for its form. The walk stops at the
+ * first word whose signature is already different from -1. When the form is
+ * not found as is, its lowercased copy is looked up instead.
+ */
+void add_signature_to_words_in_word_buffer_tagger(word_buffer *bf, form2pos *f2p)
+{
+  int i;
+  int written;
+  word *w;
+  char lower_form[1000];
+
+  for(i = word_buffer_get_nbelem(bf) - 1; i >= 0; i--){
+    w = word_buffer_get_word_n(bf, i);
+    if(word_get_signature(w) != -1) break;
+    w->signature = form2pos_get_signature(f2p, w->form);
+    if(w->signature != -1 || w->form == NULL) continue;
+
+    /* bounded copy: a form too long for lower_form keeps signature -1
+       instead of overflowing the buffer */
+    written = snprintf(lower_form, sizeof lower_form, "%s", w->form);
+    if(written < 0 || (size_t)written >= sizeof lower_form) continue;
+
+    to_lower_string(lower_form);
+    w->signature = form2pos_get_signature(f2p, lower_form);
+  }
+}
+
+/*
+ * Emit the usage text: calls the context_*_help_message helpers and writes
+ * the INPUT and OUTPUT section titles on stderr.
+ */
+void maca_error_predictor_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+  context_mcd_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_vocabs_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_vocabs_help_message(ctx);
+}
+
+/*
+ * Option validation hook. The test is currently disabled (constant 0), so
+ * the function never prints the help nor exits. The condition that was
+ * left in comment here is:
+ *   !ctx->input_filename || ctx->help
+ *   || !(ctx->cff_filename || ctx->fann_filename)
+ */
+void maca_error_predictor_check_options(context *ctx)
+{
+  if(0){
+    maca_error_predictor_help_message(ctx);
+    exit(1);
+  }
+}
+
+/*
+ * Return 1 when bm1p, bm2p and bm3p give the same values on c1 and c2,
+ * 0 otherwise.
+ */
+int config_is_equal_tagger(config *c1, config *c2)
+{
+  if(bm1p(c1) != bm1p(c2)) return 0;
+  if(bm2p(c1) != bm2p(c2)) return 0;
+  return (bm3p(c1) == bm3p(c2));
+}
+
+/*
+ * Run the oracle tagger and the perceptron model ctx->perc_model_filename
+ * side by side on the corpus ctx->input_filename. For every word, write on
+ * output_file 1 or 0 (result of config_is_equal_tagger after both moves),
+ * the oracle and predicted tag codes and the predicted feature vector.
+ * A trace of both decisions and of the first three entries of the array
+ * returned by feature_table_get_vcode_array is printed on stdout.
+ */
+void generate_error_train(FILE *output_file, context *ctx)
+{
+  const int shown_classes_nb = 3;
+  config *config_oracle;
+  config *config_predicted;
+  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
+  feat_vec *fv_oracle = feat_vec_new(feature_types_nb);
+  /* the corpus is opened twice: each configuration reads its own stream */
+  FILE *conll_file_oracle = myfopen(ctx->input_filename, "r");
+  dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
+  feat_vec *fv_predicted = feat_vec_new(feature_types_nb);
+  FILE *conll_file_predicted = myfopen(ctx->input_filename, "r");
+  vcode *vcode_array;
+  word *b0;
+  float max;
+  int postag_oracle;
+  int postag_predicted;
+  int same_config;
+  int i;
+
+  config_oracle = config_new(conll_file_oracle, ctx->mcd_struct, 5);
+  config_predicted = config_new(conll_file_predicted, ctx->mcd_struct, 5);
+
+  while(!config_is_terminal(config_oracle)){
+    /* oracle side */
+    config2feat_vec_cff(ctx->features_model, config_oracle, ctx->d_perceptron_features, fv_oracle, LOOKUP_MODE);
+    postag_oracle = oracle_tagger(config_oracle);
+    printf("Oracle : ");
+    print_word_simple(word_buffer_b0(config_oracle->bf), ctx->mcd_struct, dico_pos, postag_oracle);
+
+    /* model side */
+    b0 = word_buffer_b0(config_predicted->bf);
+    config2feat_vec_cff(ctx->features_model, config_predicted, ctx->d_perceptron_features, fv_predicted, LOOKUP_MODE);
+    postag_predicted = feature_table_argmax(fv_predicted, ft, &max);
+    printf("Predicted : ");
+    print_word_simple(b0, ctx->mcd_struct, dico_pos, postag_predicted);
+
+    vcode_array = feature_table_get_vcode_array(fv_predicted, ft);
+    for(i = 0; vcode_array != NULL && i < shown_classes_nb; i++){
+      fprintf(stdout, "%d\t", i);
+      fprintf(stdout, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
+    }
+    free(vcode_array);
+
+    if(postag_oracle != postag_predicted)
+      fprintf(stdout, "**************** DIFFERENT CHOICE ***********\n\n");
+    else
+      fprintf(stdout, "**************** EQUAL CHOICE ***********\n\n");
+
+    movement_tagger(config_oracle, postag_oracle);
+    movement_tagger(config_predicted, postag_predicted);
+
+    /* the two configurations are compared after both moves were applied */
+    same_config = config_is_equal_tagger(config_oracle, config_predicted) ? 1 : 0;
+    fprintf(output_file, "%d", same_config);
+    fprintf(output_file, " or : %d, pred : %d", postag_oracle, postag_predicted);
+    feat_vec_print(output_file, fv_predicted);
+  }
+
+  feat_vec_free(fv_oracle);
+  feat_vec_free(fv_predicted);
+  feature_table_free(ft);
+  config_free(config_oracle);
+  config_free(config_predicted);
+
+  fclose(conll_file_oracle);
+  fclose(conll_file_predicted);
+}
+
+/*
+ * Return current when it is already set, otherwise a heap copy of fallback.
+ * Exits when the copy cannot be allocated.
+ */
+static char *filename_or_default(char *current, const char *fallback)
+{
+  char *copy;
+
+  if(current != NULL) return current;
+  copy = strdup(fallback);
+  if(copy == NULL){
+    fprintf(stderr, "filename_or_default: out of memory\n");
+    exit(1);
+  }
+  return copy;
+}
+
+/*
+ * Entry point: read the options, load the tagger resources and write the
+ * error-prediction training data on ctx->cff_filename.
+ */
+int main(int argc, char *argv[])
+{
+  context *ctx;
+  FILE *output_file;
+
+  ctx = context_read_options(argc, argv);
+  /* option validation (maca_error_predictor_check_options) is not called */
+
+  /* The paths below are only fallbacks: a file name given on the command
+     line is kept instead of being overwritten.
+     NOTE(review): assumes context_read_options() leaves the file names
+     that were not given at NULL -- confirm against context.c */
+  ctx->perc_model_filename = filename_or_default(ctx->perc_model_filename, "/home/mathis/maca_data2/fr/bin/maca_trans_tagger.model");
+  ctx->features_model_filename = filename_or_default(ctx->features_model_filename, "/home/mathis/maca_data2/fr/bin/maca_trans_tagger.fm");
+  ctx->input_filename = filename_or_default(ctx->input_filename, "/home/mathis/maca_data2/fr/data/treebank/dev.conll07");
+  ctx->f2p_filename = filename_or_default(ctx->f2p_filename, "/home/mathis/maca_data2/fr/bin/fP");
+  ctx->vocabs_filename = filename_or_default(ctx->vocabs_filename, "/home/mathis/maca_data2/fr/bin/maca_trans_tagger.vocab");
+  ctx->cff_filename = filename_or_default(ctx->cff_filename, "/home/mathis/test/stage/error.cff");
+
+  ctx->mcd_struct = mcd_build_conll07();
+
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+  /* the feature model is read a single time, so no earlier copy is leaked */
+  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
+  mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
+
+  /* add the feature dictionary to the dico vector
+     NOTE(review): this dictionary was just fetched from ctx->vocabs, so it
+     may now be registered twice -- confirm before enabling context_free() */
+  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
+
+  /* open output file */
+  if(ctx->cff_filename)
+    output_file = myfopen(ctx->cff_filename, "w");
+  else
+    output_file = stdout;
+
+  generate_error_train(output_file, ctx);
+
+  if(output_file != stdout)
+    fclose(output_file);
+
+  /* context_free(ctx) stays disabled for now */
+
+  return 0;
+}