From 44a96605c2ca9e30db451b6a922ec534b2a9e906 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Mon, 14 Nov 2016 10:00:55 -0500 Subject: [PATCH] added maca_tokenizer an extremely basic tokenizer --- CMakeLists.txt | 2 + maca_graph_parser/maca_graph_parser.c | 1 - .../maca_graph_parser_alphabet.c | 5 - maca_graph_parser/maca_graph_parser_corpora.c | 1 - .../maca_graph_parser_decode_main.c | 2 +- .../maca_graph_parser_decoder2.c | 2 +- .../maca_graph_parser_dep_count_table.c | 3 +- .../maca_graph_parser_feature_table.c | 8 +- .../maca_graph_parser_features.c | 6 +- .../maca_graph_parser_hyperdecoder.c | 4 +- maca_graph_parser/maca_graph_parser_main.c | 2 +- .../maca_graph_parser_print_model_main.c | 2 +- .../maca_graph_parser_sentence.c | 2 +- maca_graph_parser/simple_parser.cc | 4 +- maca_tokenizer/CMakeLists.txt | 4 + maca_tokenizer/main.c | 7 ++ maca_tokenizer/tok_rules.l | 16 +++ maca_trans_parser/CMakeLists.txt | 6 + maca_trans_parser/src/feat_fct.c | 2 +- .../src/maca_trans_tagger_mcf2cff_bt.c | 119 +++++++++++------- .../src/simple_decoder_tagger_bt.c | 22 ++-- 21 files changed, 138 insertions(+), 82 deletions(-) create mode 100644 maca_tokenizer/CMakeLists.txt create mode 100644 maca_tokenizer/main.c create mode 100644 maca_tokenizer/tok_rules.l diff --git a/CMakeLists.txt b/CMakeLists.txt index bd2ed1c..519d490 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 2.8.7) project(macaon2) add_definitions("-Wall") +find_package(FLEX) include_directories(maca_common/include) include_directories(perceptron/lib/include) @@ -10,6 +11,7 @@ add_subdirectory(maca_common) add_subdirectory(maca_tools) add_subdirectory(perceptron) #add_subdirectory(maca_lemmatizer) +add_subdirectory(maca_tokenizer) add_subdirectory(maca_trans_parser) add_subdirectory(maca_crf_tagger) add_subdirectory(maca_graph_parser) diff --git a/maca_graph_parser/maca_graph_parser.c b/maca_graph_parser/maca_graph_parser.c index 4869972..bc066ca 100644 --- a/maca_graph_parser/maca_graph_parser.c +++ b/maca_graph_parser/maca_graph_parser.c @@ -336,7 +336,6 @@ maca_graph_parser_ctx * maca_graph_parser_LoadCTX(int argc, char ** argv) { void maca_graph_parser_init(maca_graph_parser_ctx * ctx) { - int i; /* lexicon */ /* used only in maca_graph_parser_sentence, at the moment */ /* filename in cfg */ diff --git a/maca_graph_parser/maca_graph_parser_alphabet.c b/maca_graph_parser/maca_graph_parser_alphabet.c index c96f164..c162196 100644 --- a/maca_graph_parser/maca_graph_parser_alphabet.c +++ b/maca_graph_parser/maca_graph_parser_alphabet.c @@ -92,8 +92,6 @@ char * maca_graph_parser_alphabet_get_symbol(maca_graph_parser_alphabet *a, int void maca_graph_parser_alphabet_print4(char *filename, maca_graph_parser_alphabet *a1, maca_graph_parser_alphabet *a2, maca_graph_parser_alphabet *a3, maca_graph_parser_alphabet *a4) { FILE *f; - int i; - char *symbol; if(filename == NULL) f = stdout; @@ -116,8 +114,6 @@ void maca_graph_parser_alphabet_print4(char *filename, maca_graph_parser_alphabe void maca_graph_parser_alphabet_print5(char *filename, maca_graph_parser_alphabet *a1, maca_graph_parser_alphabet *a2, maca_graph_parser_alphabet *a3, maca_graph_parser_alphabet *a4, maca_graph_parser_alphabet *a5) { FILE *f; - int i; - char *symbol; if(filename == NULL) f = stdout; @@ -224,7 +220,6 @@ maca_graph_parser_alphabet **maca_graph_parser_alphabet_load5(char *filename) maca_graph_parser_alphabet *maca_graph_parser_alphabet_load(char *filename) { FILE *f; - int i; char symbol[1000]; maca_graph_parser_alphabet *a = NULL; diff --git a/maca_graph_parser/maca_graph_parser_corpora.c b/maca_graph_parser/maca_graph_parser_corpora.c index ad90a37..03869a5 100644 --- a/maca_graph_parser/maca_graph_parser_corpora.c +++ b/maca_graph_parser/maca_graph_parser_corpora.c @@ -162,7 +162,6 @@ hyp_ref_vector *load_mcf_corpus(maca_graph_parser_ctx *ctx){ hyp_ref_vector *v = allocate_hyp_ref_vector(ctx->sent_nb); maca_graph_parser_sentence *ref_s = NULL; maca_graph_parser_sentence *hyp_s = NULL; - int col_id; maca_mcf *format; maca_mcf_column *column; char buffer[128]; diff --git a/maca_graph_parser/maca_graph_parser_decode_main.c b/maca_graph_parser/maca_graph_parser_decode_main.c index b5b7bbe..cbe41c9 100644 --- a/maca_graph_parser/maca_graph_parser_decode_main.c +++ b/maca_graph_parser/maca_graph_parser_decode_main.c @@ -31,7 +31,7 @@ int main(int argc, char **argv) { - char c; + /* char c; */ maca_graph_parser_ctx * ctx; maca_graph_parser_sentence *sentence; int sent_num; diff --git a/maca_graph_parser/maca_graph_parser_decoder2.c b/maca_graph_parser/maca_graph_parser_decoder2.c index 6320294..4494e68 100644 --- a/maca_graph_parser/maca_graph_parser_decoder2.c +++ b/maca_graph_parser/maca_graph_parser_decoder2.c @@ -237,7 +237,7 @@ void maca_graph_parser_decoder2_decode(maca_graph_parser_ctx *ctx, maca_graph_pa /* */ Closed *max_C; Open *max_O; - int argmax_i; + /* int argmax_i; */ int label_argmax; Open *cand_O; float score_cand_O; diff --git a/maca_graph_parser/maca_graph_parser_dep_count_table.c b/maca_graph_parser/maca_graph_parser_dep_count_table.c index 93e7c3c..1acde6e 100644 --- a/maca_graph_parser/maca_graph_parser_dep_count_table.c +++ b/maca_graph_parser/maca_graph_parser_dep_count_table.c @@ -110,7 +110,8 @@ void maca_graph_parser_dep_count_table_print(maca_graph_parser_ctx * ctx, char * { FILE *f; maca_graph_parser_dep_count_table t = ctx->dep_count_table; - int gov, dep, label, dir, count, length_class; + int gov, dep, label, length_class; + /* int dir, count; */ if(filename == NULL) f = stdout; diff --git a/maca_graph_parser/maca_graph_parser_feature_table.c b/maca_graph_parser/maca_graph_parser_feature_table.c index 5016623..cf64482 100644 --- a/maca_graph_parser/maca_graph_parser_feature_table.c +++ b/maca_graph_parser/maca_graph_parser_feature_table.c @@ -71,7 +71,7 @@ void maca_graph_parser_feature_table_fill(maca_graph_parser_ctx *ctx, maca_graph feat_vector *fv_grandchildren = NULL; feat_vector *fv_sibling = NULL; - int labels_nb = ctx->labels_nb; + /* int labels_nb = ctx->labels_nb; */ /* default edge label: joker */ int dft_label = ctx->fct_joker; @@ -324,10 +324,10 @@ void maca_graph_parser_feature_table_fill(maca_graph_parser_ctx *ctx, maca_graph void maca_graph_parser_feature_table_free(maca_graph_parser_ctx *ctx) { - int i,j,k,l; + /* int i,j,k,l; */ maca_graph_parser_feature_table *d = ctx->feature_table; - int length = d->len; - int types = d->typesLen; + /* int length = d->len; */ + /* int types = d->typesLen; */ if(ctx->basic_features){ free(d->pl[0][0]); diff --git a/maca_graph_parser/maca_graph_parser_features.c b/maca_graph_parser/maca_graph_parser_features.c index 9b04fae..1b07c62 100644 --- a/maca_graph_parser/maca_graph_parser_features.c +++ b/maca_graph_parser/maca_graph_parser_features.c @@ -160,7 +160,7 @@ templ *maca_graph_parser_templ_allocator(int v0, int v1, int v2, int v3, int v4, { int start = 0; int end; - int l; + /* int l; */ templ *t = malloc(sizeof(templ)); if(t == NULL){ fprintf(stderr, "memory allocation error\n"); @@ -658,7 +658,7 @@ feat_vector *first(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, in int subcat_feats_nb = s->synt_feats_nb[gov]; int *subcat_feats_array = s->synt_feats_array[gov]; - int i,j; + int i; /* fprintf(stderr, "extract first order features : gov : (%d,%s,%s) dep :(%d,%s,%s)\n", gov, */ @@ -1036,7 +1036,7 @@ feat_vector *grandchildren(maca_graph_parser_sentence *s, maca_graph_parser_ctx int subcat_feats_nb = s->synt_feats_nb[gov]; int *subcat_feats_array = s->synt_feats_array[gov]; - int i,j; + int i; if(gdep == -1){ diff --git a/maca_graph_parser/maca_graph_parser_hyperdecoder.c b/maca_graph_parser/maca_graph_parser_hyperdecoder.c index 50b98ec..c883475 100644 --- a/maca_graph_parser/maca_graph_parser_hyperdecoder.c +++ b/maca_graph_parser/maca_graph_parser_hyperdecoder.c @@ -259,7 +259,7 @@ void maca_graph_parser_hyperdecoder_init(maca_graph_parser_ctx *ctx, maca_graph_ int m; int j[2] = {0, 0}; /* min dep count */ - int dep_count; + /* int dep_count; */ /* test: default edge label */ /* int dft_label = maca_tags_get_code(ctx->cfg, "morpho", "fct", "__JOKER__"); */ int dft_label = maca_alphabet_get_code(ctx->labels_alphabet, "__JOKER__"); @@ -723,7 +723,7 @@ void find_kbest(maca_graph_parser_ctx *ctx, Vertex *v, int k, maca_graph_parser_ */ int i; - int j[2] = {-1, -1}; + /* int j[2] = {-1, -1}; */ int ja; int jb; heap *cand; diff --git a/maca_graph_parser/maca_graph_parser_main.c b/maca_graph_parser/maca_graph_parser_main.c index ea3c637..08d281c 100644 --- a/maca_graph_parser/maca_graph_parser_main.c +++ b/maca_graph_parser/maca_graph_parser_main.c @@ -75,7 +75,7 @@ int main(int argc, char **argv) void maca_graph_parser_decode_main(maca_graph_parser_ctx * ctx) { maca_alphabet_array *alpha_array; - int i; + /* int i; */ int sent_num; /*maca_sentence * ms;*/ diff --git a/maca_graph_parser/maca_graph_parser_print_model_main.c b/maca_graph_parser/maca_graph_parser_print_model_main.c index 20023da..9c9dd24 100644 --- a/maca_graph_parser/maca_graph_parser_print_model_main.c +++ b/maca_graph_parser/maca_graph_parser_print_model_main.c @@ -9,7 +9,7 @@ int main(int argc, char *argv[]) int i,j; FILE *f = stdout; maca_graph_parser_ctx * ctx; - int hval; + /* int hval; */ maca_alphabet_array *alpha_array; maca_graph_parser_model *model = NULL; diff --git a/maca_graph_parser/maca_graph_parser_sentence.c b/maca_graph_parser/maca_graph_parser_sentence.c index d50cc4f..269baae 100644 --- a/maca_graph_parser/maca_graph_parser_sentence.c +++ b/maca_graph_parser/maca_graph_parser_sentence.c @@ -436,7 +436,7 @@ maca_mcf_sentence *maca_graph_parser_read_mcf_sentence(maca_graph_parser_ctx *ct int length; maca_mcf_word *mcf_word = NULL; int index, k; - int code_postag, code_lemma, code_form, code_label, code_synt_feat, gov; + int code_postag, code_lemma, code_form, code_label, gov; int nb_synt_feats; int *synt_feats; char invalid_sentence; diff --git a/maca_graph_parser/simple_parser.cc b/maca_graph_parser/simple_parser.cc index f2821c5..15ca59f 100644 --- a/maca_graph_parser/simple_parser.cc +++ b/maca_graph_parser/simple_parser.cc @@ -97,8 +97,8 @@ macaon::Parser::Parser( ctx = maca_graph_parser_LoadCTX(argc, (char**) argv); ctx->verbose_flag = verbose_flag; - int i; - int sent_num; + // int i; + //int sent_num; /* alphabets */ /* load alphabets */ diff --git a/maca_tokenizer/CMakeLists.txt b/maca_tokenizer/CMakeLists.txt new file mode 100644 index 0000000..d524f50 --- /dev/null +++ b/maca_tokenizer/CMakeLists.txt @@ -0,0 +1,4 @@ +FLEX_TARGET(tokenizer tok_rules.l ${CMAKE_CURRENT_BINARY_DIR}/maca_tokenizer.c) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +add_executable(maca_tokenizer main.c ${FLEX_tokenizer_OUTPUTS}) +install (TARGETS maca_tokenizer DESTINATION bin) diff --git a/maca_tokenizer/main.c b/maca_tokenizer/main.c new file mode 100644 index 0000000..f0bef4c --- /dev/null +++ b/maca_tokenizer/main.c @@ -0,0 +1,7 @@ +int main(int argc, char* argv[]) { + + yylex() ; + + return 0; +} + diff --git a/maca_tokenizer/tok_rules.l b/maca_tokenizer/tok_rules.l new file mode 100644 index 0000000..d60af0e --- /dev/null +++ b/maca_tokenizer/tok_rules.l @@ -0,0 +1,16 @@ +%{ +#include <stdio.h> +%} +%option noyywrap +%% +" "+ printf("\n"); +\. printf("\n."); +\, printf("\n,"); +' printf("'\n"); +’ printf("'\n"); +\n+ printf("\n"); +du printf("de\nle"); +des printf("de\nles"); +au printf("à\nle"); +aux printf("à\nles"); +%% diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index a64e8f8..18eea28 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -102,6 +102,12 @@ target_link_libraries(maca_trans_tagger transparse) target_link_libraries(maca_trans_tagger maca_common) install (TARGETS maca_trans_tagger DESTINATION bin) +add_executable(maca_trans_tagger_bt ./src/maca_trans_tagger_bt.c) +target_link_libraries(maca_trans_tagger_bt perceptron) +target_link_libraries(maca_trans_tagger_bt transparse) +target_link_libraries(maca_trans_tagger_bt maca_common) +install (TARGETS maca_trans_tagger_bt DESTINATION bin) + #add_executable(maca_trans_parser_train ./src/train_perceptron.c) #target_compile_options(maca_trans_parser_train INTERFACE -Wall) #target_link_libraries(maca_trans_parser_train perceptron) diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c index 392fece..608c53b 100644 --- a/maca_trans_parser/src/feat_fct.c +++ b/maca_trans_parser/src/feat_fct.c @@ -656,7 +656,7 @@ int mvt1(config *c) int delta1(config *c) { if(c->vcode_array == NULL) return -1; - int delta = (int) (c->vcode_array[0].score - c->vcode_array[1].score); + int delta = (int) (c->vcode_array[0].score - c->vcode_array[1].score); return (delta >= 10)? 10: delta; } diff --git a/maca_trans_parser/src/maca_trans_tagger_mcf2cff_bt.c b/maca_trans_parser/src/maca_trans_tagger_mcf2cff_bt.c index 68eb95f..364120e 100644 --- a/maca_trans_parser/src/maca_trans_tagger_mcf2cff_bt.c +++ b/maca_trans_parser/src/maca_trans_tagger_mcf2cff_bt.c @@ -70,79 +70,100 @@ void generate_training_file(FILE *output_file, context *ctx) FILE *conll_file = myfopen(ctx->input_filename, "r"); FILE *conll_file_predicted = myfopen(ctx->input_filename, "r"); int postag_oracle; - dico *dico_pos_oracle = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); + /* dico *dico_pos_oracle = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); */ feat_model *local_feat_model = feat_model_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.fm", ctx->verbose); dico_vec *local_dico_vec = dico_vec_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.vocab", ctx->hash_ratio); - dico *dico_pos_local = dico_vec_get_dico(local_dico_vec, (char *)"POS"); + /* dico *dico_pos_local = dico_vec_get_dico(local_dico_vec, (char *)"POS"); */ feature_table *local_ft = feature_table_load("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.model", ctx->verbose); dico *local_perceptron_features = dico_vec_get_dico(local_dico_vec, (char *)"d_perceptron_features"); config *config_predicted; int postag_predicted; - int i; - char *postag_oracle_string; - char *postag_predicted_string; - + /* int i; */ + /* char *postag_oracle_string; */ + /* char *postag_predicted_string; */ + config_predicted = config_new(conll_file_predicted, ctx->mcd_struct, 5); config_oracle = config_new(conll_file, ctx->mcd_struct, 5); - + while(!config_is_terminal(config_oracle)){ if(ctx->f2p){ add_signature_to_words_in_word_buffer(config_oracle->bf, ctx->f2p); add_signature_to_words_in_word_buffer(config_predicted->bf, ctx->f2p); } - + postag_oracle = word_get_pos(word_buffer_b0(config_get_buffer(config_oracle))); - postag_oracle_string = dico_int2string(dico_pos_oracle, postag_oracle); + /* postag_oracle_string = dico_int2string(dico_pos_oracle, postag_oracle); */ config2feat_vec_cff(local_feat_model, config_predicted, local_perceptron_features, fv, ctx->mode); - + if(config_predicted->vcode_array) free(config_predicted->vcode_array); config_predicted->vcode_array = feature_table_get_vcode_array(fv, local_ft); postag_predicted = config_predicted->vcode_array[0].class_code; - postag_predicted_string = dico_int2string(dico_pos_local, postag_predicted); - + /* postag_predicted_string = dico_int2string(dico_pos_local, postag_predicted); */ + if(ctx->debug_mode){ - if(strcmp(postag_oracle_string, postag_predicted_string)){ + /* if(strcmp(postag_oracle_string, postag_predicted_string)){ */ + if(postag_oracle != postag_predicted){ fprintf(stdout, "**************** DIFFERENTS ***********\n"); fprintf(stdout, "%s\n", word_get_input(word_buffer_b0(config_get_buffer(config_oracle)))); } } - + + forward(config_predicted, postag_predicted); forward(config_oracle, postag_oracle); - - if(!strcmp(postag_oracle_string, postag_predicted_string)){ + + fprintf(output_file, "%d", postag_oracle); + config2feat_vec_cff(ctx->features_model, config_predicted, ctx->d_perceptron_features, fv, ctx->mode); + feat_vec_print(output_file, fv); + word_set_pos(word_buffer_bm1(config_predicted->bf), postag_oracle); + } +} +#if 0 +/* if(!strcmp(postag_oracle_string, postag_predicted_string)){ */ + if(postag_oracle == postag_predicted){ fprintf(output_file, "0"); config2feat_vec_cff(ctx->features_model, config_predicted, ctx->d_perceptron_features, fv, ctx->mode); feat_vec_print(output_file, fv); + if(ctx->debug_mode){ + printf("CHOOSE 0\n"); + } } - + + if(postag_oracle == postag_predicted) + printf("CORRECT %d\n", delta2(config_predicted)); + else + printf("WRONG %d\n", delta2(config_predicted)); + int choice = 1; - while(strcmp(postag_oracle_string, postag_predicted_string) && (choice < 3)){ - - if(ctx->debug_mode){ - fprintf(stdout, "%d postag oracle = %s postag predicted = %s\n", - word_buffer_get_current_index(config_get_buffer(config_oracle)), - dico_int2string(dico_pos_oracle, postag_oracle), - dico_int2string(dico_pos_local, postag_predicted)); - - for(i=0; i < 5; i++) - fprintf(stdout, "%d\t%s\t%.4f\t%.4f\n", i, - dico_int2string(dico_pos_local, config_predicted->vcode_array[i].class_code), - config_predicted->vcode_array[i].score, - config_predicted->vcode_array[i].score - config_predicted->vcode_array[0].score); - fprintf(stdout, "CHOICE %d\n", choice); - } + /* while(strcmp(postag_oracle_string, postag_predicted_string) && (choice < 10)){ */ + while((postag_oracle != postag_predicted) && (choice < 10)){ + if(ctx->debug_mode){ + if(choice == 1){ + fprintf(stdout, "%d postag oracle = %s postag predicted = %s\n", + word_buffer_get_current_index(config_get_buffer(config_oracle)), + dico_int2string(dico_pos_oracle, postag_oracle), + dico_int2string(dico_pos_local, postag_predicted)); + + for(i=0; i < 5; i++) + fprintf(stdout, "%d\t%s\t%.4f\t%.4f\n", i, + dico_int2string(dico_pos_local, config_predicted->vcode_array[i].class_code), + config_predicted->vcode_array[i].score, + config_predicted->vcode_array[i].score - config_predicted->vcode_array[0].score); + } + /* fprintf(stdout, "CHOICE %d\n", choice); */ + } postag_predicted = config_predicted->vcode_array[choice].class_code; - postag_predicted_string = dico_int2string(dico_pos_local, postag_predicted); - - if(!strcmp(postag_predicted_string, postag_oracle_string)){ - if(ctx->debug_mode){ - printf("GOOD CHOICE\n"); - } + /* postag_predicted_string = dico_int2string(dico_pos_local, postag_predicted); */ + + /* if(!strcmp(postag_predicted_string, postag_oracle_string)){ */ + if(postag_predicted == postag_oracle){ + if(ctx->debug_mode){ + printf("CHOOSE %d\n", choice); + } fprintf(output_file, "%d", choice); config2feat_vec_cff(ctx->features_model, config_predicted, ctx->d_perceptron_features, fv, ctx->mode); feat_vec_print(output_file, fv); @@ -152,7 +173,7 @@ void generate_training_file(FILE *output_file, context *ctx) } } } - +#endif int main(int argc, char *argv[]) { context *ctx; @@ -163,24 +184,26 @@ int main(int argc, char *argv[]) ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); - if(ctx->mode == TRAIN_MODE){ + /* if(ctx->mode == TRAIN_MODE){ mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); } - else if(ctx->mode == TEST_MODE){ - ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); - mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); - } + else if(ctx->mode == TEST_MODE){*/ + + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + + /* } */ feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); /* in train mode create feature dictionnary for perceptron */ if(ctx->mode == TRAIN_MODE) - ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); + ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features_bt", 10000000); /* in test mode read feature dictionnary for perceptron */ if(ctx->mode == TEST_MODE) - ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features_bt"); /* add the feature dictionnary to the dico vector */ dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); @@ -193,11 +216,11 @@ int main(int argc, char *argv[]) generate_training_file(output_file, ctx); - if(ctx->mode == TRAIN_MODE){ + /* if(ctx->mode == TRAIN_MODE){ */ /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ dico_vec_print(ctx->vocabs_filename, ctx->vocabs); - } + /* } */ if(ctx->cff_filename) fclose(output_file); diff --git a/maca_trans_parser/src/simple_decoder_tagger_bt.c b/maca_trans_parser/src/simple_decoder_tagger_bt.c index 5d28c28..5cfc385 100644 --- a/maca_trans_parser/src/simple_decoder_tagger_bt.c +++ b/maca_trans_parser/src/simple_decoder_tagger_bt.c @@ -69,9 +69,10 @@ void simple_decoder_tagger2(context *ctx) int postag; feat_model *local_feat_model = feat_model_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.fm", ctx->verbose); - dico_vec *local_dico_vec = dico_vec_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.vocab", ctx->hash_ratio); - dico *local_dico_pos = dico_vec_get_dico(local_dico_vec, (char *)"POS"); - dico *local_perceptron_features = dico_vec_get_dico(local_dico_vec, (char *)"d_perceptron_features"); + /* dico_vec *local_dico_vec = dico_vec_read("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.vocab", ctx->hash_ratio); */ + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features_bt"); + dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); + dico *local_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); feature_table *local_ft = feature_table_load("/home/alexis/maca_data2/fr/bin/maca_trans_tagger.model", ctx->verbose); c = config_new(f, ctx->mcd_struct, 5); @@ -101,7 +102,7 @@ void simple_decoder_tagger2(context *ctx) if(ctx->debug_mode){ fprintf(stderr, "apply local model\n"); for(int i=0; i < 5; i++) - fprintf(stderr, "%d\t%s\t%.4f\n", i, dico_int2string(local_dico_pos, c->vcode_array[i].class_code), c->vcode_array[i].score); + fprintf(stderr, "%d\t%s\t%.4f\n", i, dico_int2string(dico_pos, c->vcode_array[i].class_code), c->vcode_array[i].score); } forward(c, postag); @@ -109,23 +110,26 @@ void simple_decoder_tagger2(context *ctx) /* apply global model */ config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); - vcode *vcode_array = feature_table_get_vcode_array(fv, ft); if(ctx->debug_mode){ fprintf(stderr, "apply global model\n"); for(int i=0; i < 3; i++) - fprintf(stderr, "%d\t%d\t%.4f\n", i, vcode_array[i].class_code, vcode_array[i].score); + /* fprintf(stderr, "%d\t%d\t%.4f\n", i, vcode_array[i].class_code, vcode_array[i].score); */ + fprintf(stderr, "%d\t%s\t%.4f\n", i, dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score); } - int choice = vcode_array[0].class_code; + int choice = vcode_array[0].class_code; + word_set_pos(word_buffer_bm1(c->bf), choice); + /* if(choice != 0){ postag = c->vcode_array[choice].class_code; choice_n(c, choice); - } + }*/ free(vcode_array); /* } */ - print_word2(word_buffer_bm1(c->bf), ctx->mcd_struct, local_dico_pos, postag); + /* print_word2(word_buffer_bm1(c->bf), ctx->mcd_struct, dico_pos, postag); */ + print_word2(word_buffer_bm1(c->bf), ctx->mcd_struct, dico_pos, choice); } /* config_print(stdout, c); */ -- GitLab