diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index acda80b3fbf1359f660360511406062bcc49fd69..1a17285e3f5a97e2e12053d7630a6d2743cfd3b5 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -2,12 +2,15 @@ set(SOURCES src/context.c src/feat_desc.c src/feature_table.c src/movement.c + src/movement_tagger.c src/sentence.c src/feat_fct.c src/feat_vec.c src/global_feat_vec.c src/oracle.c + src/oracle_tagger.c src/simple_decoder.c + src/simple_decoder_tagger.c src/cf_file.c src/feat_lib.c src/perceptron.c @@ -27,6 +30,11 @@ add_library(transparse STATIC ${SOURCES}) #compiling, linking and installing executables +add_executable(maca_trans_parser_conll2cff_tagger ./src/maca_trans_parser_conll2cff_tagger.c) +target_link_libraries(maca_trans_parser_conll2cff_tagger transparse) +target_link_libraries(maca_trans_parser_conll2cff_tagger maca_common) +install (TARGETS maca_trans_parser_conll2cff_tagger DESTINATION bin) + add_executable(maca_trans_parser_conll2fann ./src/maca_trans_parser_conll2fann.c) target_link_libraries(maca_trans_parser_conll2fann transparse) target_link_libraries(maca_trans_parser_conll2fann maca_common) @@ -42,6 +50,11 @@ target_link_libraries(maca_trans_parser transparse) target_link_libraries(maca_trans_parser maca_common) install (TARGETS maca_trans_parser DESTINATION bin) +add_executable(maca_trans_tagger ./src/decode_tagger.c) +target_link_libraries(maca_trans_tagger transparse) +target_link_libraries(maca_trans_tagger maca_common) +install (TARGETS maca_trans_tagger DESTINATION bin) + add_executable(maca_trans_parser_train ./src/train_perceptron.c) target_compile_options(maca_trans_parser_train INTERFACE -Wall) target_link_libraries(maca_trans_parser_train transparse) diff --git a/maca_trans_parser/src/decode_tagger.c b/maca_trans_parser/src/decode_tagger.c new file mode 100644 index 0000000000000000000000000000000000000000..30951cc49aa171edd888ae565e25ab0cdb7fe68b --- /dev/null +++ 
b/maca_trans_parser/src/decode_tagger.c
@@ -0,0 +1,116 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"feat_fct.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"beam.h"
+#include"simple_decoder_tagger.h"
+/*#include"dnn_decoder.h"*/
+#include"config2feat_vec.h"
+
+/* Print the help text of every option group accepted by the tagger. */
+void decode_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_beam_help_message(ctx);
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  context_mcd_help_message(ctx);
+  context_model_help_message(ctx);
+  context_vocabs_help_message(ctx);
+  context_features_model_help_message(ctx);
+  context_language_help_message(ctx);
+  context_maca_data_path_help_message(ctx);
+
+}
+
+/* Print the help text and exit(1) when help was requested.
+   The other checks are commented out, so no filename is mandatory here. */
+void decode_check_options(context *ctx){
+  if(ctx->help
+     /*!ctx->conll_filename*/
+     /*     || !ctx->perc_model_filename
+     || !ctx->mcd_filename
+     || !ctx->vocabs_filename
+     || !ctx->features_model_filename*/
+     ){
+    decode_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* Tagger entry point: read the options, load the vocabularies and the
+   perceptron model, then decode the CoNLL input (named file, or stdin). */
+int main(int argc, char *argv[])
+{
+  FILE *conll_file = NULL;
+  context *ctx;
+  feature_table *ft = NULL;
+  /* struct fann *ann; */
+  int root_label = -1;
+  dico *dico_pos;
+  ctx = context_read_options(argc, argv);
+  decode_check_options(ctx);
+
+  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs);
+
+  dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
+  /* when in stream mode, force to renumber the tokens (ugly !) */
+  if(ctx->stream_mode){
+    ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1;
+  }
+
+
+  /* load models */
+
+  if(ctx->perc_model_filename){
+    /* ctx->d_perceptron_features = dico_read(ctx->perceptron_features_filename, ctx->hash_ratio); */
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+    ft = feature_table_load(ctx->perc_model_filename);
+    /* hash_stats(dico_features->htable); */
+  }
+
+
+  /* else if(ctx->dnn_model_filename){
+    ann = fann_create_from_file(ctx->dnn_model_filename);
+    if(!ann){
+      fprintf(stderr, "Error creating ann --- ABORTING.\n");
+      return -1;
+    }
+  }
+  else{*/
+
+  if(ctx->conll_filename)
+    conll_file= myfopen(ctx->conll_filename, "r");
+  else
+    conll_file = stdin;
+
+  if(ctx->perc_model_filename){
+    if(ctx->beam_width == 1){
+      simple_decoder_tagger(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, ctx->stream_mode);
+    }
+    else{
+      /* beam_decoder takes the code of the root label, which must be
+         assigned before the call: look it up in the label dictionary.
+         NOTE(review): assumes the mcd provides a label dictionary -- confirm */
+      root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), ctx->root_label);
+      beam_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, root_label, ctx->beam_width, ctx->mvt_nb);
+    }
+  }
+
+  /* else if(ctx->dnn_model_filename){
+     dnn_decoder(conll_file, ctx->mcd_struct, ann, ctx->features_model, ctx->verbose, root_label, ctx->stream_mode);
+     }*/
+
+  /* close the input only when it was opened here (never stdin) */
+  if(ctx->conll_filename)
+    fclose(conll_file);
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/feature_table.c b/maca_trans_parser/src/feature_table.c index 715596d9800daac27b2f243f6e71dafdc9ff850b..db8363a16d8d6a2e51e59193429f6f1bb748edb1 100644 --- a/maca_trans_parser/src/feature_table.c +++ b/maca_trans_parser/src/feature_table.c @@ -143,13 +143,17 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max) int cla, argmax; int classes_nb = ft->classes_nb; int feat; + + /* printf("feat tabl argmax classes nb = %d\n", classes_nb); */ for(cla=0; cla < classes_nb; cla++) classes_score[cla] = 0; for(feat=0; feat < fv->nb; feat++){ for(cla=0; cla < classes_nb; cla++){ - if(fv->t[feat] 
!= -1) + if(fv->t[feat] != -1){ + /* printf("feat score = %f\n", ft->table[fv->t[feat]][cla]); */ classes_score[cla] += ft->table[fv->t[feat]][cla]; + } } }
diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
new file mode 100644
index 0000000000000000000000000000000000000000..146c728cd692ad5051fcc0f8d3d48aea0e69dae1
--- /dev/null
+++ b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
@@ -0,0 +1,186 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"movement_tagger.h"
+#include"oracle.h"
+#include"oracle_tagger.h"
+#include"feat_fct.h"
+#include"context.h"
+#include"feat_vec.h"
+#include"dico_vec.h"
+#include"corpus.h"
+#include"word_emb.h"
+#include"config2feat_vec.h"
+
+/* Print the help text of every option group accepted by this converter. */
+void maca_trans_parser_conll2cff_help_message(context *ctx)
+{
+  context_general_help_message(ctx);
+  context_mode_help_message(ctx);
+  context_sent_nb_help_message(ctx);
+
+  fprintf(stderr, "INPUT\n");
+  context_conll_help_message(ctx);
+  fprintf(stderr, "IN TEST MODE\n");
+  context_alphabet_help_message(ctx);
+
+  fprintf(stderr, "OUTPUT\n");
+  context_cff_help_message(ctx);
+  fprintf(stderr, "IN TRAIN MODE\n");
+  context_alphabet_help_message(ctx);
+
+}
+
+/* Print the help text and exit(1) when help was requested, when no CoNLL
+   file was given, or when neither a cff nor a fann file was given. */
+void maca_trans_parser_conll2cff_check_options(context *ctx)
+{
+  if(!ctx->conll_filename
+     || ctx->help
+     /* || !ctx->mcd_filename */
+     || !(ctx->cff_filename || ctx->fann_filename)
+     ){
+    maca_trans_parser_conll2cff_help_message(ctx);
+    exit(1);
+  }
+}
+
+/* Stream mode: for every word the oracle can tag, write its gold POS code
+   followed by the feature vector of the current configuration.
+   NOTE(review): the configuration reads the input as a stream while ref is
+   a single sentence; confirm that word indices stay valid across sentences. */
+void generate_training_file_stream(FILE *output_file, context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  sentence *ref = NULL;
+  int sentence_nb = 0;
+  FILE *conll_file = myfopen(ctx->conll_filename, "r");
+  FILE *conll_file_ref = myfopen(ctx->conll_filename, "r");
+  int postag;
+
+  c = config_initial(conll_file, ctx->mcd_struct, 10, 5);
+
+  while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
+    /* sentence_print(stdout, ref, mcd_get_dico_label(ctx->mcd_struct)); */
+    while(1){
+      /* config_print(stdout,c); */
+      postag = oracle_tagger(c, ref);
+      /* without a gold tag no movement is applied and the configuration
+         would never change: stop here instead of looping forever */
+      if(postag == -1)
+        break;
+
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
+      fprintf(output_file, "%d", postag);
+      feat_vec_print(output_file, fv);
+
+      movement_tagger(c, postag, 0, 1);
+    }
+    sentence_nb++;
+  }
+  config_free(c);
+  fclose(conll_file);
+  fclose(conll_file_ref);
+}
+
+/* Buffer mode: load one sentence at a time in the buffer and, for every word
+   the oracle can tag, write its gold POS code followed by the feature vector
+   of the current configuration. Stops after ctx->sent_nb sentences. */
+void generate_training_file_buffer(FILE *output_file, context *ctx)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  sentence *ref = NULL;
+  int sentence_nb = 0;
+  FILE *conll_file = myfopen(ctx->conll_filename, "r");
+  FILE *conll_file_ref = myfopen(ctx->conll_filename, "r");
+  int postag;
+  c = config_initial(conll_file, ctx->mcd_struct, 1000, 0);
+
+  while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
+    /* sentence_print(stdout, ref, NULL); */
+    queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);
+    queue_remove(c->bf); /* get rid of dummy token */
+    while(!config_is_terminal(c)){
+      /* config_print(stdout, c); */
+      postag = oracle_tagger(c, ref);
+      /* without a gold tag no movement is applied and the configuration
+         would never change: stop here instead of looping forever */
+      if(postag == -1)
+        break;
+
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
+      fprintf(output_file, "%d", postag);
+      feat_vec_print(output_file, fv);
+
+      movement_tagger(c, postag, 0, 0);
+    }
+    config_free(c);
+    c = config_initial(conll_file, ctx->mcd_struct, 1000, 0);
+    sentence_nb++;
+  }
+  /* release the last configuration, created for a sentence never processed */
+  config_free(c);
+  fclose(conll_file);
+  fclose(conll_file_ref);
+}
+
+/* Converter entry point: build (train mode) or read (test mode) the
+   vocabularies, then write the training examples to the cff file or stdout. */
+int main(int argc, char *argv[])
+{
+  context *ctx;
+  FILE *output_file;
+
+  ctx = context_read_options(argc, argv);
+  maca_trans_parser_conll2cff_check_options(ctx);
+
+  if(ctx->mode == TRAIN_MODE){
+    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename);
+    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
+  }
+  else if(ctx->mode == TEST_MODE){
+    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs);
+  }
+
+  feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);
+
+
+  /* in train mode create feature dictionary for perceptron */
+  if(ctx->mode == TRAIN_MODE)
+    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
+
+  /* in test mode read feature dictionary for perceptron */
+  if(ctx->mode == TEST_MODE)
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  /* add the feature dictionary to the dico vector */
+  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);
+
+  /* open output file */
+  if(ctx->cff_filename)
+    output_file = myfopen(ctx->cff_filename, "w");
+  else
+    output_file = stdout;
+
+  if(ctx->stream_mode)
+    generate_training_file_stream(output_file, ctx);
+  else
+    generate_training_file_buffer(output_file, ctx);
+
+  if(ctx->mode == TRAIN_MODE){
+    /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
+    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
+
+  }
+
+  if(ctx->cff_filename)
+    fclose(output_file);
+  context_free(ctx);
+  return 0;
+}
+
diff --git a/maca_trans_parser/src/movement_tagger.c b/maca_trans_parser/src/movement_tagger.c
new file mode 100644
index 0000000000000000000000000000000000000000..ff5e3057625de5bc0f16a4fdd55a1d2ba5243847
--- /dev/null
+++ b/maca_trans_parser/src/movement_tagger.c
@@ -0,0 +1,25 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"util.h"
+#include"movement_tagger.h"
+
+/* Tag the first word of the buffer of c with postag and move it onto the
+   stack. score is not used. When stream is non-zero, the next input word is
+   then added to the buffer. Returns 0, leaving c untouched, when the buffer
+   is empty, and 1 otherwise. */
+int movement_tagger(config *c, int postag, float score, int stream)
+{
+  word *b0 = NULL;
+  if(queue_is_empty(c->bf)) return 0;
+
+  b0 = queue_elt_n(c->bf, 0);
+  word_set_pos(b0, postag);
+  stack_push(c->st, queue_remove(c->bf));
+
+  /* in stream mode, read a new word and add it to the buffer */
+  if(stream)
+    config_add_next_word_to_buffer(c);
+
+  return 1;
+}
diff --git 
a/maca_trans_parser/src/movement_tagger.h b/maca_trans_parser/src/movement_tagger.h
new file mode 100644
index 0000000000000000000000000000000000000000..7168f5aa0218ba32e05fc84b97e76e5cab9c6a80
--- /dev/null
+++ b/maca_trans_parser/src/movement_tagger.h
@@ -0,0 +1,11 @@
+#ifndef __MOVEMENT_TAGGER__
+#define __MOVEMENT_TAGGER__
+
+#include"config.h"
+#include"feat_vec.h"
+
+/* Tag the first word of the buffer of c with postag and push it on the
+   stack; returns 0 when the buffer is empty, 1 otherwise. */
+int movement_tagger(config *c, int postag, float score, int stream);
+
+#endif
diff --git a/maca_trans_parser/src/oracle_tagger.c b/maca_trans_parser/src/oracle_tagger.c
new file mode 100644
index 0000000000000000000000000000000000000000..c08cc234a0463f215ac485be9284eaddf8db7d1e
--- /dev/null
+++ b/maca_trans_parser/src/oracle_tagger.c
@@ -0,0 +1,18 @@
+#include"oracle_tagger.h"
+
+/* Return the POS code that ref gives to the first word of the buffer of c,
+   or -1 when the buffer is empty.
+   NOTE(review): the index of that word is used unchecked in ref->words --
+   confirm it can never exceed the length of ref. */
+int oracle_tagger(config *c, sentence *ref)
+{
+  word *b0; /* next word in the buffer */
+  int b0_index;
+
+  if(!queue_is_empty(c->bf)){
+    b0 = queue_elt_n(c->bf, 0);
+    b0_index = word_get_index(b0);
+    return word_get_pos(ref->words[b0_index]);
+  }
+  return -1;
+}
diff --git a/maca_trans_parser/src/oracle_tagger.h b/maca_trans_parser/src/oracle_tagger.h
new file mode 100644
index 0000000000000000000000000000000000000000..360421c8159138f7bd5f139193a22d22622dc5af
--- /dev/null
+++ b/maca_trans_parser/src/oracle_tagger.h
@@ -0,0 +1,11 @@
+#ifndef __ORACLE_TAGGER__
+#define __ORACLE_TAGGER__
+
+#include<stdio.h>
+#include<stdlib.h>
+#include"config.h"
+#include"sentence.h"
+
+int oracle_tagger(config *c, sentence *ref);
+
+#endif
diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c
new file mode 100644
index 0000000000000000000000000000000000000000..e50a1ae8144223c2581bd4fd8d0e77bda42cbb70
--- /dev/null
+++ b/maca_trans_parser/src/simple_decoder_tagger.c
@@ -0,0 +1,95 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<unistd.h>
+#include<getopt.h>
+#include"context.h"
+#include"movement_tagger.h"
+#include"feat_fct.h"
+#include"config2feat_vec.h"
+#include"feature_table.h"
+#include"dico.h"
+#include"simple_decoder_tagger.h"
+
+/* helpers private to this file: static gives them internal linkage, so they cannot clash with same-named functions of other objects */
+static void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose);
+static void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose);
+
+
+/* Tag the input read from f with the model ft, using the stream decoder
+   when stream_mode is non-zero and the buffer decoder otherwise. */
+void simple_decoder_tagger(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose, int stream_mode)
+{
+  if(stream_mode)
+    simple_decoder_stream(f, mcd_struct, d_perceptron_features, dico_pos, ft, fm, verbose);
+  else
+    simple_decoder_buffer(f, mcd_struct, d_perceptron_features, dico_pos, ft, fm, verbose);
+}
+
+/* Buffer mode: read f one sentence at a time, give every word the class that
+   feature_table_argmax selects, then print the input field and the POS string
+   of each word of the stack, tab separated, on stdout.
+   NOTE(review): config_free is disabled below, so every sentence leaves its configuration allocated -- confirm why. */
+static void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose)
+{
+  config *c;
+  int postag;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  float max;
+  int i;
+  word *w;
+
+  c = config_initial(f, mcd_struct, 1000, 0);
+
+  /* read a sentence and put it in the buffer */
+  while(queue_read_sentence(c->bf, f, mcd_struct)){
+    queue_remove(c->bf); /* get rid of dummy token */
+    while(!config_is_terminal(c)){
+
+      config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE);
+
+      postag = feature_table_argmax(fv, ft, &max);
+      /* when no class is found or no word can be moved the configuration
+         does not change: stop here instead of looping forever */
+      if(postag == -1 || !movement_tagger(c, postag, max, 0))
+        break;
+    }
+    /* config_print(stdout, c); */
+
+    for(i = stack_nbelem(c->st)-1; i >= 0 ; i--){
+      w = stack_elt_n(c->st, i);
+      printf("%s\t%s\n", w->input, dico_int2string(dico_pos, word_get_pos(w)));
+    }
+
+    /* config_free(c); */
+    c = config_initial(f, mcd_struct, 1000, 0);
+  }
+}
+
+
+/* Stream mode: print the configuration, tag the first word of the buffer and
+   move it to the stack, until the configuration is terminal or cannot change.
+   NOTE(review): the tagged words are not printed in this mode yet, and dico_pos and verbose are unused -- confirm the intended output. */
+static void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose)
+{
+  config *c;
+  feat_vec *fv = feat_vec_new(feature_types_nb);
+  int postag;
+  float max;
+
+  c = config_initial(f, mcd_struct, 10, 5);
+  while(!config_is_terminal(c)){
+    config_print(stdout, c);
+    config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE);
+
+    postag = feature_table_argmax(fv, ft, &max);
+    /* when no class is found or no word can be moved the configuration
+       does not change: stop here instead of looping forever */
+    if(postag == -1 || !movement_tagger(c, postag, max, 1))
+      break;
+  }
+
+  /* config_print(stdout, c); */
+
+  /* config_free(c); */
+
+}
diff --git a/maca_trans_parser/src/simple_decoder_tagger.h b/maca_trans_parser/src/simple_decoder_tagger.h
new file mode 100644
index 0000000000000000000000000000000000000000..2aea2ce2817da1c384fd88197400d90fe4379d18
--- /dev/null
+++ b/maca_trans_parser/src/simple_decoder_tagger.h
@@ -0,0 +1,12 @@
+#ifndef __SIMPLE_DECODER_TAGGER__
+#define __SIMPLE_DECODER_TAGGER__
+
+#include<stdio.h>
+#include"context.h"
+#include"feature_table.h"
+#include"dico.h"
+
+/* Tag the input read from f with the model ft; stream_mode selects the stream decoder (non-zero) or the buffer decoder (zero). */
+void simple_decoder_tagger(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose, int stream_mode);
+
+#endif