diff --git a/maca_lexer/src/maca_lexer.c b/maca_lexer/src/maca_lexer.c index ac3eaa321166f595495e8ee31a0543e627839385..9793c5d2b7edfc1e6a362e993eea86339dc30dc7 100644 --- a/maca_lexer/src/maca_lexer.c +++ b/maca_lexer/src/maca_lexer.c @@ -93,7 +93,7 @@ int main(int argc, char *argv[]) float end_array[100]; int path_index = 0; int next_state; - int orfeo = 1; + int orfeo = 0; char form[1000]; float start; float end; diff --git a/maca_tokenizer/src/en_tok_rules.l b/maca_tokenizer/src/en_tok_rules.l index 0e8ea5bfae0d8a26b5127619f767b75813e49b54..07b169cc0bcd706f97fc7d62d86e953a3d028f17 100644 --- a/maca_tokenizer/src/en_tok_rules.l +++ b/maca_tokenizer/src/en_tok_rules.l @@ -1,17 +1,35 @@ %{ -#include <stdio.h> +#include<stdio.h> +#include"maca_tokenizer_functions_for_lex.h" extern int defait_amalgames; +extern int offset; +extern int token_length; +extern char token[]; /* defined as char token[10000] in maca_tokenizer.c: must be declared as an array, not a pointer */ %} %option prefix="en" %option noyywrap %% -[0-9]+\.[0-9]+ printf("%s", yytext); -[ \t]+ printf("\n"); -\. printf("\n."); -\, printf("\n,"); -don't printf("do\nnot"); + +\<[^\>]*\> {maca_tokenizer_segment((char *)"", yytext);} +[ \t]+ {maca_tokenizer_segment((char *)"", yytext);} +[ ]*\. {maca_tokenizer_segment((char *)".", yytext);} +[ ]*\? {maca_tokenizer_segment((char *)"?", yytext);} +[ ]*\! 
{maca_tokenizer_segment((char *)"!", yytext);} +[ ]*, {maca_tokenizer_segment((char *)",", yytext);} +[ ]*: {maca_tokenizer_segment((char *)":", yytext);} +[ ]*; {maca_tokenizer_segment((char *)";", yytext);} +[ ]*… {maca_tokenizer_segment((char *)"…", yytext);} +[ ]*\) {maca_tokenizer_segment((char *)")", yytext);} +[ ]*» {maca_tokenizer_segment((char *)"»", yytext);} +\( {maca_tokenizer_segment((char *)"((", yytext);} +\" {maca_tokenizer_segment((char *)"\"", yytext);} +« {maca_tokenizer_segment((char *)"«", yytext);} + +[0-9]+\.[0-9]+ {maca_tokenizer_segment(yytext, yytext);} + +don't printf("do\nnot\n"); don’t printf("do\nnot"); doesn't printf("does\nnot"); doesn’t printf("does\nnot"); @@ -23,5 +41,6 @@ wanna printf("want\nto"); ’s printf("\n's"); \n+ printf("\n"); +. {maca_tokenizer_add_char_to_token(yytext[0]);} %% diff --git a/maca_tokenizer/src/fr_tok_rules.l b/maca_tokenizer/src/fr_tok_rules.l index 0f2b0cb8665e406e9c4c58eff105f603f1282bbb..8478402b957317a95cff5b19e2b475fc4801b880 100644 --- a/maca_tokenizer/src/fr_tok_rules.l +++ b/maca_tokenizer/src/fr_tok_rules.l @@ -5,9 +5,10 @@ extern int defait_amalgames; /*extern int print_offset; extern int print_token_length;*/ -int offset = 0; -int token_length = 0; -char token[10000]; +extern int defait_amalgames; +extern int offset; +extern int token_length; +extern char token[]; /* defined as char token[10000] in maca_tokenizer.c: must be declared as an array, not a pointer */ %} %option prefix="fr" @@ -31,11 +32,12 @@ char token[10000]; [ ]*\) {maca_tokenizer_segment((char *)")", yytext);} [ ]*» {maca_tokenizer_segment((char *)"»", yytext);} \( {maca_tokenizer_segment((char *)"((", yytext);} -' {maca_tokenizer_segment((char *)"'", yytext);} -’ {maca_tokenizer_segment((char *)"'", yytext);} \" {maca_tokenizer_segment((char *)"\"", yytext);} « {maca_tokenizer_segment((char *)"«", yytext);} +[^ ]*' {maca_tokenizer_segment((char *)yytext, yytext);} +[^ ]*’ {maca_tokenizer_segment((char *)yytext, yytext);} + [0-9]+,[0-9]+ {maca_tokenizer_segment(yytext, yytext);} -je {maca_tokenizer_segment((char *)"-je", 
yytext);} diff --git a/maca_tokenizer/src/maca_tokenizer.c b/maca_tokenizer/src/maca_tokenizer.c index 524baa61360d1191c4d25aeef7794bb91c36769e..c7ecc9399c6f1cac7e363757cb01a90c4a81cba1 100644 --- a/maca_tokenizer/src/maca_tokenizer.c +++ b/maca_tokenizer/src/maca_tokenizer.c @@ -10,6 +10,11 @@ int defait_amalgames = 0; int print_offset = 0; int print_token_length = 0; +int offset = 0; +int token_length = 0; +char token[10000]; + + void maca_tokenizer_help_message(context *ctx) { context_general_help_message(ctx); diff --git a/maca_trans_parser/CMakeLists.txt b/maca_trans_parser/CMakeLists.txt index 169c3e8ad5f9f13d08ed5b0ceca07b697b2bcaf9..a3cded9233f9abdf656a33a4ba577e004ad525ef 100644 --- a/maca_trans_parser/CMakeLists.txt +++ b/maca_trans_parser/CMakeLists.txt @@ -3,18 +3,21 @@ set(SOURCES src/context.c src/movement_parser_arc_eager.c src/movement_tagparser_arc_eager.c src/movement_tagger.c + src/movement_chunker.c src/feat_fct.c # src/global_feat_vec.c # src/oracle_parser.c src/oracle_parser_arc_eager.c src/oracle_tagparser_arc_eager.c src/oracle_tagger.c + src/oracle_chunker.c # src/simple_decoder_parser.c src/simple_decoder_parser_arc_eager.c src/simple_decoder_tagparser_arc_eager.c src/simple_decoder_parser_arc_eager_error_predictor.c # src/simple_decoder_forrest.c src/simple_decoder_tagger.c + src/simple_decoder_chunker.c src/simple_decoder_tagger_error_predictor.c # src/simple_decoder_tagger_bt.c src/stack.c @@ -52,6 +55,12 @@ target_link_libraries(maca_trans_tagger_mcf2cff transparse) target_link_libraries(maca_trans_tagger_mcf2cff maca_common) install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin) +add_executable(maca_trans_chunker_mcf2cff ./src/maca_trans_chunker_mcf2cff.c) +target_link_libraries(maca_trans_chunker_mcf2cff perceptron) +target_link_libraries(maca_trans_chunker_mcf2cff transparse) +target_link_libraries(maca_trans_chunker_mcf2cff maca_common) +install (TARGETS maca_trans_chunker_mcf2cff DESTINATION bin) + 
add_executable(maca_error_predictor_tagger_mcf2cff ./src/maca_error_predictor_tagger_mcf2cff.c) target_link_libraries(maca_error_predictor_tagger_mcf2cff perceptron) target_link_libraries(maca_error_predictor_tagger_mcf2cff transparse) @@ -136,6 +145,12 @@ target_link_libraries(maca_trans_tagger transparse) target_link_libraries(maca_trans_tagger maca_common) install (TARGETS maca_trans_tagger DESTINATION bin) +add_executable(maca_trans_chunker ./src/maca_trans_chunker.c) +target_link_libraries(maca_trans_chunker perceptron) +target_link_libraries(maca_trans_chunker transparse) +target_link_libraries(maca_trans_chunker maca_common) +install (TARGETS maca_trans_chunker DESTINATION bin) + add_executable(maca_trans_morpho ./src/maca_trans_morpho.c) target_link_libraries(maca_trans_morpho perceptron) target_link_libraries(maca_trans_morpho transparse) diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h index 66053c4581b292bb5a5049073db7abea57f4f9b4..e00bf24963406eab4f066daae654910ef46217d3 100644 --- a/maca_trans_parser/src/context.h +++ b/maca_trans_parser/src/context.h @@ -14,6 +14,11 @@ #define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab" #define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model" +#define DEFAULT_MULTI_COL_DESC_CHUNKER_FILENAME "maca_trans_chunker.mcd" +#define DEFAULT_FEATURES_MODEL_CHUNKER_FILENAME "maca_trans_chunker.fm" +#define DEFAULT_VOCABS_CHUNKER_FILENAME "maca_trans_chunker.vocab" +#define DEFAULT_MODEL_CHUNKER_FILENAME "maca_trans_chunker.model" + #define DEFAULT_MULTI_COL_DESC_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.mcd" #define DEFAULT_FEATURES_MODEL_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.fm" #define DEFAULT_VOCABS_TAGGER_ERROR_PREDICTOR_FILENAME "maca_error_predictor_tagger.vocab" diff --git a/maca_trans_parser/src/maca_trans_chunker.c b/maca_trans_parser/src/maca_trans_chunker.c new file mode 100644 index 
0000000000000000000000000000000000000000..518967534f197e2c88949c1ec6c4a8b431b0fb36 --- /dev/null +++ b/maca_trans_parser/src/maca_trans_chunker.c @@ -0,0 +1,106 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"context.h" +#include"feat_fct.h" +#include"feature_table.h" +#include"dico.h" +#include"beam.h" +#include"form2pos.h" +#include"simple_decoder_chunker.h" +/*#include"dnn_decoder.h"*/ +#include"config2feat_vec.h" + +void decode_chunker_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_beam_help_message(ctx); + context_conll_help_message(ctx); + fprintf(stderr, "INPUT\n"); + context_input_help_message(ctx); + context_mcd_help_message(ctx); + context_model_help_message(ctx); + context_vocabs_help_message(ctx); + context_features_model_help_message(ctx); + context_f2p_filename_help_message(ctx); +} + +void decode_chunker_check_options(context *ctx){ + if(ctx->help + /*!ctx->conll_filename*/ + /* || !ctx->perc_model_filename + || !ctx->mcd_filename + || !ctx->vocabs_filename + || !ctx->features_model_filename*/ + ){ + decode_chunker_help_message(ctx); + exit(1); + } +} + +void decode_chunker_set_linguistic_resources_filenames(context *ctx) +{ + char absolute_filename[500]; + + if(!ctx->perc_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_MODEL_CHUNKER_FILENAME); + ctx->perc_model_filename = strdup(absolute_filename); + } + + if(!ctx->vocabs_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_VOCABS_CHUNKER_FILENAME); + ctx->vocabs_filename = strdup(absolute_filename); + } + + /* if(!ctx->mcd_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_MULTI_COL_DESC_CHUNKER_FILENAME); + ctx->mcd_filename = strdup(absolute_filename); + }*/ + + if(!ctx->features_model_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + 
strcat(absolute_filename, DEFAULT_FEATURES_MODEL_CHUNKER_FILENAME); + ctx->features_model_filename = strdup(absolute_filename); + } + + if(!ctx->f2p_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_F2P_FILENAME); + ctx->f2p_filename = strdup(absolute_filename); + ctx->f2p = form2pos_read(ctx->f2p_filename); + } + + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); + } +} + + +int main(int argc, char *argv[]) +{ + context *ctx = context_read_options(argc, argv); + decode_chunker_check_options(ctx); + + decode_chunker_set_linguistic_resources_filenames(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + if(ctx->beam_width == 1) + simple_decoder_chunker(ctx); + + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/maca_trans_chunker_mcf2cff.c b/maca_trans_parser/src/maca_trans_chunker_mcf2cff.c new file mode 100644 index 0000000000000000000000000000000000000000..4177094025c19dda70e662d876eeccea218e873d --- /dev/null +++ b/maca_trans_parser/src/maca_trans_chunker_mcf2cff.c @@ -0,0 +1,116 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include"movement_chunker.h" +#include"oracle_chunker.h" +#include"feat_fct.h" +#include"context.h" +#include"feat_vec.h" +#include"dico_vec.h" +#include"word_emb.h" +#include"config2feat_vec.h" + +void 
maca_trans_chunker_mcf2cff_help_message(context *ctx) +{ + context_general_help_message(ctx); + context_mode_help_message(ctx); + context_sent_nb_help_message(ctx); + context_mcd_help_message(ctx); + + fprintf(stderr, "INPUT\n"); + context_conll_help_message(ctx); + fprintf(stderr, "IN TEST MODE\n"); + context_vocabs_help_message(ctx); + + fprintf(stderr, "OUTPUT\n"); + context_cff_help_message(ctx); + fprintf(stderr, "IN TRAIN MODE\n"); + context_vocabs_help_message(ctx); +} + +void maca_trans_chunker_mcf2cff_check_options(context *ctx) +{ + if(!ctx->input_filename + || ctx->help + /* || !ctx->mcd_filename */ + || !(ctx->cff_filename || ctx->fann_filename) + ){ + maca_trans_chunker_mcf2cff_help_message(ctx); + exit(1); + } +} + +void generate_training_file(FILE *output_file, context *ctx) +{ + config *c; + feat_vec *fv = feat_vec_new(feature_types_nb); + FILE *conll_file = myfopen(ctx->input_filename, "r"); + int tag; + + c = config_new(conll_file, ctx->mcd_struct, 5); + + while(!config_is_terminal(c)){ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode); + tag = oracle_chunker(c); + + fprintf(output_file, "%d", tag); + feat_vec_print(output_file, fv); + movement_chunker(c, tag); + } +} + +int main(int argc, char *argv[]) +{ + context *ctx; + FILE *output_file; + + ctx = context_read_options(argc, argv); + maca_trans_chunker_mcf2cff_check_options(ctx); + + ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose); + + + if(ctx->mode == TRAIN_MODE){ + mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename); + ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct); + } + else if(ctx->mode == TEST_MODE){ + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + } + + feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb); + + /* in train mode create feature dictionnary for perceptron 
*/ + if(ctx->mode == TRAIN_MODE) + ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000); + + /* in test mode read feature dictionnary for perceptron */ + if(ctx->mode == TEST_MODE) + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + + /* add the feature dictionnary to the dico vector */ + dico_vec_add(ctx->vocabs, ctx->d_perceptron_features); + + /* open output file */ + if(ctx->cff_filename) + output_file = myfopen(ctx->cff_filename, "w"); + else + output_file = stdout; + + generate_training_file(output_file, ctx); + + if(ctx->mode == TRAIN_MODE){ + /* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */ + dico_vec_print(ctx->vocabs_filename, ctx->vocabs); + + } + + if(ctx->cff_filename) + fclose(output_file); + context_free(ctx); + return 0; +} + diff --git a/maca_trans_parser/src/movement_chunker.c b/maca_trans_parser/src/movement_chunker.c new file mode 100644 index 0000000000000000000000000000000000000000..522c7646e2931435bc29181cdbca17fc9a1f0c5b --- /dev/null +++ b/maca_trans_parser/src/movement_chunker.c @@ -0,0 +1,13 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include"util.h" +#include"movement_chunker.h" + +int movement_chunker(config *c, int postag) +{ + word_set_pos(word_buffer_b0(c->bf), postag); + word_buffer_move_right(c->bf); + + return 1; +} diff --git a/maca_trans_parser/src/movement_chunker.h b/maca_trans_parser/src/movement_chunker.h new file mode 100644 index 0000000000000000000000000000000000000000..437f5503a9520755f5085265dbd9b9c31e2695e9 --- /dev/null +++ b/maca_trans_parser/src/movement_chunker.h @@ -0,0 +1,7 @@ +#ifndef __MOVEMENT_CHUNKER__ +#define __MOVEMENT_CHUNKER__ + +#include"config.h" +#include"feat_vec.h" +int movement_chunker(config *c, int postag); +#endif diff --git a/maca_trans_parser/src/oracle_chunker.c b/maca_trans_parser/src/oracle_chunker.c new file mode 100644 index 
0000000000000000000000000000000000000000..3a75cc46c054deaa564cfd55ef17a9fcad6516b7 --- /dev/null +++ b/maca_trans_parser/src/oracle_chunker.c @@ -0,0 +1,6 @@ +#include"oracle_chunker.h" + +int oracle_chunker(config *c) +{ + return word_get_A(word_buffer_b0(config_get_buffer(c))); +} diff --git a/maca_trans_parser/src/oracle_chunker.h b/maca_trans_parser/src/oracle_chunker.h new file mode 100644 index 0000000000000000000000000000000000000000..c821dc9178b7a74ed538c7cca7ef897895b96ba4 --- /dev/null +++ b/maca_trans_parser/src/oracle_chunker.h @@ -0,0 +1,10 @@ +#ifndef __ORACLE_CHUNKER__ +#define __ORACLE_CHUNKER__ + +#include<stdio.h> +#include<stdlib.h> +#include"config.h" + +int oracle_chunker(config *c); + +#endif diff --git a/maca_trans_parser/src/simple_decoder_chunker.c b/maca_trans_parser/src/simple_decoder_chunker.c new file mode 100644 index 0000000000000000000000000000000000000000..7acf82ca49cd66dfe15586b66c5d732bd8d784ee --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_chunker.c @@ -0,0 +1,96 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<unistd.h> +#include<getopt.h> +#include<ctype.h> + +#include"context.h" +#include"movement_chunker.h" +#include"feat_fct.h" +#include"config2feat_vec.h" +#include"feature_table.h" +#include"dico.h" +#include"mcd.h" + +void print_word(word *w, mcd *mcd_struct, dico *dico_bio, int tag) +{ + char *buffer = NULL; + char *token = NULL; + int col_nb = 0; + if(mcd_get_a_col(mcd_struct) == -1){ + printf("%s\t%s\n", w->input, dico_int2string(dico_bio, tag)); + } + else{ + buffer = strdup(w->input); + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) printf("\t"); + if(col_nb == mcd_get_a_col(mcd_struct)) + printf("%s", dico_int2string(dico_bio, tag)); + else + word_print_col_n(stdout, w, col_nb); + col_nb++; + token = strtok(NULL, "\t"); + } + if(col_nb <= mcd_get_a_col(mcd_struct)) + printf("\t%s", dico_int2string(dico_bio, tag)); + printf("\n"); + free(buffer); + } +} + 
+void simple_decoder_chunker(context *ctx) +{ + config *c; + feat_vec *fv = feat_vec_new(feature_types_nb); + FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin; + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int tag; + float max; + word *b0; + dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"A"); + + c = config_new(f, ctx->mcd_struct, 5); + + while(!config_is_terminal(c)){ + b0 = word_buffer_b0(c->bf); + tag = -1;//word_get_pos(b0); + + if(ctx->debug_mode){ + fprintf(stderr, "***********************************\n"); + config_print(stderr, c); + } + + /* if tag is not specified in input it is predicted */ + if(tag == -1){ + /* config_print(stdout, c); */ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + + /* feat_vec_print(stdout, fv); */ + tag = feature_table_argmax(fv, ft, &max); + /* printf("tag = %d\n", tag); */ + + if(ctx->debug_mode){ + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + for(int i=0; i < 3; i++){ + fprintf(stderr, "%d\t", i); + fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score); + } + free(vcode_array); + } + } + + print_word(b0, ctx->mcd_struct, dico_pos, tag); + + movement_chunker(c, tag); + + } + /* config_print(stdout, c); */ + feat_vec_free(fv); + feature_table_free(ft); + config_free(c); + if (ctx->input_filename) fclose(f); +} + diff --git a/maca_trans_parser/src/simple_decoder_chunker.h b/maca_trans_parser/src/simple_decoder_chunker.h new file mode 100644 index 0000000000000000000000000000000000000000..1c5bd0768e7fba453d3a046251d0ec17438c3c70 --- /dev/null +++ b/maca_trans_parser/src/simple_decoder_chunker.h @@ -0,0 +1,7 @@ +#ifndef __SIMPLE_DECODER_CHUNKER__ +#define __SIMPLE_DECODER_CHUNKER__ +#include "context.h" + +void simple_decoder_chunker(context *ctx); + +#endif