diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt
index d0dbc552b3f9756143c299ffe07eb309fdbda97c..4fb7f512c32d2bdb99945cd41b13499540f15e27 100644
--- a/maca_common/CMakeLists.txt
+++ b/maca_common/CMakeLists.txt
@@ -5,6 +5,7 @@ set(SOURCES src/util.c
   src/mcd.c
   src/dico_vec.c
   src/feat_types.c
+  src/form2pos.c
 )
 
 #compiling library
diff --git a/maca_common/include/form2pos.h b/maca_common/include/form2pos.h
new file mode 100644
index 0000000000000000000000000000000000000000..515618ca240bfd76a32818b47078b83df48b4d12
--- /dev/null
+++ b/maca_common/include/form2pos.h
@@ -0,0 +1,24 @@
+#ifndef __FORM2POS__
+#define __FORM2POS__
+
+#include"hash.h"
+#include"dico.h"
+
+typedef struct
+{
+  int nbelem;
+  int pos_nb;
+  dico *d_pos;
+  dico *d_signature;
+  hash *h_form2signature;
+} form2pos;
+
+
+form2pos *form2pos_new(int nbelem, int pos_nb, char *pos_list);
+void form2pos_free(form2pos *f2p);
+form2pos *form2pos_read(char *filename);
+int form2pos_get_signature(form2pos *f2p, char *form);
+int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos);
+
+
+#endif
diff --git a/maca_common/include/mcd.h b/maca_common/include/mcd.h
index df53f7c8759c59f6a52c25047a8cf955b1e00fc3..07f29affb977445bbf8fa75484d4615e2c6624f4 100644
--- a/maca_common/include/mcd.h
+++ b/maca_common/include/mcd.h
@@ -17,6 +17,8 @@
 #define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL]
+#define mcd_get_form_col(m) (m)->type2col[FEAT_TYPE_FORM]  /* type[] is indexed by column, type2col[] by feature type */
+#define mcd_set_form_col(m, v) (m)->type2col[FEAT_TYPE_FORM] = (v)
 typedef struct {
   int nb_col;
diff --git a/maca_common/src/form2pos.c b/maca_common/src/form2pos.c
new file mode 100644
index 0000000000000000000000000000000000000000..610ccf484380740a2f231d74baa87178dba510d2
--- /dev/null
+++ b/maca_common/src/form2pos.c
@@ -0,0 +1,111 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include"form2pos.h"
+#include"util.h"
+
+/* Separators between POS tags on the tag list line. The line terminator is
+   included because the list comes straight from fgets(), which keeps it. */
+#define F2P_POS_SEPARATORS "\t\r\n"
+
+/* Allocates a form2pos table sized from nbelem and pos_nb and stores every
+   tag of pos_list in d_pos. pos_list is tokenized in place by strtok(). */
+form2pos *form2pos_new(int nbelem, int pos_nb, char *pos_list)
+{
+  form2pos *f2p = memalloc(sizeof(form2pos));
+  char *token;
+
+  f2p->nbelem = nbelem;
+  f2p->pos_nb = pos_nb;
+  f2p->d_pos = dico_new("d_pos", pos_nb * 10);
+  f2p->d_signature = dico_new("d_signature", pos_nb * 10);
+  f2p->h_form2signature = hash_new(nbelem * 4);
+  /* for loop: an empty tag list makes the very first strtok() return NULL */
+  for(token = strtok(pos_list, F2P_POS_SEPARATORS); token; token = strtok(NULL, F2P_POS_SEPARATORS))
+    dico_add(f2p->d_pos, strdup(token));
+  return f2p;
+}
+
+/* Releases a table built by form2pos_new() or form2pos_read(); NULL is ignored. */
+void form2pos_free(form2pos *f2p)
+{
+  if(f2p == NULL) return;
+  dico_free(f2p->d_pos);
+  dico_free(f2p->d_signature);
+  hash_free(f2p->h_form2signature);
+  free(f2p);
+}
+
+/* Loads a form2pos file: number of forms, number of pos tags, the tab
+   separated list of pos tags, then one "form<TAB>signature" entry per line.
+   Returns NULL, after a message on stderr, when the header cannot be read. */
+form2pos *form2pos_read(char *filename)
+{
+  FILE *f = myfopen(filename, "r");
+  int nbelem;
+  int pos_nb;
+  char pos_list[10000];
+  char form[300];
+  char signature[200];
+  form2pos *f2p = NULL;
+
+  if(f == NULL) return NULL;
+
+  /* read number of forms, number of pos tags and list of pos tags */
+  if(fscanf(f, "%d\n", &nbelem) != 1
+     || fscanf(f, "%d\n", &pos_nb) != 1
+     || fgets(pos_list, sizeof(pos_list), f) == NULL){
+    fprintf(stderr, "form2pos file %s : cannot read header\n", filename);
+    fclose(f);
+    return NULL;
+  }
+
+  f2p = form2pos_new(nbelem, pos_nb, pos_list);
+
+  /* Loop on successful conversions instead of feof(): after a failed
+     fscanf() the old loop stored the previous entry again. The field widths
+     keep over-long fields inside form[] and signature[]. */
+  while(fscanf(f, "%299[^\t]\t%199s\n", form, signature) == 2)
+    hash_add(f2p->h_form2signature, strdup(form), dico_add(f2p->d_signature, signature));
+
+  fclose(f);
+  return f2p;
+}
+
+/* Returns what h_form2signature holds for form: the code dico_add() gave its
+   signature in form2pos_read(). form2pos_form_has_pos() treats -1 from the
+   same lookup as "form unknown". */
+int form2pos_get_signature(form2pos *f2p, char *form)
+{
+  return hash_get_val(f2p->h_form2signature, form);
+}
+
+/* Returns the character found at the index of pos in the signature of form,
+   or -1 (with a message on stderr) when pos or form is unknown or the
+   signature has no character at that index. */
+int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos)
+{
+  int pos_code = dico_string2int(f2p->d_pos, pos);
+  char *signature;
+  int signature_code;
+
+  if(pos_code == -1){
+    fprintf(stderr, "cat %s unknown\n", pos);
+    return -1;
+  }
+
+  signature_code = hash_get_val(f2p->h_form2signature, form);
+  if(signature_code == -1){
+    fprintf(stderr, "form %s unknown\n", form);
+    return -1;
+  }
+
+  signature = dico_int2string(f2p->d_signature, signature_code);
+  /* never index past the end of a signature shorter than the tag list */
+  if(signature == NULL || (size_t)pos_code >= strlen(signature)){
+    fprintf(stderr, "signature of form %s has no entry for cat %s\n", form, pos);
+    return -1;
+  }
+
+  return signature[pos_code];
+}
diff --git a/maca_common/src/mcd.c 
b/maca_common/src/mcd.c
index a11cae21294ab9a28cf5f51a558d0d1bb44e35fa..780627007c15339d30b79ab304a99be442282278 100644
--- a/maca_common/src/mcd.c
+++ b/maca_common/src/mcd.c
@@ -20,7 +20,6 @@ mcd *mcd_new(int nb_col)
   m->representation = (int *)memalloc(nb_col * sizeof(int));
   m->type = (int *)memalloc(nb_col * sizeof(int));
   m->type_str = (char **)memalloc(nb_col * sizeof(char *));
-  /* m->col2type = (int *)memalloc(nb_col * sizeof(int)); */
   m->filename = (char **)memalloc(nb_col * sizeof(char *));
   m->dico_array = (dico **)memalloc(nb_col * sizeof(dico *));
   m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *));
@@ -29,7 +28,6 @@ mcd *mcd_new(int nb_col)
     m->representation[i] = MCD_REPRESENTATION_NULL;
     m->type[i] = -1;
     m->type_str[i] = NULL;
-    /* m->col2type[i] = -1; */
     m->filename[i] = NULL;
     m->dico_array[i] = NULL;
     m->word_emb_array[i] = NULL;;
diff --git a/maca_trans_parser/src/context.c b/maca_trans_parser/src/context.c
index 780b64785c9db6ca218a1efe0f1810664afc4db2..76dff3db872da51e308bc45283963fa5529b9c57 100644
--- a/maca_trans_parser/src/context.c
+++ b/maca_trans_parser/src/context.c
@@ -23,6 +23,7 @@ void context_free(context *ctx)
   if(ctx->mcd_filename) free(ctx->mcd_filename);
   if(ctx->stag_desc_filename) free(ctx->stag_desc_filename);
   if(ctx->features_model_filename) free(ctx->features_model_filename);
+  if(ctx->f2p_filename) free(ctx->f2p_filename);
   if(ctx->maca_data_path) free(ctx->maca_data_path);
   if(ctx->language) free(ctx->language);
   if(ctx->root_label) free(ctx->root_label);
@@ -36,48 +37,58 @@ void context_free(context *ctx)
   if(ctx->features_model)
     feat_model_free(ctx->features_model);
 
+  if(ctx->f2p)
+    form2pos_free(ctx->f2p);
+
   free(ctx);
 }
 
 context *context_new(void)
 {
-  context *c = (context *)memalloc(sizeof(context));
-
-  c->verbose = 0;
-  c->program_name = NULL;
-  c->conll_filename = NULL;
-  c->perc_model_filename = NULL;
-  c->dnn_model_filename = NULL;
-  c->dico_features_filename = NULL;
-  c->dico_classes_filename = NULL;
-  c->cff_filename = NULL;
-  c->fann_filename = NULL;
-  c->stag_desc_filename = NULL;
-  c->mcd_filename = NULL;
-  c->features_model_filename = NULL;
-  c->vocabs_filename = NULL;
-
-  c->maca_data_path = NULL;
-  c->language = strdup("fr");
-
-  c->root_label = strdup("root");
-  c->d_perceptron_features = NULL;
-  c->mcd_struct = NULL;
-  c->features_model = NULL;
-  c->vocabs = NULL;
-  c->dico_labels = NULL;
-
-  c->iteration_nb = 4;
-  c->debug_mode = 0;
-  c->feature_cutoff = 0;
-  c->help = 0;
-  c->hash_ratio = 0.5;
-  c->mode = TRAIN_MODE;
-  c->beam_width = 1;
-  c->sent_nb = 1000000;
-  c->hidden_neurons_nb = 100;
-  c->stream_mode = 0;
-  return c;
+  context *ctx = (context *)memalloc(sizeof(context));
+
+  ctx->verbose = 0;
+  ctx->program_name = NULL;
+  ctx->conll_filename = NULL;
+  ctx->perc_model_filename = NULL;
+  ctx->dnn_model_filename = NULL;
+  ctx->dico_features_filename = NULL;
+  ctx->dico_classes_filename = NULL;
+  ctx->cff_filename = NULL;
+  ctx->fann_filename = NULL;
+  ctx->stag_desc_filename = NULL;
+  ctx->mcd_filename = NULL;
+  ctx->features_model_filename = NULL;
+  ctx->vocabs_filename = NULL;
+  ctx->f2p_filename = NULL;
+
+  ctx->maca_data_path = NULL;
+  ctx->language = strdup("fr");
+
+  ctx->root_label = strdup("root");
+  ctx->d_perceptron_features = NULL;
+  ctx->mcd_struct = NULL;
+  ctx->features_model = NULL;
+  ctx->vocabs = NULL;
+  ctx->dico_labels = NULL;
+  ctx->f2p = NULL;
+
+
+  ctx->iteration_nb = 4;
+  ctx->debug_mode = 0;
+  ctx->feature_cutoff = 0;
+  ctx->help = 0;
+  ctx->hash_ratio = 0.5;
+  ctx->mode = TRAIN_MODE;
+  ctx->beam_width = 1;
+  ctx->sent_nb = 1000000;
+  ctx->hidden_neurons_nb = 100;
+  ctx->stream_mode = 0;
+
+  ctx->form_column = -1;
+
+
+  return ctx;
 }
 
 void context_general_help_message(context *ctx)
@@ -160,6 +171,10 @@ void context_root_label_help_message(context *ctx){
   fprintf(stderr, "\t-R --root_label : name of the root label (default is \"root\")\n");
 }
 
+void context_f2p_filename_help_message(context *ctx){
+  fprintf(stderr, "\t-P --f2p : form to pos (f2p) filename\n");
+}
+
 context *context_read_options(int argc, char *argv[])
 {
   int c;
@@ -168,7 +183,7 @@ context *context_read_options(int argc, char *argv[])
 
   ctx->program_name = strdup(argv[0]);
 
-  static struct option long_options[26] =
+  static struct option long_options[28] =
     {
       {"help", no_argument, 0, 'h'},
       {"verbose", no_argument, 0, 'v'},
@@ -195,12 +210,14 @@ context *context_read_options(int argc, char *argv[])
       {"stream", required_argument, 0, 'T'},
       {"language", required_argument, 0, 'X'},
       {"maca_data_path", required_argument, 0, 'Y'},
-      {"root_label", required_argument, 0, 'R'}
+      {"root_label", required_argument, 0, 'R'},
+      {"form_col", required_argument, 0, 'O'},
+      {"f2p", required_argument, 0, 'P'}
     };
   optind = 0;
   opterr = 0;
 
-  while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:R:", long_options, &option_index)) != -1){
+  while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:R:O:P:", long_options, &option_index)) != -1){
     switch (c)
       {
       case 'd':
@@ -279,6 +296,13 @@ context *context_read_options(int argc, char *argv[])
       case 'R':
 	ctx->root_label = strdup(optarg);
 	break;
+      case 'O':
+	ctx->form_column = atoi(optarg);
+	break;
+      case 'P':
+	ctx->f2p_filename = strdup(optarg);
+	ctx->f2p = form2pos_read(ctx->f2p_filename);
+	break;
       }
   }
 
@@ -299,7 +323,18 @@ context *context_read_options(int argc, char *argv[])
   feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);
   */
 
-  if(ctx->mcd_filename == NULL){
+  /* if the form column has been set by user, change it in the mcd file */
+  /*  if(ctx->form_column != -1){
+    ctx->mcd_struct = mcd_new(ctx->form_column + 1);
+    mcd_set_form_col(ctx->mcd_struct, ctx->form_column);
+    ctx->mcd_struct->representation[ctx->form_column] = MCD_REPRESENTATION_VOCAB;
+    ctx->mcd_struct->filename[ctx->form_column] = strdup("_");
+    ctx->mcd_struct->dico_array[ctx->form_column] = NULL;
+    ctx->mcd_struct->type_str[ctx->form_column] = strdup("FORM");
+
+    }*/
+
+  if(ctx->mcd_struct == NULL){
     ctx->mcd_struct = mcd_build_conll07();
   }
 
@@ -353,3 +388,47 @@ void context_set_linguistic_resources_filenames(context *ctx)
     fprintf(stdout, "perc_features_model_filename = %s\n", ctx->features_model_filename);*/
 
 }
+
+/* Builds a heap allocated copy of dir followed by name; NULL if malloc fails. */
+static char *context_join_path(const char *dir, const char *name)
+{
+  size_t len = strlen(dir) + strlen(name) + 1;
+  char *path = malloc(len);
+
+  if(path) snprintf(path, len, "%s%s", dir, name);
+  return path;
+}
+
+/* Gives the tagger model, vocabs and features model filenames the default
+   <data path>/<language>/bin/<default name> when they are still unset.
+   <data path> is ctx->maca_data_path or, failing that, $MACAON_DIR.
+   No default is applied to mcd_filename. */
+void context_set_linguistic_resources_filenames_tagger(context *ctx)
+{
+  const char *data_path = ctx->maca_data_path ? ctx->maca_data_path : getenv("MACAON_DIR");
+  char *absolute_path;
+  size_t len;
+
+  /* getenv() returns NULL for an unset variable and strcat() cannot take NULL */
+  if(data_path == NULL){
+    fprintf(stderr, "neither maca_data_path nor MACAON_DIR is set : no default tagger filenames\n");
+    return;
+  }
+
+  /* sized from its parts: fixed 500 byte buffers overflowed on long paths */
+  len = strlen(data_path) + strlen(ctx->language) + sizeof("//bin/");
+  absolute_path = malloc(len);
+  if(absolute_path == NULL) return;
+  snprintf(absolute_path, len, "%s/%s/bin/", data_path, ctx->language);
+
+  if(!ctx->perc_model_filename)
+    ctx->perc_model_filename = context_join_path(absolute_path, DEFAULT_MODEL_TAGGER_FILENAME);
+
+  if(!ctx->vocabs_filename)
+    ctx->vocabs_filename = context_join_path(absolute_path, DEFAULT_VOCABS_TAGGER_FILENAME);
+
+  if(!ctx->features_model_filename)
+    ctx->features_model_filename = context_join_path(absolute_path, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);
+
+  free(absolute_path);
+}
diff --git a/maca_trans_parser/src/context.h b/maca_trans_parser/src/context.h
index 2b0ed94a6cae35a67b2c767c81734c4ab04a177a..83d859bf4cc8ba54493b095d738e7be700971bf2 100644
--- a/maca_trans_parser/src/context.h
+++ 
b/maca_trans_parser/src/context.h
@@ -4,15 +4,22 @@
 #define TEST_MODE 1
 #define TRAIN_MODE 2
 
+
 #define DEFAULT_MULTI_COL_DESC_FILENAME "maca_trans_parser.mcd"
 #define DEFAULT_FEATURES_MODEL_FILENAME "maca_trans_parser.fm"
 #define DEFAULT_VOCABS_FILENAME "maca_trans_parser.vocab"
 #define DEFAULT_MODEL_FILENAME "maca_trans_parser.model"
 
+#define DEFAULT_MULTI_COL_DESC_TAGGER_FILENAME "maca_trans_tagger.mcd"
+#define DEFAULT_FEATURES_MODEL_TAGGER_FILENAME "maca_trans_tagger.fm"
+#define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab"
+#define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model"
+
 #include "dico_vec.h"
 #include "feat_model.h"
 #include "mcd.h"
 #include "stdlib.h"
+#include "form2pos.h"
 
 typedef struct {
   int help;
@@ -25,6 +32,7 @@ typedef struct {
   char *cff_filename;
   char *fann_filename;
   char *stag_desc_filename;
+  char *f2p_filename;          /* name of the form to pos file given with -P */
   int hidden_neurons_nb;
   int iteration_nb;
   int debug_mode;
@@ -48,6 +56,8 @@ typedef struct {
   char *maca_data_path;
   char *language;
   char *root_label;
+  int form_column;             /* value of -O, -1 when the option was not given */
+  form2pos *f2p;               /* table read from f2p_filename, NULL without -P */
 } context;
 
 context *context_new(void);
@@ -79,5 +89,8 @@ void context_print_alphabets(context *ctx);
 void context_language_help_message(context *ctx);
 void context_maca_data_path_help_message(context *ctx);
 
+void context_f2p_filename_help_message(context *ctx);
+void context_set_linguistic_resources_filenames_tagger(context *ctx);
+
 
 #endif
diff --git a/maca_trans_parser/src/decode_tagger.c b/maca_trans_parser/src/decode_tagger.c
index 30951cc49aa171edd888ae565e25ab0cdb7fe68b..f2d92c876b3833eb9002979afc73313251e0ffa5 100644
--- a/maca_trans_parser/src/decode_tagger.c
+++ b/maca_trans_parser/src/decode_tagger.c
@@ -8,6 +8,7 @@
 #include"feature_table.h"
 #include"dico.h"
 #include"beam.h"
+#include"form2pos.h"
 #include"simple_decoder_tagger.h"
 /*#include"dnn_decoder.h"*/
 #include"config2feat_vec.h"
@@ -22,9 +23,9 @@ void decode_help_message(context *ctx)
   context_model_help_message(ctx);
   context_vocabs_help_message(ctx);
   context_features_model_help_message(ctx);
   context_language_help_message(ctx);
   context_maca_data_path_help_message(ctx);
-
+  context_f2p_filename_help_message(ctx);
 }
 
 void decode_check_options(context *ctx){
@@ -44,58 +45,19 @@ int main(int argc, char *argv[])
 {
   FILE *conll_file = NULL;
   context *ctx;
-  feature_table *ft;
   /* struct fann *ann; */
-  int root_label;
-  dico *dico_pos;
+
   ctx = context_read_options(argc, argv);
   decode_check_options(ctx);
 
   ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
   mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs);
-
-  dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
-  /* when in stream mode, force to renumber the tokens (ugly !) */
-  if(ctx->stream_mode){
-    ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1;
-  }
-
-
-  /* load models */
-
-  if(ctx->perc_model_filename){
-    /* ctx->d_perceptron_features = dico_read(ctx->perceptron_features_filename, ctx->hash_ratio); */
-    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
-    ft = feature_table_load(ctx->perc_model_filename);
-    /* hash_stats(dico_features->htable); */
-  }
-
-
-  /*  else if(ctx->dnn_model_filename){
-    ann = fann_create_from_file(ctx->dnn_model_filename);
-    if(!ann){
-      fprintf(stderr, "Error creating ann --- ABORTING.\n");
-      return -1;
-    }
-  }
-  else{*/
-
-  if(ctx->conll_filename)
-    conll_file= myfopen(ctx->conll_filename, "r");
-  else
-    conll_file = stdin;
-
-  if(ctx->perc_model_filename){
-    if(ctx->beam_width == 1){
-      simple_decoder_tagger(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, ctx->stream_mode);
-    }
-    else
-      beam_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, root_label, ctx->beam_width, ctx->mvt_nb);
-  }
-  /*  else if(ctx->dnn_model_filename){
-    dnn_decoder(conll_file, ctx->mcd_struct, ann, ctx->features_model, ctx->verbose, root_label, ctx->stream_mode);
-    }*/
+  if(ctx->beam_width == 1)
+    simple_decoder_tagger(ctx);
+  else  /* only the simple decoder is wired in: say so instead of exiting silently */
+    fprintf(stderr, "beam width %d is not supported by the tagger, nothing decoded\n", ctx->beam_width);
+
   context_free(ctx);
   return 0;
 }
diff --git a/maca_trans_parser/src/feat_fct.c b/maca_trans_parser/src/feat_fct.c
index 9fccd97116e989c65eb381bf8e1fb2726dfac829..f8b45feaa8ddee247aeadb5e188ae71483886eb7 100644
--- a/maca_trans_parser/src/feat_fct.c
+++ b/maca_trans_parser/src/feat_fct.c
@@ -41,6 +41,7 @@ int s0Y(config *c) {return (stack_nbelem(config_get_stack(c)) < 1) ? -1 : word_g
 int s0Z(config *c) {return (stack_nbelem(config_get_stack(c)) < 1) ? -1 : word_get_Z(stack_elt_n(config_get_stack(c), 0));}
 int s0U1(config *c) {return (stack_nbelem(config_get_stack(c)) < 1) ? -1 : stack_elt_n(config_get_stack(c), 0)->U1;}
+int s0sgn(config *c) {return (stack_nbelem(config_get_stack(c)) < 1) ? -1 : stack_elt_n(config_get_stack(c), 0)->signature;}
 int s1f(config *c) {return (stack_nbelem(config_get_stack(c)) < 2) ? -1 : word_get_form(stack_elt_n(config_get_stack(c), 1));}
 int s1l(config *c) {return (stack_nbelem(config_get_stack(c)) < 2) ? -1 : word_get_lemma(stack_elt_n(config_get_stack(c), 1));}
@@ -175,6 +176,7 @@ int b0Y(config *c) {return (queue_nbelem(config_get_buffer(c)) < 1) ? -1 : word_
 int b0Z(config *c) {return (queue_nbelem(config_get_buffer(c)) < 1) ? -1 : word_get_Z(queue_elt_n(config_get_buffer(c), 0));}
 int b0U1(config *c) {return (queue_nbelem(config_get_buffer(c)) < 1) ? -1 : queue_elt_n(config_get_buffer(c), 0)->U1;}
+int b0sgn(config *c) {return (queue_nbelem(config_get_buffer(c)) < 1) ? -1 : queue_elt_n(config_get_buffer(c), 0)->signature;}
 int b1f(config *c) {return (queue_nbelem(config_get_buffer(c)) < 2) ? -1 : word_get_form(queue_elt_n(config_get_buffer(c), 1));}
@@ -211,6 +213,7 @@ int b1Y(config *c) {return (queue_nbelem(config_get_buffer(c)) < 2) ? -1 : word_
 int b1Z(config *c) {return (queue_nbelem(config_get_buffer(c)) < 2) ? -1 : word_get_Z(queue_elt_n(config_get_buffer(c), 1));}
 int b1U1(config *c) {return (queue_nbelem(config_get_buffer(c)) < 2) ? 
-1 : queue_elt_n(config_get_buffer(c), 1)->U1;}
+int b1sgn(config *c) {return (queue_nbelem(config_get_buffer(c)) < 2) ? -1 : queue_elt_n(config_get_buffer(c), 1)->signature;}
 int b2f(config *c) {return (queue_nbelem(config_get_buffer(c)) < 3) ? -1 : word_get_form(queue_elt_n(config_get_buffer(c), 2));}
 int b2l(config *c) {return (queue_nbelem(config_get_buffer(c)) < 3) ? -1 : word_get_lemma(queue_elt_n(config_get_buffer(c), 2));}
diff --git a/maca_trans_parser/src/feat_fct.h b/maca_trans_parser/src/feat_fct.h
index 02e31894811f92f80ff22f2b7cc9d7953d48efbc..cce0f63febd02a6ac80c0000023645b9603ec309 100644
--- a/maca_trans_parser/src/feat_fct.h
+++ b/maca_trans_parser/src/feat_fct.h
@@ -40,6 +40,7 @@ int s0Y(config *c);
 int s0Z(config *c);
 int s0U1(config *c);
+int s0sgn(config *c);
 int s0r(config *c);
@@ -181,7 +182,9 @@ int b0X(config *c);
 int b0Y(config *c);
 int b0Z(config *c);
 int b0r(config *c);
+
 int b0U1(config *c);
+int b0sgn(config *c);
 int b1f(config *c);
 int b1l(config *c);
@@ -215,7 +218,9 @@ int b1W(config *c);
 int b1X(config *c);
 int b1Y(config *c);
 int b1Z(config *c);
+
 int b1U1(config *c);
+int b1sgn(config *c);
 int b1r(config *c);
diff --git a/maca_trans_parser/src/feat_lib.c b/maca_trans_parser/src/feat_lib.c
index ef78e8084abc766d340e29ea3d729c6dbecc6546..a75a8130e5be273cd76a5910b74abc5df9b19076 100644
--- a/maca_trans_parser/src/feat_lib.c
+++ b/maca_trans_parser/src/feat_lib.c
@@ -68,6 +68,7 @@ feat_lib *feat_lib_build(void)
   feat_lib_add(fl, FEAT_TYPE_Y, (char *)"s0Z", s0Z);
   feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"s0U1", s0U1);
+  feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s0sgn", s0sgn);
   feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"s1f", s1f);
@@ -213,6 +214,7 @@ feat_lib *feat_lib_build(void)
   feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b0Z", b0Z);
   feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b0U1", b0U1);
+  feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0sgn", b0sgn);
   feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b1f", b1f);
   feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"b1l", b1l);
@@ -249,6 +251,7 @@ feat_lib *feat_lib_build(void)
   feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b1Z", b1Z);
   feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b1U1", b1U1);
+  feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b1sgn", b1sgn);
   feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b2f", b2f);
diff --git a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
index 146c728cd692ad5051fcc0f8d3d48aea0e69dae1..cfd79657fe15d1f44fa55b535266f3d674ebf5f6 100644
--- a/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
+++ b/maca_trans_parser/src/maca_trans_parser_conll2cff_tagger.c
@@ -13,6 +13,21 @@
 #include"word_emb.h"
 #include"config2feat_vec.h"
 
+/* Stores in every word of bf the signature code f2p holds for its form;
+   a word whose form was never read gets -1 instead of a lookup on NULL. */
+static void add_signature_to_words_in_queue(queue *bf, form2pos *f2p)
+{
+  int i;
+  word *w;
+
+  for(i=0; i < queue_nbelem(bf); i++){
+    w = queue_elt_n(bf, i);
+    /* printf("add signature %d to word %s\n", form2pos_get_signature(f2p, w->form), w->form); */
+    w->signature = w->form ? form2pos_get_signature(f2p, w->form) : -1;
+  }
+}
+
+
 void maca_trans_parser_conll2cff_help_message(context *ctx)
 {
   context_general_help_message(ctx);
@@ -94,6 +109,11 @@ void generate_training_file_buffer(FILE *output_file, context *ctx)
     /* sentence_print(stdout, ref, NULL);  */
     queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);
     queue_remove(c->bf); /* get rid of dummy token */
+
+    if(ctx->f2p)
+      add_signature_to_words_in_queue(c->bf, ctx->f2p);
+
+
     while(!config_is_terminal(c)){
       /* config_print(stdout, c);   */
       config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c
index e50a1ae8144223c2581bd4fd8d0e77bda42cbb70..6eb1cce8f7c16d915f5ffc93ad044c65e8c63b92 100644
--- a/maca_trans_parser/src/simple_decoder_tagger.c
+++ b/maca_trans_parser/src/simple_decoder_tagger.c
@@ -10,36 +10,45 @@
 #include"feature_table.h"
 #include"dico.h"
 
-void simple_decoder_buffer(FILE *f, 
mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose);
-void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose);
-
-
-void simple_decoder_tagger(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose, int stream_mode)
+static void add_signature_to_words_in_queue(queue *bf, form2pos *f2p)  /* file local: sets word->signature for every word of bf */
 {
-  if(stream_mode)
-    simple_decoder_stream(f, mcd_struct, d_perceptron_features, dico_pos, ft, fm, verbose);
-  else
-    simple_decoder_buffer(f, mcd_struct, d_perceptron_features, dico_pos, ft, fm, verbose);
+  int i;
+  word *w;
+
+  for(i=0; i < queue_nbelem(bf); i++){
+    w = queue_elt_n(bf, i);
+    w->signature = w->form ? form2pos_get_signature(f2p, w->form) : -1;  /* no lookup on a NULL form */
+  }
 }
 
-void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose)
+
+void simple_decoder_buffer(context *ctx)
 {
-  config *c;
+  FILE *f = NULL;
+  dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
+  feature_table *ft = feature_table_load(ctx->perc_model_filename);
+  config *c = NULL;
   int postag;
   feat_vec *fv = feat_vec_new(feature_types_nb);
   float max;
   int i;
   word *w;
 
-  c = config_initial(f, mcd_struct, 1000, 0);
+  if(ctx->conll_filename)  /* read the named conll file, stdin otherwise */
+    f= myfopen(ctx->conll_filename, "r");
+  else
+    f= stdin;
+
+  c = config_initial(f, ctx->mcd_struct, 1000, 0);
 
   /* read a sentence and put it in the buffer */
-  while(queue_read_sentence(c->bf, f, mcd_struct)){
+  while(queue_read_sentence(c->bf, f, ctx->mcd_struct)){
     queue_remove(c->bf); /* get rid of dummy token */
+    if(ctx->f2p)
+      add_signature_to_words_in_queue(c->bf, ctx->f2p);
     while(!config_is_terminal(c)){
-
-      config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE);
-
+      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
+
       postag = feature_table_argmax(fv, ft, &max);
       if(postag != -1)
 	movement_tagger(c, postag, max, 0);
@@ -52,20 +61,29 @@ void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *
     }
     /* config_free(c);  */
-    c = config_initial(f, mcd_struct, 1000, 0);
+    c = config_initial(f, ctx->mcd_struct, 1000, 0);
   }
+  if(ctx->conll_filename)
+    fclose(f);
 }
 
-void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose)
+/* Stream mode decoder. The input is opened here, as in simple_decoder_buffer():
+   the caller no longer passes a FILE and config_initial() must not get NULL. */
+void simple_decoder_stream(context *ctx)
 {
   config *c;
   feat_vec *fv = feat_vec_new(feature_types_nb);
+  FILE *f = ctx->conll_filename ? myfopen(ctx->conll_filename, "r") : stdin;
+
 
-  c = config_initial(f, mcd_struct, 10, 5);
+  /* when in stream mode, force to renumber the tokens (ugly !) */
+  ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1;
+
+  c = config_initial(f, ctx->mcd_struct, 10, 5);
 
   while(!config_is_terminal(c)){
     config_print(stdout, c);
-    config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE);
+    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
   }
@@ -75,3 +93,16 @@ void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *
   /* config_free(c);  */
 }
+
+
+/* Tagger entry point: fetches d_perceptron_features from ctx->vocabs, then runs the stream or the buffer decoder according to ctx->stream_mode. */
+void simple_decoder_tagger(context *ctx)
+{
+  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+
+  if(ctx->stream_mode)
+    simple_decoder_stream(ctx);
+  else
+    simple_decoder_buffer(ctx);
+}
+
diff --git a/maca_trans_parser/src/simple_decoder_tagger.h b/maca_trans_parser/src/simple_decoder_tagger.h
index 2aea2ce2817da1c384fd88197400d90fe4379d18..28f4f1091d01c97638b0a891c76176c68042992a 100644
--- a/maca_trans_parser/src/simple_decoder_tagger.h
+++ b/maca_trans_parser/src/simple_decoder_tagger.h
@@ -1,6 +1,6 @@
 #ifndef 
__SIMPLE_DECODER_TAGGER__
 #define __SIMPLE_DECODER_TAGGER__
-void simple_decoder_tagger(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *d_labels, feature_table *ft, feat_model *fm, int verbose, int stream_mode);
+void simple_decoder_tagger(context *ctx);
 #endif
diff --git a/maca_trans_parser/src/word.c b/maca_trans_parser/src/word.c
index bdb63930b3fa801c4608b1529ff5ce6607f89151..d2a28f71b0d2eca8ec7d6196f7d1c5e3b8483783 100644
--- a/maca_trans_parser/src/word.c
+++ b/maca_trans_parser/src/word.c
@@ -16,6 +16,8 @@ word *word_new(char *input)
   w->input = strdup(input);
 
   for(i=0; i < FEAT_TYPE_NB; i++) w->feat_array[i] = -1;
+  w->form = NULL;
+  w->signature = -1;  /* read by s0sgn/b0sgn/b1sgn even when no form2pos table sets it */
 
   return w;
 }
@@ -52,6 +54,7 @@ word *word_parse_buffer(char *buffer, mcd *mcd_struct)
       w->feat_array[mcd_struct->type[column_nb]] = mcd_get_code(mcd_struct, token, column_nb);
     }
     if(mcd_struct->type[column_nb] == FEAT_TYPE_FORM){
+      w->form = strdup(token);  /* private copy of the form text, released in word_free() */
       w->U1 = isupper(token[0]) ? 1 : 0;
     }
     column_nb++;
@@ -76,6 +79,7 @@ void word_free(word *w)
 {
   if(w == NULL) return;
   if(w->input) free(w->input);
+  if(w->form) free(w->form);
   free(w);
 }
 
diff --git a/maca_trans_parser/src/word.h b/maca_trans_parser/src/word.h
index 412f3a1361bbc80f23056ff34c20ee5827685a3f..94141984f25f9db625ff30e16d4efb6d71a50add 100644
--- a/maca_trans_parser/src/word.h
+++ b/maca_trans_parser/src/word.h
@@ -38,6 +38,7 @@
 #define word_get_X(w) (w)->feat_array[FEAT_TYPE_X]
 #define word_get_Y(w) (w)->feat_array[FEAT_TYPE_Y]
 #define word_get_Z(w) (w)->feat_array[FEAT_TYPE_Z]
+#define word_get_signature(w) (w)->signature
 
 #define word_set_index(w, val) (w)->feat_array[FEAT_TYPE_INDEX] = (val)
 #define word_set_form(w, val) (w)->feat_array[FEAT_TYPE_FORM] = (val)
@@ -74,12 +75,15 @@
 #define word_set_X(w, val) (w)->feat_array[FEAT_TYPE_X] = (val)
 #define word_set_Y(w, val) (w)->feat_array[FEAT_TYPE_Y] = (val)
 #define word_set_Z(w, val) (w)->feat_array[FEAT_TYPE_Z] = (val)
+#define word_set_signature(w, val) (w)->signature = (val)
 
 typedef struct _word {
   int feat_array[FEAT_TYPE_NB];
   char *input;
-  int U1; /* does the form begin with an uppercase character */
+  int U1;          /* does the form begin with an uppercase character */
+  int signature;   /* code of the form's signature in the form2pos table (the boolean string of pos tags the form can have), -1 when not set */
   int label;
+  char *form;      /* copy of the FORM column text, NULL when no FORM column was read */
 } word;
 
 word *word_new(char *input);