Commit a4db64a2 authored by Alexis Nasr
Browse files

added external lexicon to maca_trans_tagger

parent 4863067b
......@@ -5,6 +5,7 @@ set(SOURCES src/util.c
src/mcd.c
src/dico_vec.c
src/feat_types.c
src/form2pos.c
)
#compiling library
......
#ifndef __FORM2POS__
#define __FORM2POS__
#include"hash.h"
#include"dico.h"
/* External lexicon associating word forms with a POS signature. */
typedef struct
{
int nbelem; /* number of forms announced in the lexicon header (see form2pos_read) */
int pos_nb; /* number of pos tags announced in the lexicon header */
dico *d_pos; /* pos tags, filled from the tab separated tag list given to form2pos_new */
dico *d_signature; /* signature strings; form2pos_form_has_pos indexes a signature by pos code */
hash *h_form2signature; /* form -> code of its signature in d_signature */
} form2pos;
form2pos *form2pos_new(int nbelem, int pos_nb, char *pos_list);
void form2pos_free(form2pos *f2p);
form2pos *form2pos_read(char *filename);
int form2pos_get_signature(form2pos *f2p, char *form);
int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos);
#endif
......@@ -17,6 +17,8 @@
#define mcd_get_dico_label(m) (m)->dico_array[FEAT_TYPE_LABEL]
#define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM]
#define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v)
typedef struct {
int nb_col;
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"form2pos.h"
#include"util.h"
/*
 * Allocate a form2pos lexicon and register its pos tags.
 *
 * nbelem   : expected number of forms, used to size the form hash table
 * pos_nb   : expected number of pos tags, used to size both dictionaries
 * pos_list : tab separated list of pos tags; it is tokenised with strtok and
 *            is therefore MODIFIED in place. A trailing end of line (as left
 *            by fgets) is ignored so that it does not stick to the last tag.
 *            NULL or a list holding no tag yields an empty d_pos.
 *
 * Returns the new structure, to be released with form2pos_free.
 */
form2pos *form2pos_new(int nbelem, int pos_nb, char *pos_list)
{
  static const char separators[] = "\t\r\n";
  form2pos *f2p = memalloc(sizeof(form2pos));
  char *token;

  f2p->nbelem = nbelem;
  f2p->pos_nb = pos_nb;
  f2p->d_pos = dico_new("d_pos", pos_nb * 10);
  f2p->d_signature = dico_new("d_signature", pos_nb * 10);
  f2p->h_form2signature = hash_new(nbelem * 4);

  if(pos_list != NULL){
    /* a for loop (instead of do/while) never hands a NULL token to strdup */
    for(token = strtok(pos_list, separators); token != NULL; token = strtok(NULL, separators)){
      /* NOTE(review): strdup kept from the original; if dico_add copies its key this duplicate is leaked -- confirm */
      dico_add(f2p->d_pos, strdup(token));
    }
  }
  return f2p;
}
/*
 * Release a form2pos structure together with its two dictionaries and its
 * form hash table. Passing NULL is a no-op, as with free().
 */
void form2pos_free(form2pos *f2p)
{
  if(f2p == NULL)
    return;
  dico_free(f2p->d_pos);
  dico_free(f2p->d_signature);
  hash_free(f2p->h_form2signature);
  free(f2p);
}
/*
 * Load a form -> signature lexicon from a text file.
 *
 * Layout consumed by this function:
 *   line 1 : number of forms
 *   line 2 : number of pos tags
 *   line 3 : tab separated list of pos tags
 *   then   : one "form<TAB>signature" entry per line
 *
 * Returns a newly built form2pos (release it with form2pos_free), or NULL
 * after printing a message on stderr when the header cannot be parsed.
 * Reading of entries stops at the first line that does not yield both a
 * form and a signature.
 */
form2pos *form2pos_read(char *filename)
{
  FILE *f = myfopen(filename, "r");
  int nbelem;
  int pos_nb;
  char pos_list[10000];
  char form[300];      /* keep in sync with the %299 width below */
  char signature[200]; /* keep in sync with the %199 width below */
  form2pos *f2p = NULL;

  if(f == NULL)
    return NULL;

  /* header: number of forms, number of pos tags, list of pos tags */
  if(fscanf(f, "%d\n", &nbelem) != 1
     || fscanf(f, "%d\n", &pos_nb) != 1
     || nbelem < 0 || pos_nb < 0
     || fgets(pos_list, sizeof(pos_list), f) == NULL){
    fprintf(stderr, "form2pos_read: malformed header in %s\n", filename);
    fclose(f);
    return NULL;
  }

  /* fgets keeps the end of line: drop it so it does not end up in the last tag */
  pos_list[strcspn(pos_list, "\r\n")] = '\0';
  if(pos_list[strspn(pos_list, "\t")] == '\0'){
    fprintf(stderr, "form2pos_read: empty pos list in %s\n", filename);
    fclose(f);
    return NULL;
  }

  f2p = form2pos_new(nbelem, pos_nb, pos_list);

  /* test the fscanf result rather than feof(): a failed read must not add an entry,
     and the field widths keep over-long fields from overflowing the buffers */
  while(fscanf(f, "%299[^\t]\t%199s\n", form, signature) == 2){
    hash_add(f2p->h_form2signature, strdup(form), dico_add(f2p->d_signature, signature));
  }

  fclose(f);
  return f2p;
}
/*
 * Look up form in the lexicon and return the value stored for it in
 * h_form2signature, i.e. the code of its signature in d_signature.
 * The result is exactly what hash_get_val yields; form2pos_form_has_pos
 * treats -1 as "form unknown".
 */
int form2pos_get_signature(form2pos *f2p, char *form)
{
  int signature_code;

  signature_code = hash_get_val(f2p->h_form2signature, form);
  return signature_code;
}
/*
 * Tell what the lexicon records for the pair (form, pos).
 *
 * Returns the character found at index <code of pos in d_pos> of the
 * signature string associated with form, or -1 (with a message on stderr)
 * when the pos tag is unknown, the form is unknown, or the signature is
 * missing or too short to hold an entry for that pos.
 *
 * NOTE(review): the value is the raw character of the signature, not a
 * boolean; if signatures are made of '0'/'1' characters both are non zero,
 * so callers should compare against the expected character -- confirm.
 */
int form2pos_form_has_pos(form2pos *f2p, char *form, char *pos)
{
  int pos_code = dico_string2int(f2p->d_pos, pos);
  char *signature;
  int signature_code;

  if(pos_code == -1){
    fprintf(stderr, "cat %s unknown\n", pos);
    return -1;
  }

  signature_code = hash_get_val(f2p->h_form2signature, form);
  if(signature_code == -1){
    fprintf(stderr, "form %s unknown\n", form);
    return -1;
  }

  signature = dico_int2string(f2p->d_signature, signature_code);
  /* never index past the end of the signature string */
  if(signature == NULL || pos_code < 0 || strlen(signature) <= (size_t)pos_code){
    fprintf(stderr, "signature of form %s has no entry for cat %s\n", form, pos);
    return -1;
  }
  return signature[pos_code];
}
......@@ -20,7 +20,6 @@ mcd *mcd_new(int nb_col)
m->representation = (int *)memalloc(nb_col * sizeof(int));
m->type = (int *)memalloc(nb_col * sizeof(int));
m->type_str = (char **)memalloc(nb_col * sizeof(char *));
/* m->col2type = (int *)memalloc(nb_col * sizeof(int)); */
m->filename = (char **)memalloc(nb_col * sizeof(char *));
m->dico_array = (dico **)memalloc(nb_col * sizeof(dico *));
m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *));
......@@ -29,7 +28,6 @@ mcd *mcd_new(int nb_col)
m->representation[i] = MCD_REPRESENTATION_NULL;
m->type[i] = -1;
m->type_str[i] = NULL;
/* m->col2type[i] = -1; */
m->filename[i] = NULL;
m->dico_array[i] = NULL;
m->word_emb_array[i] = NULL;;
......
......@@ -23,6 +23,7 @@ void context_free(context *ctx)
if(ctx->mcd_filename) free(ctx->mcd_filename);
if(ctx->stag_desc_filename) free(ctx->stag_desc_filename);
if(ctx->features_model_filename) free(ctx->features_model_filename);
if(ctx->f2p_filename) free(ctx->f2p_filename);
if(ctx->maca_data_path) free(ctx->maca_data_path);
if(ctx->language) free(ctx->language);
if(ctx->root_label) free(ctx->root_label);
......@@ -36,48 +37,58 @@ void context_free(context *ctx)
if(ctx->features_model)
feat_model_free(ctx->features_model);
if(ctx->f2p)
form2pos_free(ctx->f2p);
free(ctx);
}
context *context_new(void)
{
context *c = (context *)memalloc(sizeof(context));
c->verbose = 0;
c->program_name = NULL;
c->conll_filename = NULL;
c->perc_model_filename = NULL;
c->dnn_model_filename = NULL;
c->dico_features_filename = NULL;
c->dico_classes_filename = NULL;
c->cff_filename = NULL;
c->fann_filename = NULL;
c->stag_desc_filename = NULL;
c->mcd_filename = NULL;
c->features_model_filename = NULL;
c->vocabs_filename = NULL;
c->maca_data_path = NULL;
c->language = strdup("fr");
c->root_label = strdup("root");
c->d_perceptron_features = NULL;
c->mcd_struct = NULL;
c->features_model = NULL;
c->vocabs = NULL;
c->dico_labels = NULL;
c->iteration_nb = 4;
c->debug_mode = 0;
c->feature_cutoff = 0;
c->help = 0;
c->hash_ratio = 0.5;
c->mode = TRAIN_MODE;
c->beam_width = 1;
c->sent_nb = 1000000;
c->hidden_neurons_nb = 100;
c->stream_mode = 0;
return c;
context *ctx = (context *)memalloc(sizeof(context));
ctx->verbose = 0;
ctx->program_name = NULL;
ctx->conll_filename = NULL;
ctx->perc_model_filename = NULL;
ctx->dnn_model_filename = NULL;
ctx->dico_features_filename = NULL;
ctx->dico_classes_filename = NULL;
ctx->cff_filename = NULL;
ctx->fann_filename = NULL;
ctx->stag_desc_filename = NULL;
ctx->mcd_filename = NULL;
ctx->features_model_filename = NULL;
ctx->vocabs_filename = NULL;
ctx->f2p_filename = NULL;
ctx->maca_data_path = NULL;
ctx->language = strdup("fr");
ctx->root_label = strdup("root");
ctx->d_perceptron_features = NULL;
ctx->mcd_struct = NULL;
ctx->features_model = NULL;
ctx->vocabs = NULL;
ctx->dico_labels = NULL;
ctx->f2p = NULL;
ctx->iteration_nb = 4;
ctx->debug_mode = 0;
ctx->feature_cutoff = 0;
ctx->help = 0;
ctx->hash_ratio = 0.5;
ctx->mode = TRAIN_MODE;
ctx->beam_width = 1;
ctx->sent_nb = 1000000;
ctx->hidden_neurons_nb = 100;
ctx->stream_mode = 0;
ctx->form_column = -1;
return ctx;
}
void context_general_help_message(context *ctx)
......@@ -160,6 +171,10 @@ void context_root_label_help_message(context *ctx){
fprintf(stderr, "\t-R --root_label : name of the root label (default is \"root\")\n");
}
/* Print on stderr the usage line of the -P / --f2p option. ctx is not used. */
void context_f2p_filename_help_message(context *ctx){
  (void)ctx;
  fputs("\t-P --f2p : form to pos (f2p) filename\n", stderr);
}
context *context_read_options(int argc, char *argv[])
{
int c;
......@@ -168,7 +183,7 @@ context *context_read_options(int argc, char *argv[])
ctx->program_name = strdup(argv[0]);
static struct option long_options[26] =
static struct option long_options[28] =
{
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
......@@ -195,12 +210,14 @@ context *context_read_options(int argc, char *argv[])
{"stream", required_argument, 0, 'T'},
{"language", required_argument, 0, 'X'},
{"maca_data_path", required_argument, 0, 'Y'},
{"root_label", required_argument, 0, 'R'}
{"root_label", required_argument, 0, 'R'},
{"form_col", required_argument, 0, 'O'},
{"f2p", required_argument, 0, 'P'}
};
optind = 0;
opterr = 0;
while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:R:", long_options, &option_index)) != -1){
while ((c = getopt_long (argc, argv, "dhvT:m:f:c:i:n:x:u:r:o:b:y:s:M:H:S:C:F:V:X:Y:R:O:P:", long_options, &option_index)) != -1){
switch (c)
{
case 'd':
......@@ -279,6 +296,13 @@ context *context_read_options(int argc, char *argv[])
case 'R':
ctx->root_label = strdup(optarg);
break;
case 'O':
ctx->form_column = atoi(optarg);
break;
case 'P':
ctx->f2p_filename = strdup(optarg);
ctx->f2p = form2pos_read(ctx->f2p_filename);
break;
}
}
......@@ -299,7 +323,18 @@ context *context_read_options(int argc, char *argv[])
feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);
*/
if(ctx->mcd_filename == NULL){
/* if the form column has been set by user, change it in the mcd file */
/* if(ctx->form_column != -1){
ctx->mcd_struct = mcd_new(ctx->form_column + 1);
mcd_set_form_col(ctx->mcd_struct, ctx->form_column);
ctx->mcd_struct->representation[ctx->form_column] = MCD_REPRESENTATION_VOCAB;
ctx->mcd_struct->filename[ctx->form_column] = strdup("_");
ctx->mcd_struct->dico_array[ctx->form_column] = NULL;
ctx->mcd_struct->type_str[ctx->form_column] = strdup("FORM");
}*/
if(ctx->mcd_struct == NULL){
ctx->mcd_struct = mcd_build_conll07();
}
......@@ -353,3 +388,51 @@ void context_set_linguistic_resources_filenames(context *ctx)
fprintf(stdout, "perc_features_model_filename = %s\n", ctx->features_model_filename);*/
}
/* Return a newly allocated "<dir><name>" string; exits when memory is exhausted. */
static char *context_tagger_resource_filename(const char *dir, const char *name)
{
  size_t size = strlen(dir) + strlen(name) + 1;
  char *filename = malloc(size);

  if(filename == NULL){
    fprintf(stderr, "out of memory while building a resource filename\n");
    exit(1);
  }
  snprintf(filename, size, "%s%s", dir, name);
  return filename;
}

/*
 * Give default values to the tagger resource filenames that the user did
 * not set (model, vocabularies, features model).
 *
 * Defaults live in <data path>/<language>/bin/, where <data path> is
 * ctx->maca_data_path when set and the MACAON_DIR environment variable
 * otherwise. When neither is available an error is printed and the program
 * exits (the original code passed NULL to strcat in that case).
 *
 * Paths are built in buffers of the exact required size, so long data paths
 * no longer overflow a fixed 500 byte array.
 *
 * The mcd filename is deliberately not given a default here: that part was
 * already disabled in the original code.
 */
void context_set_linguistic_resources_filenames_tagger(context *ctx)
{
  const char *data_path = ctx->maca_data_path ? ctx->maca_data_path : getenv("MACAON_DIR");
  char *bin_dir;
  size_t bin_dir_size;

  if(data_path == NULL){
    fprintf(stderr, "cannot locate linguistic resources: set MACAON_DIR or give a maca data path\n");
    exit(1);
  }

  /* <data path> + "/" + <language> + "/bin/" + terminator */
  bin_dir_size = strlen(data_path) + 1 + strlen(ctx->language) + strlen("/bin/") + 1;
  bin_dir = malloc(bin_dir_size);
  if(bin_dir == NULL){
    fprintf(stderr, "out of memory while building the resource directory name\n");
    exit(1);
  }
  snprintf(bin_dir, bin_dir_size, "%s/%s/bin/", data_path, ctx->language);

  if(!ctx->perc_model_filename){
    ctx->perc_model_filename = context_tagger_resource_filename(bin_dir, DEFAULT_MODEL_TAGGER_FILENAME);
  }

  if(!ctx->vocabs_filename){
    ctx->vocabs_filename = context_tagger_resource_filename(bin_dir, DEFAULT_VOCABS_TAGGER_FILENAME);
  }

  if(!ctx->features_model_filename){
    ctx->features_model_filename = context_tagger_resource_filename(bin_dir, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);
  }

  free(bin_dir);
}
......@@ -4,15 +4,22 @@
#define TEST_MODE 1
#define TRAIN_MODE 2
#define DEFAULT_MULTI_COL_DESC_FILENAME "maca_trans_parser.mcd"
#define DEFAULT_FEATURES_MODEL_FILENAME "maca_trans_parser.fm"
#define DEFAULT_VOCABS_FILENAME "maca_trans_parser.vocab"
#define DEFAULT_MODEL_FILENAME "maca_trans_parser.model"
#define DEFAULT_MULTI_COL_DESC_TAGGER_FILENAME "maca_trans_tagger.mcd"
#define DEFAULT_FEATURES_MODEL_TAGGER_FILENAME "maca_trans_tagger.fm"
#define DEFAULT_VOCABS_TAGGER_FILENAME "maca_trans_tagger.vocab"
#define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model"
#include "dico_vec.h"
#include "feat_model.h"
#include "mcd.h"
#include "stdlib.h"
#include "form2pos.h"
typedef struct {
int help;
......@@ -25,6 +32,7 @@ typedef struct {
char *cff_filename;
char *fann_filename;
char *stag_desc_filename;
char *f2p_filename;
int hidden_neurons_nb;
int iteration_nb;
int debug_mode;
......@@ -48,6 +56,8 @@ typedef struct {
char *maca_data_path;
char *language;
char *root_label;
int form_column;
form2pos *f2p;
} context;
context *context_new(void);
......@@ -79,5 +89,7 @@ void context_print_alphabets(context *ctx);
void context_language_help_message(context *ctx);
void context_maca_data_path_help_message(context *ctx);
void context_f2p_filename_help_message(context *ctx);
#endif
......@@ -8,6 +8,7 @@
#include"feature_table.h"
#include"dico.h"
#include"beam.h"
#include"form2pos.h"
#include"simple_decoder_tagger.h"
/*#include"dnn_decoder.h"*/
#include"config2feat_vec.h"
......@@ -22,9 +23,10 @@ void decode_help_message(context *ctx)
context_model_help_message(ctx);
context_vocabs_help_message(ctx);
context_features_model_help_message(ctx);
context_features_model_help_message(ctx);
context_language_help_message(ctx);
context_maca_data_path_help_message(ctx);
context_f2p_filename_help_message(ctx);
}
void decode_check_options(context *ctx){
......@@ -44,58 +46,17 @@ int main(int argc, char *argv[])
{
FILE *conll_file = NULL;
context *ctx;
feature_table *ft;
/* struct fann *ann; */
int root_label;
dico *dico_pos;
ctx = context_read_options(argc, argv);
decode_check_options(ctx);
ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs);
dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
/* when in stream mode, force to renumber the tokens (ugly !) */
if(ctx->stream_mode){
ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1;
}
/* load models */
if(ctx->perc_model_filename){
/* ctx->d_perceptron_features = dico_read(ctx->perceptron_features_filename, ctx->hash_ratio); */
ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
ft = feature_table_load(ctx->perc_model_filename);
/* hash_stats(dico_features->htable); */
}
/* else if(ctx->dnn_model_filename){
ann = fann_create_from_file(ctx->dnn_model_filename);
if(!ann){
fprintf(stderr, "Error creating ann --- ABORTING.\n");
return -1;
}
}
else{*/
if(ctx->conll_filename)
conll_file= myfopen(ctx->conll_filename, "r");
else
conll_file = stdin;
if(ctx->perc_model_filename){
if(ctx->beam_width == 1){
simple_decoder_tagger(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, ctx->stream_mode);
}
else
beam_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, root_label, ctx->beam_width, ctx->mvt_nb);
}
/* else if(ctx->dnn_model_filename){
dnn_decoder(conll_file, ctx->mcd_struct, ann, ctx->features_model, ctx->verbose, root_label, ctx->stream_mode);
}*/
if(ctx->beam_width == 1)
simple_decoder_tagger(ctx);
context_free(ctx);
return 0;
}
......
......@@ -41,6 +41,7 @@ int s0Y(config *c) {return (stack_nbelem(config_get_stack(c)) < 1) ? -1 : word_g
/* Z field of the word on top of the stack, -1 when the stack is empty. */
int s0Z(config *c)
{
  if(stack_nbelem(config_get_stack(c)) < 1)
    return -1;
  return word_get_Z(stack_elt_n(config_get_stack(c), 0));
}

/* U1 field of the word on top of the stack, -1 when the stack is empty. */
int s0U1(config *c)
{
  if(stack_nbelem(config_get_stack(c)) < 1)
    return -1;
  return stack_elt_n(config_get_stack(c), 0)->U1;
}

/* signature field of the word on top of the stack, -1 when the stack is empty. */
int s0sgn(config *c)
{
  if(stack_nbelem(config_get_stack(c)) < 1)
    return -1;
  return stack_elt_n(config_get_stack(c), 0)->signature;
}

/* form of the stack element of index 1, -1 when the stack holds fewer than two elements. */
int s1f(config *c)
{
  if(stack_nbelem(config_get_stack(c)) < 2)
    return -1;
  return word_get_form(stack_elt_n(config_get_stack(c), 1));
}

/* lemma of the stack element of index 1, -1 when the stack holds fewer than two elements. */
int s1l(config *c)
{
  if(stack_nbelem(config_get_stack(c)) < 2)
    return -1;
  return word_get_lemma(stack_elt_n(config_get_stack(c), 1));
}
......@@ -175,6 +176,7 @@ int b0Y(config *c) {return (queue_nbelem(config_get_buffer(c)) < 1) ? -1 : word_
/* Z field of the buffer element of index 0, -1 when the buffer is empty. */
int b0Z(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 1)
    return -1;
  return word_get_Z(queue_elt_n(config_get_buffer(c), 0));
}

/* U1 field of the buffer element of index 0, -1 when the buffer is empty. */
int b0U1(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 1)
    return -1;
  return queue_elt_n(config_get_buffer(c), 0)->U1;
}

/* signature field of the buffer element of index 0, -1 when the buffer is empty. */
int b0sgn(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 1)
    return -1;
  return queue_elt_n(config_get_buffer(c), 0)->signature;
}

/* form of the buffer element of index 1, -1 when the buffer holds fewer than two elements. */
int b1f(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 2)
    return -1;
  return word_get_form(queue_elt_n(config_get_buffer(c), 1));
}
......@@ -211,6 +213,7 @@ int b1Y(config *c) {return (queue_nbelem(config_get_buffer(c)) < 2) ? -1 : word_
/* Z field of the buffer element of index 1, -1 when the buffer holds fewer than two elements. */
int b1Z(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 2)
    return -1;
  return word_get_Z(queue_elt_n(config_get_buffer(c), 1));
}

/* U1 field of the buffer element of index 1, -1 when the buffer holds fewer than two elements. */
int b1U1(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 2)
    return -1;
  return queue_elt_n(config_get_buffer(c), 1)->U1;
}

/* signature field of the buffer element of index 1, -1 when the buffer holds fewer than two elements. */
int b1sgn(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 2)
    return -1;
  return queue_elt_n(config_get_buffer(c), 1)->signature;
}

/* form of the buffer element of index 2, -1 when the buffer holds fewer than three elements. */
int b2f(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 3)
    return -1;
  return word_get_form(queue_elt_n(config_get_buffer(c), 2));
}

/* lemma of the buffer element of index 2, -1 when the buffer holds fewer than three elements. */
int b2l(config *c)
{
  if(queue_nbelem(config_get_buffer(c)) < 3)
    return -1;
  return word_get_lemma(queue_elt_n(config_get_buffer(c), 2));
}
......
......@@ -40,6 +40,7 @@ int s0Y(config *c);
int s0Z(config *c);
int s0U1(config *c);
int s0sgn(config *c);
int s0r(config *c);
......@@ -181,7 +182,9 @@ int b0X(config *c);
int b0Y(config *c);
int b0Z(config *c);
int b0r(config *c);
int b0U1(config *c);
int b0sgn(config *c);
int b1f(config *c);
int b1l(config *c);
......@@ -215,7 +218,9 @@ int b1W(config *c);
int b1X(config *c);
int b1Y(config *c);
int b1Z(config *c);
int b1U1(config *c);
int b1sgn(config *c);
int b1r(config *c);
......
......@@ -68,6 +68,7 @@ feat_lib *feat_lib_build(void)
feat_lib_add(fl, FEAT_TYPE_Y, (char *)"s0Z", s0Z);
feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"s0U1", s0U1);
feat_lib_add(fl, FEAT_TYPE_INT, (char *)"s0sgn", s0sgn);
feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"s1f", s1f);
......@@ -213,6 +214,7 @@ feat_lib *feat_lib_build(void)
feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b0Z", b0Z);
feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b0U1", b0U1);
feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b0sgn", b0sgn);
feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b1f", b1f);
feat_lib_add(fl, FEAT_TYPE_LEMMA, (char *)"b1l", b1l);
......@@ -249,6 +251,7 @@ feat_lib *feat_lib_build(void)
feat_lib_add(fl, FEAT_TYPE_Y, (char *)"b1Z", b1Z);
feat_lib_add(fl, FEAT_TYPE_INT_3, (char *)"b1U1", b1U1);
feat_lib_add(fl, FEAT_TYPE_INT, (char *)"b1sgn", b1sgn);
feat_lib_add(fl, FEAT_TYPE_FORM, (char *)"b2f", b2f);
......
......@@ -13,6 +13,19 @@
#include"word_emb.h"
#include"config2feat_vec.h"
/*
 * Store in every word of the queue bf the signature code that the lexicon
 * f2p associates with the word's form (the result of
 * form2pos_get_signature, whatever it is for forms absent from the lexicon).
 */
void add_signature_to_words_in_queue(queue *bf, form2pos *f2p)
{
  int position = 0;
  word *current;

  while(position < queue_nbelem(bf)){
    current = queue_elt_n(bf, position);
    current->signature = form2pos_get_signature(f2p, current->form);
    position++;
  }
}
void maca_trans_parser_conll2cff_help_message(context *ctx)
{
context_general_help_message(ctx);
......@@ -94,6 +107,11 @@ void generate_training_file_buffer(FILE *output_file, context *ctx)
/* sentence_print(stdout, ref, NULL); */
queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);
queue_remove(c->bf); /* get rid of dummy token */
if(ctx->f2p)
add_signature_to_words_in_queue(c->bf, ctx->f2p);
while(!config_is_terminal(c)){