Commit e20a7679 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

fixing stream mode of trans_tagger and trans_parser

parent 4a73fa70
......@@ -20,16 +20,29 @@
#define mcd_get_form_col(m) (m)->type[FEAT_TYPE_FORM]
#define mcd_set_form_col(m, v) (m)->type[FEAT_TYPE_FORM] = (v)
/* mcd (multi column description) files describe the format of corpus files */
/* every line of an mcd file describes the content of a column of the corpus file */
/* every line contains four fields separated by a space character */
/* first field is the index of the column described (first column corresponds to index zero) */
/* second field is the name of the column. It must be taken from the following list: */
/* INDEX, FORM, LEMMA, CPOS, POS, FEAT, LABEL, STAG, INT, GOV, A ... Z */
/* third field corresponds to the internal representation of the tokens found in the column described. Four values are possible: */
/* VOCAB if the internal representation is an integer code corresponding to the token */
/* INT if the token is already an integer and its corresponding internal value is the same integer */
/* EMB if the internal representation of the token is a real valued vector. */
/* _ if no internal representation is associated to the field */
/* fourth field is the name of a file in which the encoding is represented; this file can be either a dico format file (see dico.h) or an embedding file (see word_emb.h) */
/* in-memory form of an mcd file: per-column arrays describing each column of the corpus file */
typedef struct {
int nb_col;
int type2col[FEAT_TYPE_NB];
int nb_col; /* number of columns in the mcd file */
int type2col[FEAT_TYPE_NB]; /* for every feature type (FEAT_TYPE_FORM, lemma, ...) the column in which it is represented; mcd_new initialises every entry to -1 */
/* int *col2type; */
int *type;
char **type_str;
int *representation;
char **filename;
dico **dico_array;
word_emb **word_emb_array;
int *type; /* array containing the type of every column */
char **type_str; /* a string version of array type */
int *representation; /* array containing the representation mode of every column (integer, vocabulary, embedding, NULL) */
char **filename; /* array containing, for every column, the file in which its different values are represented */
dico **dico_array; /* array containing the dico corresponding to each column (NULL if no file) */
word_emb **word_emb_array; /* array containing the word embedding structure corresponding to each column (NULL if no file) */
} mcd;
mcd *mcd_build_conll07(void);
......
......@@ -17,12 +17,12 @@ mcd *mcd_new(int nb_col)
for(i=0; i < FEAT_TYPE_NB; i++)
m->type2col[i] = -1;
m->representation = (int *)memalloc(nb_col * sizeof(int));
m->type = (int *)memalloc(nb_col * sizeof(int));
m->type_str = (char **)memalloc(nb_col * sizeof(char *));
m->filename = (char **)memalloc(nb_col * sizeof(char *));
m->dico_array = (dico **)memalloc(nb_col * sizeof(dico *));
m->word_emb_array = (word_emb **)memalloc(nb_col * sizeof(word_emb *));
m->representation = (int *) memalloc(nb_col * sizeof(int));
m->type = (int *) memalloc(nb_col * sizeof(int));
m->type_str = (char **) memalloc(nb_col * sizeof(char *));
m->filename = (char **) memalloc(nb_col * sizeof(char *));
m->dico_array = (dico **) memalloc(nb_col * sizeof(dico *));
m->word_emb_array = (word_emb **) memalloc(nb_col * sizeof(word_emb *));
for(i=0; i < nb_col; i++){
m->representation[i] = MCD_REPRESENTATION_NULL;
......@@ -52,6 +52,10 @@ void mcd_free(mcd *m)
free(m);
}
/* this function is used when reading a corpus file whose structure is described in mcd m */
/* it returns the code associated to string str found in column col */
/* the code depends on the way the column is represented (vocabulary, embedding or integer) */
int mcd_get_code(mcd *m, char *str, int col){
if(m->representation[col] == MCD_REPRESENTATION_VOCAB)
return dico_string2int(m->dico_array[col], str);
......@@ -62,6 +66,8 @@ int mcd_get_code(mcd *m, char *str, int col){
return MCD_INVALID_VALUE;
}
/* look for the number of columns in an mcd file */
int mcd_max_column_index_in_file(char *mcd_filename)
{
int max_col = -1;
......@@ -183,6 +189,7 @@ mcd *mcd_read(char *mcd_filename, int verbose)
return m;
}
/* builds an mcd corresponding to the conll07 format */
mcd *mcd_build_conll07(void)
{
......@@ -238,6 +245,8 @@ mcd *mcd_build_conll07(void)
return m;
}
/* builds an mcd corresponding to the ifpls (index, form, pos, lemma, syntax) format */
mcd *mcd_build_ifpls(void)
{
mcd *m = mcd_new(6);
......@@ -350,6 +359,7 @@ mcd *mcd_read_old(char *mcd_filename, char *corpus_filename, dico_vec *vocabs)
return m;
}
/* returns a dico_vec containing the different dictionaries found in an mcd structure */
dico_vec *mcd_build_dico_vec(mcd *mcd_struct)
{
......
......@@ -33,8 +33,8 @@ context *context_new(void)
ctx->mcd_struct = NULL;
ctx->language = strdup("fr");
ctx->maca_data_path = NULL;
ctx->form_column = 1;
ctx->pos_column = 2;
ctx->form_column = -1;
ctx->pos_column = -1;
return ctx;
}
......@@ -64,10 +64,10 @@ void context_fplm_help_message(context *ctx){
fprintf(stderr, "\t-f --fplm <file> : fplm (form pos lemma morpho) file\n");
}
void context_mcd_help_message(context *ctx){
fprintf(stderr, "\t-m --mcd <file> : multi column description file name\n");
fprintf(stderr, "\t-C --mcd <file> : multi column description file name\n");
}
void context_language_help_message(context *ctx){
fprintf(stderr, "\t-C --language : identifier of the language to use\n");
fprintf(stderr, "\t-L --language : identifier of the language to use\n");
}
void context_maca_data_path_help_message(context *ctx){
fprintf(stderr, "\t-M --maca_data_path : path to maca_data directory\n");
......@@ -87,7 +87,7 @@ context *context_read_options(int argc, char *argv[])
{"verbose", no_argument, 0, 'v'},
{"debug", no_argument, 0, 'd'},
{"conll", required_argument, 0, 'i'},
{"mcd", required_argument, 0, 'm'},
{"mcd", required_argument, 0, 'C'},
{"language", required_argument, 0, 'L'},
{"fplm", required_argument, 0, 'f'},
{"form_column", required_argument, 0, 'F'},
......@@ -97,7 +97,7 @@ context *context_read_options(int argc, char *argv[])
optind = 0;
opterr = 0;
while ((c = getopt_long (argc, argv, "hvdi:f:m:L:M:F:D:", long_options, &option_index)) != -1){
while ((c = getopt_long (argc, argv, "hvdi:f:C:L:M:F:D:P:", long_options, &option_index)) != -1){
switch (c)
{
case 'd':
......@@ -121,7 +121,7 @@ context *context_read_options(int argc, char *argv[])
case 'i':
ctx->conll_filename = strdup(optarg);
break;
case 'm':
case 'C':
ctx->mcd_filename = strdup(optarg);
break;
case 'L':
......
......@@ -109,12 +109,12 @@ int main(int argc, char *argv[])
else
pos_column = ctx->mcd_struct->type2col[FEAT_TYPE_POS];
if(ctx->form_column != -1)
form_column = ctx->form_column;
else
form_column = ctx->mcd_struct->type2col[FEAT_TYPE_FORM];
if(ctx->conll_filename == NULL)
f = stdin;
else
......@@ -141,8 +141,9 @@ int main(int argc, char *argv[])
if(column_nb == form_column)
form = strdup(token);
/* if((column_nb < ctx->mcd_struct->nb_col) && (column_nb == pos_column)) */
if(column_nb == pos_column)
if(column_nb == pos_column){
pos = strdup(token);
}
column_nb++;
} while((token = strtok(NULL , "\t")));
......
......@@ -22,6 +22,7 @@ void look_for_number_of_features_and_classes(char *filename, int *max_feat, int
}
}
*max_feat = *max_feat + 1;
*max_class = *max_class + 1;
fclose(f);
......
......@@ -23,17 +23,18 @@ config *config_new(FILE *f, mcd *mcd_struct)
return c;
}
void config_add_next_word_to_buffer(config *c)
word *config_add_next_word_to_buffer(config *c)
{
word *w = NULL;
w = word_read(c->f, c->mcd_struct);
if(w == NULL) return;
if(w == NULL) return NULL;
if(word_get_index(w) == -1){
w->feat_array[FEAT_TYPE_INDEX] = c->current_index++;
printf("current index = %d\n", c->current_index);
/* printf("current index = %d\n", c->current_index); */
}
queue_add(c->bf, w);
return w;
}
void config_free(config *c)
......@@ -64,6 +65,17 @@ config *config_initial(FILE *f, mcd *mcd_struct, int lookahead)
return c;
}
/* creates a new configuration on stream f (format described by mcd_struct) */
/* and pre-loads its buffer by calling config_add_next_word_to_buffer lookahead times; */
/* no other word is placed in the configuration here */
/* fewer than lookahead words end up in the buffer when the stream runs out of words */
config *config_initial_no_dummy_word(FILE *f, mcd *mcd_struct, int lookahead)
{
  config *c = config_new(f, mcd_struct);
  int remaining;

  for(remaining = lookahead; remaining > 0; remaining--){
    config_add_next_word_to_buffer(c);
  }

  return c;
}
config *config_copy(config *o)
{
int i;
......
......@@ -34,12 +34,13 @@ int config_equal(config *c1, config *c2);
int config_equal2(config *c1, config *c2);
config *config_new(FILE *f, mcd *mcd_struct);
config *config_initial(FILE *f, mcd *mcd_struct, int lookahead);
config *config_initial_no_dummy_word(FILE *f, mcd *mcd_struct, int lookahead);
config *config_copy(config *o);
void config_print(FILE *buffer, config *c);
int config_is_terminal(config *c);
void config_free(config *c);
void config_add_mvt(config *c, int mvt);
void config_add_next_word_to_buffer(config *c);
word *config_add_next_word_to_buffer(config *c);
void config_connect_subtrees(config *c, int root_label);
......
......@@ -26,9 +26,10 @@ void context_free(context *ctx)
if(ctx->d_perceptron_features)
dico_free(ctx->d_perceptron_features);
/*
if(ctx->mcd_struct)
mcd_free(ctx->mcd_struct);
*/
if(ctx->features_model)
feat_model_free(ctx->features_model);
......@@ -88,11 +89,12 @@ void context_general_help_message(context *ctx)
{
fprintf(stderr, "usage: %s [options]\n", ctx->program_name);
fprintf(stderr, "Options:\n");
fprintf(stderr, "\t-h --help : print this message\n");
fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n");
fprintf(stderr, "\t-h --help : print this message\n");
fprintf(stderr, "\t-v --verbose : activate verbose mode\n");
fprintf(stderr, "\t-r --hratio <float> : set the occupation ratio of hash tables (default is 0.5)\n");
fprintf(stderr, "\t-D --maca_data_path <str> : path to the maca_data directory\n");
fprintf(stderr, "\t-L --language <str> : identifier of the language to use (default is fr)\n");
fprintf(stderr, "\t-L --language <str> : identifier of the language to use (default is fr)\n");
fprintf(stderr, "\t-S --stream : stream mode\n");
}
void context_model_help_message(context *ctx){
......
......@@ -91,5 +91,6 @@ void context_f2p_filename_help_message(context *ctx);
void context_conll_help_message(context *ctx);
void context_ifpls_help_message(context *ctx);
void context_input_help_message(context *ctx);
void context_root_label_help_message(context *ctx);
#endif
......@@ -25,6 +25,7 @@ void decode_help_message(context *ctx)
context_model_help_message(ctx);
context_vocabs_help_message(ctx);
context_features_model_help_message(ctx);
context_root_label_help_message(ctx);
}
void decode_check_options(context *ctx){
......
......@@ -81,7 +81,20 @@ void depset_print2(FILE *f, depset *d, dico *dico_labels)
fprintf(f, "%s\t%d\t%s\n", d->array[i].dep->input, word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label));
}
}
fprintf(f, "\n");
/* fprintf(f, "\n"); */
}
/* prints on f one line per dependency stored in d, starting at position 1 */
/* every line holds four tab separated fields: */
/* index of the dependent, input text of the dependent, index of the governor, label string */
/* positions for which the governor or the dependent is missing are skipped */
/* no blank line is written after the last dependency */
void depset_print3(FILE *f, depset *d, dico *dico_labels)
{
  int pos = 1;

  while(pos < d->length){
    if(!(d->array[pos].gov) || !(d->array[pos].dep)){
      pos++;
      continue;
    }
    fprintf(f, "%d\t%s\t%d\t%s\n",
            word_get_index(d->array[pos].dep),
            d->array[pos].dep->input,
            word_get_index(d->array[pos].gov),
            dico_int2string(dico_labels, d->array[pos].label));
    pos++;
  }
}
char *skip_index(char *buffer)
......@@ -100,7 +113,8 @@ void depset_print_new_index(FILE *f, depset *d, dico *dico_labels)
for(i=1; i < d->length; i++){
if((d->array[i].gov) && (d->array[i].dep)){
fprintf(f, "%d", word_get_index(d->array[i].dep));
/* fprintf(f, "%d\t", word_get_index(d->array[i].dep)); */
fprintf(f, "%d\t", word_get_index(d->array[i].dep));
fprintf(f, "%s\t%d\t%s\n", skip_index(d->array[i].dep->input), word_get_index(d->array[i].gov), dico_int2string(dico_labels, d->array[i].label));
}
}
......
......@@ -24,6 +24,7 @@ void depset_init(depset *d);
void depset_add(depset *d, word *gov, int label, word *dep);
void depset_print(FILE *f, depset *d);
void depset_print2(FILE *f, depset *d, dico *dico_labels);
void depset_print3(FILE *f, depset *d, dico *dico_labels);
void depset_print_new_index(FILE *f, depset *d, dico *dico_labels);
......
......@@ -113,9 +113,11 @@ int feat_model_get_feat_value_cff(feat_model *fm, config *c, dico *dico_features
catenate_int(fm->string, feat_val);
}
if(mode == LOOKUP_MODE)
if(mode == LOOKUP_MODE){
if(fm->string)
/* printf("fmstring = %s\n", fm->string); */
return dico_string2int(dico_features, fm->string);
}
return dico_add(dico_features, fm->string);
}
......
......@@ -150,7 +150,8 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max)
for(feat=0; feat < fv->nb; feat++){
for(cla=0; cla < classes_nb; cla++){
if(fv->t[feat] != -1){
if((fv->t[feat] != -1) && (fv->t[feat] < ft->features_nb)){
/* if(fv->t[feat] != -1){ */
/* printf("feat score = %f\n", ft->table[fv->t[feat]][cla]); */
classes_score[cla] += ft->table[fv->t[feat]][cla];
}
......
......@@ -36,7 +36,7 @@ void maca_trans_parser_conll2cff_check_options(context *ctx)
if(!ctx->input_filename
|| ctx->help
/* || !ctx->mcd_filename */
|| !(ctx->cff_filename || ctx->fann_filename)
/* || !(ctx->cff_filename || ctx->fann_filename) */
){
maca_trans_parser_conll2cff_help_message(ctx);
exit(1);
......
......@@ -20,8 +20,10 @@ void add_signature_to_words_in_queue(queue *bf, form2pos *f2p)
for(i=0; i < queue_nbelem(bf); i++){
w = queue_elt_n(bf, i);
/* printf("add signature %d to word %s\n", form2pos_get_signature(f2p, w->form), w->form); */
w->signature = form2pos_get_signature(f2p, w->form);
if(!w->signature){
/* printf("add signature %d to word %s\n", form2pos_get_signature(f2p, w->form), w->form); */
w->signature = form2pos_get_signature(f2p, w->form);
}
}
}
......@@ -61,27 +63,20 @@ void generate_training_file_stream(FILE *output_file, context *ctx)
{
config *c;
feat_vec *fv = feat_vec_new(feature_types_nb);
sentence *ref = NULL;
int sentence_nb = 0;
FILE *conll_file = myfopen(ctx->input_filename, "r");
FILE *conll_file_ref = myfopen(ctx->input_filename, "r");
int postag;
c = config_initial(conll_file, ctx->mcd_struct, 5);
while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
/* sentence_print(stdout, ref, mcd_get_dico_label(ctx->mcd_struct)); */
while(1){
/* config_print(stdout,c); */
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
postag = oracle_tagger(c, ref);
c = config_initial_no_dummy_word(conll_file, ctx->mcd_struct, 5);
fprintf(output_file, "%d", postag);
feat_vec_print(output_file, fv);
if(postag != -1)
movement_tagger(c, postag, 0, 1);
}
while(!config_is_terminal(c)){
/* config_print(stdout,c); */
if(ctx->f2p)
add_signature_to_words_in_queue(c->bf, ctx->f2p);
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
postag = oracle_tagger(c, NULL);
fprintf(output_file, "%d", postag);
feat_vec_print(output_file, fv);
movement_tagger(c, postag, 0, 1);
}
}
......
......@@ -7,15 +7,19 @@
int movement_tagger(config *c, int postag, float score, int stream)
{
word *b0 = NULL;
int k = 5;
if(queue_is_empty(c->bf)) return 0;
b0 = queue_elt_n(c->bf, 0);
b0 = queue_remove(c->bf);
word_set_pos(b0, postag);
stack_push(c->st, queue_remove(c->bf));
stack_push(c->st, b0);
/* in stream mode, read a new word and add it to the buffer */
if(stream)
config_add_next_word_to_buffer(c);
/* in stream mode, read a new word, add it to the buffer and keep only k (= 5) elements in the stack */
if(stream){
stack_trim_to_size(c->st, k);
config_add_next_word_to_buffer(c);
}
return 1;
}
......@@ -3,12 +3,15 @@
int oracle_tagger(config *c, sentence *ref)
{
word *b0; /* next word in the bufer */
int b0_index;
/* int b0_index; */
int b0_pos;
if(!queue_is_empty(c->bf)){
b0 = queue_elt_n(c->bf, 0);
b0_index = word_get_index(b0);
return word_get_pos(ref->words[b0_index]);
b0_pos = word_get_pos(b0);
/* printf("b0_pos = %d\n", b0_pos); */
/* b0_index = word_get_index(b0); */
/* return word_get_pos(ref->words[b0_index]); */
return b0_pos;
}
return -1;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"feat_fct.h"
#include"feature_table.h"
#include"config2feat_vec.h"
#include"util.h"
void perceptron_avg(char *filename, feature_table *ft, int n_iter)
{
......@@ -18,7 +17,8 @@ void perceptron_avg(char *filename, feature_table *ft, int n_iter)
int epoch;
int i,j;
float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));
feat_vec *fv = feat_vec_new(feature_types_nb);
/* feat_vec *fv = feat_vec_new(feature_types_nb); */
feat_vec *fv = feat_vec_new(1);
char *token;
feature_table *ft_sum = feature_table_new(ft->features_nb, ft->classes_nb);
int counter = 1;
......@@ -100,7 +100,8 @@ void perceptron(char *filename, feature_table *ft, int n_iter)
int epoch;
int i;
float *classes_score = (float *)memalloc(ft->classes_nb * sizeof(float));
feat_vec *fv = feat_vec_new(feature_types_nb);
/* feat_vec *fv = feat_vec_new(feature_types_nb); */
feat_vec *fv = feat_vec_new(1);
char *token;
for(epoch = 0; epoch < n_iter; epoch++){
......
......@@ -28,7 +28,8 @@ int queue_read_sentence(queue *bf, FILE *f, mcd *mcd_struct)
}
queue_add(bf, w);
}
return bf->nbelem - 1; /* because of the dummy word */
/* return bf->nbelem - 1; */ /* because of the dummy word */
return bf->nbelem ;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment