Skip to content
Snippets Groups Projects
Commit d865188d authored by Alexis Nasr
Browse files

added a transition based tagger

parent 18b96fe3
No related branches found
No related tags found
No related merge requests found
...@@ -2,12 +2,15 @@ set(SOURCES src/context.c ...@@ -2,12 +2,15 @@ set(SOURCES src/context.c
src/feat_desc.c src/feat_desc.c
src/feature_table.c src/feature_table.c
src/movement.c src/movement.c
src/movement_tagger.c
src/sentence.c src/sentence.c
src/feat_fct.c src/feat_fct.c
src/feat_vec.c src/feat_vec.c
src/global_feat_vec.c src/global_feat_vec.c
src/oracle.c src/oracle.c
src/oracle_tagger.c
src/simple_decoder.c src/simple_decoder.c
src/simple_decoder_tagger.c
src/cf_file.c src/cf_file.c
src/feat_lib.c src/feat_lib.c
src/perceptron.c src/perceptron.c
...@@ -27,6 +30,11 @@ add_library(transparse STATIC ${SOURCES}) ...@@ -27,6 +30,11 @@ add_library(transparse STATIC ${SOURCES})
#compiling, linking and installing executables #compiling, linking and installing executables
add_executable(maca_trans_parser_conll2cff_tagger ./src/maca_trans_parser_conll2cff_tagger.c)
target_link_libraries(maca_trans_parser_conll2cff_tagger transparse)
target_link_libraries(maca_trans_parser_conll2cff_tagger maca_common)
install (TARGETS maca_trans_parser_conll2cff_tagger DESTINATION bin)
add_executable(maca_trans_parser_conll2fann ./src/maca_trans_parser_conll2fann.c) add_executable(maca_trans_parser_conll2fann ./src/maca_trans_parser_conll2fann.c)
target_link_libraries(maca_trans_parser_conll2fann transparse) target_link_libraries(maca_trans_parser_conll2fann transparse)
target_link_libraries(maca_trans_parser_conll2fann maca_common) target_link_libraries(maca_trans_parser_conll2fann maca_common)
...@@ -42,6 +50,11 @@ target_link_libraries(maca_trans_parser transparse) ...@@ -42,6 +50,11 @@ target_link_libraries(maca_trans_parser transparse)
target_link_libraries(maca_trans_parser maca_common) target_link_libraries(maca_trans_parser maca_common)
install (TARGETS maca_trans_parser DESTINATION bin) install (TARGETS maca_trans_parser DESTINATION bin)
add_executable(maca_trans_tagger ./src/decode_tagger.c)
target_link_libraries(maca_trans_tagger transparse)
target_link_libraries(maca_trans_tagger maca_common)
install (TARGETS maca_trans_tagger DESTINATION bin)
add_executable(maca_trans_parser_train ./src/train_perceptron.c) add_executable(maca_trans_parser_train ./src/train_perceptron.c)
target_compile_options(maca_trans_parser_train INTERFACE -Wall) target_compile_options(maca_trans_parser_train INTERFACE -Wall)
target_link_libraries(maca_trans_parser_train transparse) target_link_libraries(maca_trans_parser_train transparse)
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"feat_fct.h"
#include"feature_table.h"
#include"dico.h"
#include"beam.h"
#include"simple_decoder_tagger.h"
/*#include"dnn_decoder.h"*/
#include"config2feat_vec.h"
/*
 * Print the usage text of the tagger decoder.
 *
 * The text is assembled by calling one help routine per option group, in the
 * order below, with an "INPUT" heading written to stderr between the beam
 * options and the input-related options.
 *
 * ctx: option context forwarded unchanged to every help routine.
 *
 * NOTE(review): the context_*_help_message helpers are defined elsewhere;
 * presumably they also write to stderr -- confirm.
 */
void decode_help_message(context *ctx)
{
context_general_help_message(ctx);
context_beam_help_message(ctx);
fprintf(stderr, "INPUT\n");
context_conll_help_message(ctx);
context_mcd_help_message(ctx);
context_model_help_message(ctx);
context_vocabs_help_message(ctx);
context_features_model_help_message(ctx);
context_language_help_message(ctx);
context_maca_data_path_help_message(ctx);
}
/*
 * Validate the decoder options.
 *
 * The only condition checked is an explicit help request: in that case the
 * usage text is printed and the process exits with status 1.  No file name
 * option (conll, model, mcd, vocabs, features model) is required here; the
 * corresponding checks were disabled in the original code.
 *
 * ctx: option context; only ctx->help is read.
 */
void decode_check_options(context *ctx){
  if(!ctx->help)
    return;

  decode_help_message(ctx);
  exit(1);
}
/*
 * Entry point of the transition-based tagger decoder.
 *
 * Steps:
 *   1. read and check the command-line options;
 *   2. load the vocabularies, link them to the column description and fetch
 *      the "POS" dictionary;
 *   3. if a perceptron model file name is set, fetch the feature dictionary
 *      and load the feature table;
 *   4. open the CoNLL input (stdin when no file name is given);
 *   5. run the greedy tagger when the beam width is 1, the beam decoder
 *      otherwise.
 *
 * Nothing is decoded when no perceptron model file name is set (DNN decoding
 * is disabled in this program).
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  FILE *conll_file = NULL;
  context *ctx;
  /* NULL until a model is loaded, so the pointer never holds an
     indeterminate value */
  feature_table *ft = NULL;
  /* No root label is resolved in this program.  The variable used to be
     passed to beam_decoder() uninitialized, which is undefined behaviour;
     -1 is used as an explicit "no label" placeholder.
     NOTE(review): confirm which value beam_decoder() expects for a tagger. */
  int root_label = -1;
  dico *dico_pos;

  ctx = context_read_options(argc, argv);
  decode_check_options(ctx);

  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs);

  dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");

  /* when in stream mode, force to renumber the tokens (ugly !) */
  if(ctx->stream_mode){
    ctx->mcd_struct->type[ctx->mcd_struct->type2col[FEAT_TYPE_INDEX]] = -1;
  }

  /* load models */
  if(ctx->perc_model_filename){
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
    ft = feature_table_load(ctx->perc_model_filename);
  }

  /* open the input: a named CoNLL file, or stdin by default */
  if(ctx->conll_filename)
    conll_file = myfopen(ctx->conll_filename, "r");
  else
    conll_file = stdin;

  if(ctx->perc_model_filename){
    if(ctx->beam_width == 1){
      simple_decoder_tagger(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, ctx->stream_mode);
    }
    else
      beam_decoder(conll_file, ctx->mcd_struct, ctx->d_perceptron_features, dico_pos, ft, ctx->features_model, ctx->verbose, root_label, ctx->beam_width, ctx->mvt_nb);
  }

  context_free(ctx);
  return 0;
}
...@@ -144,14 +144,18 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max) ...@@ -144,14 +144,18 @@ int feature_table_argmax(feat_vec *fv, feature_table *ft, float *max)
int classes_nb = ft->classes_nb; int classes_nb = ft->classes_nb;
int feat; int feat;
/* printf("feat tabl argmax classes nb = %d\n", classes_nb); */
for(cla=0; cla < classes_nb; cla++) classes_score[cla] = 0; for(cla=0; cla < classes_nb; cla++) classes_score[cla] = 0;
for(feat=0; feat < fv->nb; feat++){ for(feat=0; feat < fv->nb; feat++){
for(cla=0; cla < classes_nb; cla++){ for(cla=0; cla < classes_nb; cla++){
if(fv->t[feat] != -1) if(fv->t[feat] != -1){
/* printf("feat score = %f\n", ft->table[fv->t[feat]][cla]); */
classes_score[cla] += ft->table[fv->t[feat]][cla]; classes_score[cla] += ft->table[fv->t[feat]][cla];
} }
} }
}
argmax = 0; argmax = 0;
*max = classes_score[0]; *max = classes_score[0];
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement_tagger.h"
#include"oracle.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"corpus.h"
#include"word_emb.h"
#include"config2feat_vec.h"
/*
 * Print the usage text of the conll-to-cff converter for the tagger.
 *
 * Option groups are printed by dedicated help routines, separated by the
 * headings "INPUT", "IN TEST MODE", "OUTPUT" and "IN TRAIN MODE", which are
 * written to stderr.  The alphabet help is printed twice, once under the
 * test-mode heading and once under the train-mode heading.
 *
 * ctx: option context forwarded unchanged to every help routine.
 *
 * NOTE(review): the context_*_help_message helpers are defined elsewhere;
 * presumably they also write to stderr -- confirm.
 */
void maca_trans_parser_conll2cff_help_message(context *ctx)
{
context_general_help_message(ctx);
context_mode_help_message(ctx);
context_sent_nb_help_message(ctx);
fprintf(stderr, "INPUT\n");
context_conll_help_message(ctx);
fprintf(stderr, "IN TEST MODE\n");
context_alphabet_help_message(ctx);
fprintf(stderr, "OUTPUT\n");
context_cff_help_message(ctx);
fprintf(stderr, "IN TRAIN MODE\n");
context_alphabet_help_message(ctx);
}
/*
 * Validate the converter options.
 *
 * The options are accepted when a CoNLL input file name is set, help was not
 * requested, and at least one of the cff / fann output file names is set.
 * Otherwise the usage text is printed and the process exits with status 1.
 * The mcd file name is not required (that check was disabled in the
 * original code).
 *
 * ctx: option context; reads conll_filename, help, cff_filename and
 *      fann_filename.
 */
void maca_trans_parser_conll2cff_check_options(context *ctx)
{
  int has_output = (ctx->cff_filename || ctx->fann_filename);

  if(ctx->conll_filename && !ctx->help && has_output)
    return;

  maca_trans_parser_conll2cff_help_message(ctx);
  exit(1);
}
/*
 * Write tagger training examples in stream mode.
 *
 * The CoNLL file is opened twice: once as the word stream that feeds the
 * configuration, once to read the reference sentences.  For every step the
 * feature vector of the current configuration is computed, the reference POS
 * tag of the first buffer word is asked from the oracle, and one line made
 * of the tag code followed by the feature vector is written to output_file.
 * The tag is then applied to the configuration, which also pulls the next
 * word into the buffer.
 *
 * output_file: destination of the training examples.
 * ctx:         option context (conll_filename, mcd_struct, features_model,
 *              d_perceptron_features, mode and sent_nb are read).
 *
 * Changes with respect to the previous version:
 *   - the inner loop used to be an unconditional while(1); it now stops as
 *     soon as the oracle has no tag to propose (-1) or no movement can be
 *     applied, and no example is written for a -1 tag;
 *   - sentence_nb is incremented, so the sent_nb limit takes effect;
 *   - unused locals were removed; the configuration and both input files
 *     are released before returning.
 *
 * NOTE(review): nothing here detects sentence boundaries in the word
 * stream, so the same reference sentence keeps being used until the oracle
 * returns -1.  Stream-mode generation therefore still needs a proper
 * end-of-sentence test before it can be relied upon.
 */
void generate_training_file_stream(FILE *output_file, context *ctx)
{
  config *c;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  sentence *ref = NULL;
  int sentence_nb = 0;
  FILE *conll_file = myfopen(ctx->conll_filename, "r");
  FILE *conll_file_ref = myfopen(ctx->conll_filename, "r");
  int postag;

  c = config_initial(conll_file, ctx->mcd_struct, 10, 5);

  while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
    while(1){
      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);

      postag = oracle_tagger(c, ref);
      /* -1: the buffer is empty or the reference carries no usable tag */
      if(postag == -1)
        break;

      fprintf(output_file, "%d", postag);
      feat_vec_print(output_file, fv);

      /* movement_tagger() returns 0 when it could not move a word */
      if(!movement_tagger(c, postag, 0, 1))
        break;
    }
    sentence_nb++;
  }

  /* config_free() is followed by a reuse of the same stream in the buffer
     variant of this function, so closing the stream afterwards is safe */
  config_free(c);
  fclose(conll_file);
  fclose(conll_file_ref);
}
/*
 * Write tagger training examples, one whole sentence at a time.
 *
 * The CoNLL file is opened twice: once to fill the buffer of the
 * configuration, once to read the reference sentences.  For each reference
 * sentence (up to ctx->sent_nb of them) the same sentence is loaded into the
 * buffer, its dummy first token is dropped, and until the configuration is
 * terminal one line "tag code + feature vector" is written to output_file
 * before the reference tag is applied to the first buffer word.
 *
 * output_file: destination of the training examples.
 * ctx:         option context (conll_filename, mcd_struct, features_model,
 *              d_perceptron_features, mode and sent_nb are read).
 *
 * Changes with respect to the previous version:
 *   - when the oracle returns -1 the configuration was left untouched, so
 *     a non-terminal configuration made the loop spin forever; the rest of
 *     the sentence is now abandoned instead, and no example is written for
 *     a -1 tag;
 *   - unused locals were removed; the last configuration and both input
 *     files are released before returning.
 */
void generate_training_file_buffer(FILE *output_file, context *ctx)
{
  config *c;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  sentence *ref = NULL;
  int sentence_nb = 0;
  FILE *conll_file = myfopen(ctx->conll_filename, "r");
  FILE *conll_file_ref = myfopen(ctx->conll_filename, "r");
  int postag;

  c = config_initial(conll_file, ctx->mcd_struct, 1000, 0);

  while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
    queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);
    queue_remove(c->bf); /* get rid of dummy token */

    while(!config_is_terminal(c)){
      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);

      postag = oracle_tagger(c, ref);
      /* no tag to learn from: stop here rather than loop on the same
         configuration */
      if(postag == -1)
        break;

      fprintf(output_file, "%d", postag);
      feat_vec_print(output_file, fv);

      /* movement_tagger() returns 0 when it could not move a word */
      if(!movement_tagger(c, postag, 0, 0))
        break;
    }

    /* start the next sentence from a fresh configuration */
    config_free(c);
    c = config_initial(conll_file, ctx->mcd_struct, 1000, 0);
    sentence_nb++;
  }

  config_free(c);
  fclose(conll_file);
  fclose(conll_file_ref);
}
/*
 * Entry point of the conll-to-cff converter for the tagger.
 *
 * In train mode the vocabularies are extracted from the corpus and a new
 * feature dictionary is created; in test mode both are read back from the
 * vocabulary file.  The training examples are then written to the cff file
 * (stdout when no cff file name is given), in stream or buffer mode, and in
 * train mode the vocabularies are saved afterwards.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
  FILE *output_file;
  context *ctx = context_read_options(argc, argv);

  maca_trans_parser_conll2cff_check_options(ctx);

  /* vocabularies: built from the corpus (train) or loaded (test) */
  if(ctx->mode == TRAIN_MODE){
    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->conll_filename);
    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
  }
  else if(ctx->mode == TEST_MODE){
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs);
  }

  feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);

  /* perceptron feature dictionary: created (train) or looked up (test) */
  if(ctx->mode == TRAIN_MODE){
    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
  }
  else if(ctx->mode == TEST_MODE){
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
  }

  /* the feature dictionary is registered in the dico vector in both modes */
  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);

  /* examples go to the cff file when one is named, to stdout otherwise */
  output_file = ctx->cff_filename ? myfopen(ctx->cff_filename, "w") : stdout;

  if(ctx->stream_mode){
    generate_training_file_stream(output_file, ctx);
  }
  else{
    generate_training_file_buffer(output_file, ctx);
  }

  /* only train mode produces vocabularies worth saving */
  if(ctx->mode == TRAIN_MODE){
    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
  }

  /* stdout is left open; only a file opened here is closed */
  if(ctx->cff_filename){
    fclose(output_file);
  }

  context_free(ctx);
  return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"util.h"
#include"movement_tagger.h"
/*
 * Apply a tagging movement to a configuration.
 *
 * The first word of the buffer receives the POS code postag, is removed
 * from the buffer and pushed on the stack.  When stream is non-zero, the
 * next input word is then appended to the buffer.
 *
 * c:      configuration to update.
 * postag: POS code given to the first buffer word.
 * score:  not used by this function.
 * stream: non-zero to read one more word into the buffer after the move.
 *
 * Returns 1 when a word was moved, 0 when the buffer was empty (the
 * configuration is then left untouched).
 */
int movement_tagger(config *c, int postag, float score, int stream)
{
  word *first;

  if(!queue_is_empty(c->bf)){
    first = queue_elt_n(c->bf, 0);
    word_set_pos(first, postag);
    stack_push(c->st, queue_remove(c->bf));

    /* in stream mode, read a new word and add it to the buffer */
    if(stream){
      config_add_next_word_to_buffer(c);
    }
    return 1;
  }

  return 0;
}
/* Interface of the single movement used by the transition-based tagger. */
#ifndef __MOVEMENT_TAGGER__
#define __MOVEMENT_TAGGER__
#include"config.h"
#include"feat_vec.h"
/*
 * Give the POS code postag to the first word of the buffer of c and move
 * that word from the buffer to the stack.
 *
 * c:      configuration to update.
 * postag: POS code assigned to the first buffer word.
 * score:  score attached to the movement by the caller.
 * stream: non-zero when the input is read as a stream.
 *
 * Returns an int status; see the implementation for its exact meaning.
 */
int movement_tagger(config *c, int postag, float score, int stream);
#endif
#include"oracle_tagger.h"
/*
 * Oracle of the tagger: look up the reference POS of the next word to tag.
 *
 * The index of the first buffer word is used to select the matching word in
 * the reference sentence, whose POS code is returned.
 *
 * c:   current configuration; only its buffer is inspected.
 * ref: reference sentence carrying the expected POS codes.
 *
 * Returns the POS code of the reference word, or -1 when the buffer is
 * empty.
 *
 * NOTE(review): the word index is used as is to address ref->words; no
 * range check is made against the size of the reference sentence.
 */
int oracle_tagger(config *c, sentence *ref)
{
  word *next;  /* first word of the buffer */
  int position;

  if(queue_is_empty(c->bf))
    return -1;

  next = queue_elt_n(c->bf, 0);
  position = word_get_index(next);
  return word_get_pos(ref->words[position]);
}
/* Interface of the oracle used to build training data for the tagger. */
#ifndef __ORACLE_TAGGER__
#define __ORACLE_TAGGER__
#include<stdio.h>
#include<stdlib.h>
#include"config.h"
#include"sentence.h"
/*
 * Return the POS code that the reference sentence ref gives to the word
 * currently to be tagged in configuration c.
 *
 * c:   current configuration.
 * ref: reference sentence holding the expected POS codes.
 *
 * Returns an int POS code; see the implementation for the value returned
 * when there is no word left to tag.
 */
int oracle_tagger(config *c, sentence *ref);
#endif
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"movement_tagger.h"
#include"feat_fct.h"
#include"config2feat_vec.h"
#include"feature_table.h"
#include"dico.h"
/*
 * Helpers private to this file.  Only simple_decoder_tagger() is exported by
 * simple_decoder_tagger.h, so both helpers are declared static: their
 * generic names then cannot clash at link time with functions of other
 * objects of the library.  The definitions further down carry no storage
 * class and therefore take the internal linkage declared here.
 */
static void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose);
static void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose);
/*
 * Greedy tagger entry point: tag the CoNLL input read from f.
 *
 * The work is delegated to the stream decoder when stream_mode is non-zero
 * and to the sentence-buffer decoder otherwise; all other arguments are
 * forwarded unchanged.
 *
 * f:                     input stream.
 * mcd_struct:            column description of the input.
 * d_perceptron_features: feature dictionary of the model.
 * dico_pos:              dictionary of POS tags.
 * ft:                    feature table of the model.
 * fm:                    feature model.
 * verbose:               verbosity flag forwarded to the decoders.
 * stream_mode:           non-zero to select the stream decoder.
 */
void simple_decoder_tagger(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose, int stream_mode)
{
  if(!stream_mode){
    simple_decoder_buffer(f, mcd_struct, d_perceptron_features, dico_pos, ft, fm, verbose);
    return;
  }

  simple_decoder_stream(f, mcd_struct, d_perceptron_features, dico_pos, ft, fm, verbose);
}
/*
 * Tag the input one whole sentence at a time.
 *
 * Each sentence is read into the buffer and its dummy first token dropped.
 * Until the configuration is terminal, the feature vector of the
 * configuration is computed, the best scoring class of the feature table is
 * taken as POS code and applied to the first buffer word.  The words of the
 * stack are then printed on stdout, from the highest stack position down to
 * position 0, one "form<TAB>tag" line per word.
 *
 * f:             input stream.
 * mcd_struct:    column description of the input.
 * dico_features: feature dictionary of the model.
 * dico_pos:      dictionary used to turn POS codes back into strings.
 * ft:            feature table of the model.
 * fm:            feature model.
 * verbose:       not used by this function.
 *
 * NOTE(review): the configuration of a finished sentence is replaced
 * without being released (the config_free call is disabled), and the
 * feature vector is never released either.
 */
void simple_decoder_buffer(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose)
{
  feat_vec *features = feat_vec_new(feature_types_nb);
  config *conf = config_initial(f, mcd_struct, 1000, 0);
  float best_score;
  int tag;
  int pos;
  word *tagged;

  for(;;){
    /* read a sentence and put it in the buffer */
    if(!queue_read_sentence(conf->bf, f, mcd_struct))
      break;
    queue_remove(conf->bf); /* get rid of dummy token */

    while(!config_is_terminal(conf)){
      config2feat_vec_cff(fm, conf, dico_features, features, LOOKUP_MODE);
      tag = feature_table_argmax(features, ft, &best_score);
      if(tag != -1){
        movement_tagger(conf, tag, best_score, 0);
      }
    }

    /* print the stack from its highest position down to position 0 */
    pos = stack_nbelem(conf->st);
    while(pos-- > 0){
      tagged = stack_elt_n(conf->st, pos);
      printf("%s\t%s\n", tagged->input, dico_int2string(dico_pos, word_get_pos(tagged)));
    }

    /* fresh configuration for the next sentence */
    conf = config_initial(f, mcd_struct, 1000, 0);
  }
}
/*
 * Tag the input as a continuous stream of words.
 *
 * Until the configuration is terminal or the buffer runs empty, the feature
 * vector of the configuration is computed, the best scoring class of the
 * feature table is taken as POS code, and the code is applied to the first
 * buffer word with the stream flag set, so that a new word is read into the
 * buffer.  Each tagged word is printed on stdout as a "form<TAB>tag" line,
 * the same format as the sentence-buffer decoder.
 *
 * f:             input stream.
 * mcd_struct:    column description of the input.
 * dico_features: feature dictionary of the model.
 * dico_pos:      dictionary used to turn POS codes back into strings.
 * ft:            feature table of the model.
 * fm:            feature model.
 * verbose:       when non-zero, the configuration is dumped on stdout
 *                before every step.
 *
 * Changes with respect to the previous version: the loop only computed the
 * feature vector and printed the configuration, so the configuration never
 * changed and the loop never ended.  It now performs the tagging step and
 * stops when no word can be tagged; the unconditional configuration dump is
 * kept only under verbose.
 *
 * NOTE(review): tagged words accumulate on the stack for the whole stream
 * and neither the configuration nor the feature vector is released;
 * confirm this is acceptable for long inputs.
 */
void simple_decoder_stream(FILE *f, mcd *mcd_struct, dico *dico_features, dico *dico_pos, feature_table *ft, feat_model *fm, int verbose)
{
  config *c;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  float max;
  int postag;
  word *w;

  c = config_initial(f, mcd_struct, 10, 5);

  while(!config_is_terminal(c)){
    if(verbose)
      config_print(stdout, c);

    /* nothing left to tag */
    if(queue_is_empty(c->bf))
      break;

    config2feat_vec_cff(fm, c, dico_features, fv, LOOKUP_MODE);
    postag = feature_table_argmax(fv, ft, &max);

    /* keep a handle on the word: the movement takes it out of the buffer */
    w = queue_elt_n(c->bf, 0);

    if(postag == -1 || !movement_tagger(c, postag, max, 1))
      break;

    printf("%s\t%s\n", w->input, dico_int2string(dico_pos, word_get_pos(w)));
  }
}
/*
 * Interface of the greedy (beam width 1) tagger decoder.
 *
 * NOTE(review): this header includes nothing itself, so FILE, mcd, dico,
 * feature_table and feat_model must already be declared where it is
 * included.
 */
#ifndef __SIMPLE_DECODER_TAGGER__
#define __SIMPLE_DECODER_TAGGER__
/*
 * Tag the CoNLL input read from f.
 *
 * f:                     input stream.
 * mcd_struct:            column description of the input.
 * d_perceptron_features: feature dictionary of the model.
 * d_labels:              dictionary of the predicted classes (the
 *                        definition names this parameter dico_pos).
 * ft:                    feature table of the model.
 * fm:                    feature model.
 * verbose:               verbosity flag.
 * stream_mode:           non-zero to decode the input as a stream of words,
 *                        zero to decode it sentence by sentence.
 */
void simple_decoder_tagger(FILE *f, mcd *mcd_struct, dico *d_perceptron_features, dico *d_labels, feature_table *ft, feat_model *fm, int verbose, int stream_mode);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment