Commit 1cddee84 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

First version of tagparser; results are still low!

parent ba20febe
set(SOURCES src/context.c
src/feat_desc.c
src/movement_parser_arc_eager.c
# src/movement_parser_arc_eager.c
src/movement_tagparser_arc_eager.c
src/movement_tagger.c
src/feat_fct.c
src/global_feat_vec.c
# src/oracle_parser.c
src/oracle_parser_arc_eager.c
# src/oracle_parser_arc_eager.c
src/oracle_tagparser_arc_eager.c
src/oracle_tagger.c
# src/simple_decoder_parser.c
src/simple_decoder_parser_arc_eager.c
src/simple_decoder_tagparser_arc_eager.c
src/simple_decoder_forrest.c
src/simple_decoder_tagger.c
src/feat_lib.c
......@@ -47,11 +50,17 @@ install (TARGETS maca_trans_tagger_mcf2cff DESTINATION bin)
#target_link_libraries(maca_trans_parser_mcf2cff maca_common)
#install (TARGETS maca_trans_parser_mcf2cff DESTINATION bin)
add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c)
target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron)
target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse)
target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common)
install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin)
#add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c)
#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron)
#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse)
#target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common)
#install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin)
add_executable(maca_trans_tagparser_arc_eager_mcf2cff ./src/maca_trans_tagparser_arc_eager_mcf2cff.c)
target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff perceptron)
target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff transparse)
target_link_libraries(maca_trans_tagparser_arc_eager_mcf2cff maca_common)
install (TARGETS maca_trans_tagparser_arc_eager_mcf2cff DESTINATION bin)
add_executable(compare_traces ./src/compare_traces.c)
target_link_libraries(compare_traces perceptron)
......@@ -65,6 +74,12 @@ target_link_libraries(maca_trans_parser transparse)
target_link_libraries(maca_trans_parser maca_common)
install (TARGETS maca_trans_parser DESTINATION bin)
add_executable(maca_trans_tagparser ./src/maca_trans_tagparser.c)
target_link_libraries(maca_trans_tagparser perceptron)
target_link_libraries(maca_trans_tagparser transparse)
target_link_libraries(maca_trans_tagparser maca_common)
install (TARGETS maca_trans_tagparser DESTINATION bin)
add_executable(maca_trans_parser_forrest ./src/decode_forrest.c)
target_link_libraries(maca_trans_parser_forrest perceptron)
target_link_libraries(maca_trans_parser_forrest transparse)
......
......@@ -65,6 +65,7 @@ context *context_new(void)
ctx->features_model = NULL;
ctx->vocabs = NULL;
ctx->dico_labels = NULL;
ctx->dico_postags = NULL;
ctx->f2p = NULL;
ctx->iteration_nb = 4;
......@@ -163,7 +164,7 @@ context *context_read_options(int argc, char *argv[])
ctx->program_name = strdup(argv[0]);
static struct option long_options[21] =
static struct option long_options[22] =
{
{"help", no_argument, 0, 'h'},
{"verbose", no_argument, 0, 'v'},
......
......@@ -16,6 +16,12 @@
/* Default file name of the tagger model. */
#define DEFAULT_MODEL_TAGGER_FILENAME "maca_trans_tagger.model"
/* Default file name of the form-to-POS (f2p) resource; defined once only. */
#define DEFAULT_F2P_FILENAME "fP"
/* Default resource file names used by the tagparser programs. */
#define DEFAULT_MULTI_COL_DESC_TAGPARSER_FILENAME "maca_trans_tagparser.mcd"
#define DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.fm"
#define DEFAULT_VOCABS_TAGPARSER_FILENAME "maca_trans_tagparser.vocab"
#define DEFAULT_MODEL_TAGPARSER_FILENAME "maca_trans_tagparser.model"
#include "dico_vec.h"
#include "feat_model.h"
#include "mcd.h"
......@@ -52,6 +58,7 @@ typedef struct {
int stream_mode;
dico *d_perceptron_features;
dico *dico_labels;
dico *dico_postags;
char *maca_data_path;
char *language;
char *root_label;
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"feat_fct.h"
#include"feature_table.h"
#include"dico.h"
#include"beam.h"
#include"simple_decoder_tagparser_arc_eager.h"
#include"config2feat_vec.h"
/* Emit the usage text of the tagparser decoder by chaining the
   context_*_help_message helpers: general options first, then, under an
   "INPUT" heading written to stderr, the input, mcd, model, vocabs,
   features model and root label sections.
   The beam and conll help sections are currently disabled. */
void decode_help_message(context *ctx)
{
  context_general_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_input_help_message(ctx);
  context_mcd_help_message(ctx);
  context_model_help_message(ctx);
  context_vocabs_help_message(ctx);
  context_features_model_help_message(ctx);
  context_root_label_help_message(ctx);
}
/* Validate the decoder options.
   Only an explicit help request is acted upon: the help message is shown
   and the program exits with status 1. Checks on the model, mcd, vocabs and
   features model file names are currently disabled. */
void decode_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  decode_help_message(ctx);
  exit(1);
}
/* Return a newly allocated string holding dir followed by name.
   Exits with status 1 if the memory cannot be allocated. */
static char *tagparser_default_filename(const char *dir, const char *name)
{
  size_t size = strlen(dir) + strlen(name) + 1;
  char *filename = (char *)malloc(size);

  if(filename == NULL){
    fprintf(stderr, "cannot allocate memory for file name %s%s\n", dir, name);
    exit(1);
  }
  snprintf(filename, size, "%s%s", dir, name);
  return filename;
}

/* Build "<data path>/<language>/bin/" in a newly allocated string.
   The data path is ctx->maca_data_path when set, otherwise the value of the
   MACAON_DIR environment variable. Exits with status 1 (instead of
   dereferencing a NULL pointer) when no data path or no language is known. */
static char *tagparser_resource_dir(context *ctx)
{
  const char *data_path = ctx->maca_data_path ? ctx->maca_data_path : getenv("MACAON_DIR");
  size_t size;
  char *dir;

  if(data_path == NULL){
    fprintf(stderr, "cannot locate linguistic resources: no data path given and MACAON_DIR is not set\n");
    exit(1);
  }
  if(ctx->language == NULL){
    fprintf(stderr, "cannot locate linguistic resources: no language given\n");
    exit(1);
  }

  size = strlen(data_path) + 1 + strlen(ctx->language) + strlen("/bin/") + 1;
  dir = (char *)malloc(size);
  if(dir == NULL){
    fprintf(stderr, "cannot allocate memory for the linguistic resources path\n");
    exit(1);
  }
  snprintf(dir, size, "%s/%s/bin/", data_path, ctx->language);
  return dir;
}

/* Fill in the resource file names that were not set in ctx with the
   tagparser defaults located in "<data path>/<language>/bin/".

   Concerned fields: perc_model_filename, vocabs_filename,
   features_model_filename and f2p_filename. mcd_filename is left untouched
   (its default is disabled). Paths are built in exactly sized heap buffers,
   so long data paths can no longer overflow a fixed-size array.
   When ctx->verbose is set, the resulting file names are printed on stderr. */
void set_linguistic_resources_filenames_tagparser(context *ctx)
{
  /* the resource directory is only needed when at least one default applies */
  if(!ctx->perc_model_filename || !ctx->vocabs_filename
     || !ctx->features_model_filename || !ctx->f2p_filename){
    char *resource_dir = tagparser_resource_dir(ctx);

    if(!ctx->perc_model_filename)
      ctx->perc_model_filename = tagparser_default_filename(resource_dir, DEFAULT_MODEL_TAGPARSER_FILENAME);

    if(!ctx->vocabs_filename)
      ctx->vocabs_filename = tagparser_default_filename(resource_dir, DEFAULT_VOCABS_TAGPARSER_FILENAME);

    if(!ctx->features_model_filename)
      ctx->features_model_filename = tagparser_default_filename(resource_dir, DEFAULT_FEATURES_MODEL_TAGPARSER_FILENAME);

    if(!ctx->f2p_filename){
      ctx->f2p_filename = tagparser_default_filename(resource_dir, DEFAULT_F2P_FILENAME);
      /* NOTE(review): f2p is only loaded here, i.e. when the default file
         name is used; presumably an explicitly given f2p file is loaded
         while the options are read - confirm. */
      ctx->f2p = form2pos_read(ctx->f2p_filename);
    }

    free(resource_dir);
  }

  if(ctx->verbose){
    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
    /* mcd_filename has no default and may be NULL: never hand NULL to %s */
    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
    fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
  }
}
/* Entry point of the tagparser decoder.
   Reads the options, resolves the resource file names, loads the features
   model and the vocabularies, then runs the arc-eager tagparser decoder.
   Returns 0 on success and 1 when a required dictionary (LABEL or POS) is
   missing from the vocabularies. */
int main(int argc, char *argv[])
{
  context *ctx = context_read_options(argc, argv);

  decode_check_options(ctx);
  set_linguistic_resources_filenames_tagparser(ctx);

  ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);

  ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
  if(ctx->dico_labels == NULL){
    fprintf(stderr, "cannot find label names\n");
    return 1;
  }

  /* the tagparser predicts part-of-speech tags, so their dictionary is
     required as well */
  ctx->dico_postags = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
  if(ctx->dico_postags == NULL){
    fprintf(stderr, "cannot find postag names\n");
    return 1;
  }

  /* NOTE(review): this count is derived from the dependency labels only
     (two movements per label plus three), whereas the tagparser movement
     set also contains EOS and one POSTAG movement per tag - confirm that
     the consumers of mvt_nb expect this value. */
  ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3;

  /* the perceptron feature dictionary is stored among the vocabularies */
  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");

  simple_decoder_tagparser_arc_eager(ctx);

  context_free(ctx);
  return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement_tagparser_arc_eager.h"
#include"oracle_tagparser_arc_eager.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
/* Emit the usage text of the mcf2cff converter by chaining the
   context_*_help_message helpers. Section headings ("INPUT",
   "IN TEST MODE", "OUTPUT", "IN TRAIN MODE") are written to stderr; the
   vocabs section is shown under both the test mode and the train mode
   headings. */
void maca_trans_parser_mcf2cff_help_message(context *ctx)
{
  /* options common to every mode */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);
  fputs("IN TEST MODE\n", stderr);
  context_vocabs_help_message(ctx);

  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);
  fputs("IN TRAIN MODE\n", stderr);
  context_vocabs_help_message(ctx);
}
/* Validate the mcf2cff options.
   An input file name is mandatory; when it is missing, or when help was
   requested, the help message is shown and the program exits with
   status 1. Checks on the mcd file name and on the output file names are
   currently disabled. */
void maca_trans_parser_mcf2cff_check_options(context *ctx)
{
  if(ctx->input_filename && !ctx->help)
    return;

  maca_trans_parser_mcf2cff_help_message(ctx);
  exit(1);
}
/* Replay the oracle on the input corpus and write one line per configuration.
   output_file : stream receiving the generated lines
   ctx         : program context (input file, mcd, dictionaries, modes, ...)
   The reference analysis is loaded in the word buffer ref; the same file is
   read a second time, without its POS, GOV, LABEL and SENT_SEG columns, to
   feed the configuration c on which the movements are applied.
   For every configuration the oracle movement is computed and then either a
   trace line (ctx->trace_mode) or the movement code followed by the feature
   vector is written to output_file.
   The loop ends when ref is exhausted, when ctx->sent_nb sentences have been
   processed, or right after an EOS movement made on the last reference word.
   NOTE(review): fv, ref, mcf_file, mcd_struct_hyp and c are not released
   before this function returns. */
void generate_training_file_stream(FILE *output_file, context *ctx)
{
config *c;
int mvt_code;
/* NOTE(review): receives the int returned by movement_type(); assumes every
   movement type constant fits in a char - confirm */
char mvt_type;
int mvt_label;
feat_vec *fv = feat_vec_new(feature_types_nb);
int sentence_nb = 0;
int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label);
/* reference word buffer, read with the complete mcd */
word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
/* second handle on the same input file, consumed by the configuration c */
FILE *mcf_file = myfopen(ctx->input_filename, "r");
dico *dico_postag = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
/* create an mcd that corresponds to ctx->mcd_struct, but without pos, gov, label and sent_seg */
/* the idea is to ignore these annotations in the mcf file that will be read */
/* it is ugly !!! */
mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct);
mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_POS);
mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV);
mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL);
mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_SENT_SEG);
/* NOTE(review): the meaning of the third argument (5) is not visible here - confirm */
c = config_initial(mcf_file, mcd_struct_hyp, 5);
while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){
/*printf("************ REF ************\n");
word_buffer_print(stdout, ref);
printf("*****************************\n");*/
/* when a form-to-pos resource is loaded, add signatures to the words of the buffer */
if(ctx->f2p)
add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);
/* features are extracted before the movement is applied to c */
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
mvt_code = oracle_tagparser_arc_eager(c, ref, root_label);
mvt_type = movement_type(mvt_code);
mvt_label = movement_label(mvt_code);
if(ctx->debug_mode){
config_print(stdout,c);
movement_print(stdout, mvt_code, ctx->dico_labels, dico_postag);
fprintf(stdout, "\n");
}
/* trace line: index of b0, stack content, movement name, then the constant 1 */
if(ctx->trace_mode){
fprintf(output_file, "%d\t", word_get_index(word_buffer_b0(config_get_buffer(c))));
stack_print(output_file, c->st);
fprintf(output_file, "\t");
movement_print(output_file, mvt_code, ctx->dico_labels, dico_postag);
fprintf(output_file, "\t1\n");
}
/* training line: movement code followed by the feature vector */
else{
fprintf(output_file, "%d", mvt_code);
feat_vec_print(output_file, fv);
}
/* apply the oracle movement to c; ref is advanced only by RIGHT and SHIFT */
if(mvt_type == MVT_EOS){
movement_eos(c, 0);
sentence_nb++;
/* stop once the end of sentence was set on the last reference word */
if(word_buffer_is_last(ref))
break;
}
if(mvt_type == MVT_POSTAG){
movement_add_pos(c, 0, mvt_label);
continue;
}
if(mvt_type == MVT_LEFT){
movement_left_arc(c, mvt_label, 0);
continue;
}
if(mvt_type == MVT_RIGHT){
movement_right_arc(c, mvt_label, 0);
word_buffer_move_right(ref);
continue;
}
if(mvt_type == MVT_REDUCE){
movement_reduce(c, 0);
continue;
}
if(mvt_type == MVT_ROOT){
movement_root(c, 0, root_label);
continue;
}
if(mvt_type == MVT_SHIFT){
movement_shift(c, 1, 0);
word_buffer_move_right(ref);
continue;
}
}
}
/* Entry point of the mcf to cff converter for the tagparser.
   In train mode the dictionaries are extracted from the input corpus and a
   new perceptron feature dictionary is created; in test mode they are read
   from the vocabs file. The oracle output is written to ctx->cff_filename
   when it is set, to stdout otherwise. In train mode the vocabularies are
   finally written to ctx->vocabs_filename.
   Returns 0 on success and 1 when the label dictionary is missing or when
   the output file cannot be opened. */
int main(int argc, char *argv[])
{
  context *ctx;
  FILE *output_file;

  ctx = context_read_options(argc, argv);
  maca_trans_parser_mcf2cff_check_options(ctx);

  ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);

  if(ctx->mode == TRAIN_MODE){
    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
  }
  else if(ctx->mode == TEST_MODE){
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
  }

  ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
  if(ctx->dico_labels == NULL){
    fprintf(stderr, "cannot find label names\n");
    return 1;
  }

  /* NOTE(review): this count is derived from the dependency labels only
     (two movements per label plus three), whereas the tagparser movement
     set also contains EOS and one POSTAG movement per tag - confirm that
     feat_model_compute_ranges expects this value. */
  ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3;

  feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);

  /* in train mode create feature dictionary for perceptron */
  if(ctx->mode == TRAIN_MODE)
    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);

  /* in test mode read feature dictionary for perceptron */
  if(ctx->mode == TEST_MODE)
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");

  /* add the feature dictionary to the dico vector */
  /* NOTE(review): in test mode this dictionary was just fetched from
     ctx->vocabs, so it is added to the vector it already belongs to -
     confirm that dico_vec_add copes with that. */
  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);

  /* open output file */
  output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout;
  /* stdout is never NULL, so a NULL stream means the cff file could not be
     opened (myfopen_no_exit presumably returns NULL on failure) */
  if(output_file == NULL){
    fprintf(stderr, "cannot open file %s\n", ctx->cff_filename);
    return 1;
  }

  generate_training_file_stream(output_file, ctx);

  if(ctx->mode == TRAIN_MODE)
    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);

  if(ctx->cff_filename)
    fclose(output_file);

  context_free(ctx);
  return 0;
}
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include"util.h"
#include"movement_tagparser_arc_eager.h"
/* Write a readable form of the movement mvt_code on f.
   SHIFT, REDUCE, ROOT and EOS are written as a bare name.
   POSTAG is followed by a space and the tag string found in dico_postag.
   RIGHT, and LEFT for any remaining movement type, are followed by a space
   and the label string found in dico_labels. No newline is written. */
void movement_print(FILE *f, int mvt_code, dico *dico_labels, dico *dico_postag)
{
  const int type = movement_type(mvt_code);
  const int label_code = movement_label(mvt_code);

  /* movements without a label */
  if(type == MVT_SHIFT){
    fputs("SHIFT", f);
    return;
  }
  if(type == MVT_REDUCE){
    fputs("REDUCE", f);
    return;
  }
  if(type == MVT_ROOT){
    fputs("ROOT", f);
    return;
  }
  if(type == MVT_EOS){
    fputs("EOS", f);
    return;
  }

  /* tagging movement: the label is a part-of-speech tag */
  if(type == MVT_POSTAG){
    fputs("POSTAG", f);
    fprintf(f, " %s", dico_int2string(dico_postag, label_code));
    return;
  }

  /* arc movements: the label is a dependency label */
  fputs((type == MVT_RIGHT) ? "RIGHT" : "LEFT", f);
  fprintf(f, " %s", dico_int2string(dico_labels, label_code));
}
/* Return the type of the movement code mvt.
   The codes MVT_SHIFT, MVT_REDUCE, MVT_ROOT and MVT_EOS (0 to 3) are their
   own type. For any other code the type is selected by mvt % 3:
     remainder 1 (4, 7, 10, ...) -> MVT_POSTAG
     remainder 2 (5, 8, 11, ...) -> MVT_LEFT
     remainder 0 (6, 9, 12, ...) -> MVT_RIGHT
   Any other remainder falls back on MVT_LEFT. */
int movement_type(int mvt)
{
  if(mvt == MVT_SHIFT || mvt == MVT_REDUCE || mvt == MVT_ROOT || mvt == MVT_EOS)
    return mvt;

  switch(mvt % 3){
  case 0:
    return MVT_RIGHT;
  case 1:
    return MVT_POSTAG;
  default:
    return MVT_LEFT;
  }
}
/* Return the label (or tag) index carried by the movement code mvt.
   MVT_SHIFT, MVT_REDUCE, MVT_ROOT and MVT_EOS carry no label: -1 is
   returned. For the other codes, three consecutive codes share one index:
     mvt % 3 == 1 (pos movement)   -> (mvt - 4) / 3
     mvt % 3 == 2 (left movement)  -> (mvt - 5) / 3
     otherwise    (right movement) -> (mvt - 6) / 3 */
int movement_label(int mvt)
{
  int first_code; /* code that carries index 0 in the family of mvt */

  if(mvt == MVT_SHIFT || mvt == MVT_REDUCE || mvt == MVT_ROOT || mvt == MVT_EOS)
    return -1;

  switch(mvt % 3){
  case 1:
    first_code = 4;
    break;
  case 2:
    first_code = 5;
    break;
  default:
    first_code = 6;
    break;
  }
  return (mvt - first_code) / 3;
}
/* POSTAG movement: give the part of speech pos to the first word of the
   buffer (b0) and record the movement in c. The word stays in the buffer.
   Returns 0, leaving c untouched, when there is no b0 or when b0 already
   has a part of speech (value other than -1); returns 1 otherwise.
   score is not used. */
int movement_add_pos(config *c, float score, int pos)
{
  word *b0 = word_buffer_b0(config_get_buffer(c));

  if(b0 == NULL || word_get_pos(b0) != -1)
    return 0;

  word_set_pos(b0, pos);
  config_add_mvt(c, movement_postag(pos));
  return 1;
}
/* EOS movement: flag the word on top of the stack as a sentence boundary
   (sent_seg set to 1) and record MVT_EOS in c. The stack is not modified.
   Returns 0 when the stack is empty or when the top word is already
   flagged; returns 1 otherwise. score is not used. */
int movement_eos(config *c, float score)
{
  word *s0;

  if(stack_is_empty(config_get_stack(c)))
    return 0;

  s0 = stack_top(config_get_stack(c));
  if(word_get_sent_seg(s0) == 1)
    return 0;

  word_set_sent_seg(s0, 1);
  config_add_mvt(c, MVT_EOS);
  return 1;
}
/* LEFT movement: make the first word of the buffer (b0) the governor of the
   word on top of the stack, with dependency label label, pop the stack and
   record the movement in c.
   The governor is stored as the index of the governor minus the index of
   the dependent.
   Returns 0, leaving c untouched, when the stack is empty, when there is no
   b0 to act as governor, or when the top of the stack already has a
   governor; returns 1 otherwise. score is not used. */
int movement_left_arc(config *c, int label, float score)
{
  if(stack_is_empty(config_get_stack(c))) return 0;

  word *gov = word_buffer_b0(config_get_buffer(c));
  word *dep = stack_top(config_get_stack(c));

  /* without a word in the buffer there is no governor to attach to */
  if(gov == NULL) return 0;

  /* word on top of the stack should not have a governor */
  if(word_get_gov(dep) != WORD_INVALID_GOV) return 0;

  int dist = (word_get_index(gov)) - (word_get_index(dep));

  /* create a new dependency */
  word_set_gov(dep, dist);
  word_set_label(dep, label);

  stack_pop(config_get_stack(c));
  config_add_mvt(c, movement_left_code(label));
  return 1;
}
/* RIGHT movement: make the word on top of the stack the governor of the
   first word of the buffer (b0), with dependency label label, push b0 on
   the stack, advance the buffer and record the movement in c.
   The governor is stored as the index of the governor minus the index of
   the dependent.
   Returns 0, leaving c untouched, when the stack is empty or when there is
   no b0 to attach; returns 1 otherwise. score is not used. */
int movement_right_arc(config *c, int label, float score)
{
  if(stack_is_empty(config_get_stack(c))) return 0;

  word *gov = stack_top(config_get_stack(c));
  word *dep = word_buffer_b0(config_get_buffer(c));

  /* without a word in the buffer there is nothing to attach or to push */
  if(dep == NULL) return 0;

  int dist = (word_get_index(gov)) - (word_get_index(dep));

  /* create a new dependency */
  word_set_gov(dep, dist);
  word_set_label(dep, label);

  stack_push(config_get_stack(c), dep);
  word_buffer_move_right(config_get_buffer(c));
  config_add_mvt(c, movement_right_code(label));
  return 1;
}
/* SHIFT movement: push the first word of the buffer on the stack, advance
   the buffer and record MVT_SHIFT in c.
   Returns 0 when the buffer is empty, 1 otherwise.
   stream and score are not used. */
int movement_shift(config *c, int stream, float score)
{
  if(word_buffer_is_empty(config_get_buffer(c)))
    return 0;

  word *b0 = word_buffer_b0(config_get_buffer(c));

  stack_push(config_get_stack(c), b0);
  word_buffer_move_right(config_get_buffer(c));
  config_add_mvt(c, MVT_SHIFT);
  return 1;
}
/* REDUCE movement: pop the word on top of the stack and record MVT_REDUCE
   in c.
   Returns 0 when the stack holds at most one element or when the word on
   top of the stack has no governor yet; returns 1 otherwise.
   score is not used. */
int movement_reduce(config *c, float score)
{
  if(stack_nbelem(config_get_stack(c)) <= 1
     || word_get_gov(stack_top(config_get_stack(c))) == WORD_INVALID_GOV)
    return 0;

  stack_pop(config_get_stack(c));
  config_add_mvt(c, MVT_REDUCE);
  return 1;
}
/* ROOT movement: mark the word on top of the stack as a root (governor
   set to 0, label set to root_code, is_root set to 1), pop it and record
   MVT_ROOT in c.
   Returns 0 when stack_top yields no word, 1 otherwise.
   score is not used. */
int movement_root(config *c, float score, int root_code)
{
  word *top = stack_top(config_get_stack(c));

  if(top != NULL){
    word_set_gov(top, 0);
    word_set_label(top, root_code);
    top->is_root = 1;

    stack_pop(config_get_stack(c));
    config_add_mvt(c, MVT_ROOT);
    return 1;
  }
  return 0;
}
#ifndef __MOVEMENT_TAGPARSER_ARC_EAGER__
#define __MOVEMENT_TAGPARSER_ARC_EAGER__
#include"config.h"
#include"feat_vec.h"
#define MVT_SHIFT 0
#define MVT_REDUCE 1
#define MVT_ROOT 2
#define MVT_EOS 3