Commit 72eba5aa authored by Alexis Nasr's avatar Alexis Nasr
Browse files

stable version of arc eager parser

parent 89a7f9f3
......@@ -53,7 +53,7 @@
/* Setters for the per-word feature slots stored in wf_array.
   Each macro assigns `val` to the slot selected by the matching MCD_WF_*
   index. `w` is a pointer to the word. The whole expansion is parenthesised
   so that the macro behaves as a single expression wherever it is used.

   word_set_word_seg previously declared only `w` while its body referred to
   `val`, so it silently picked up whatever `val` happened to be in scope at
   the call site (or failed to compile). It now takes the value explicitly,
   like every other setter. */
#define word_set_gov(w, val)      ((w)->wf_array[MCD_WF_GOV] = (val))
#define word_set_label(w, val)    ((w)->wf_array[MCD_WF_LABEL] = (val))
#define word_set_stag(w, val)     ((w)->wf_array[MCD_WF_STAG] = (val))
#define word_set_word_seg(w, val) ((w)->wf_array[MCD_WF_WORD_SEG] = (val))
#define word_set_sent_seg(w, val) ((w)->wf_array[MCD_WF_SENT_SEG] = (val))
#define word_set_A(w, val)        ((w)->wf_array[MCD_WF_A] = (val))
#define word_set_B(w, val)        ((w)->wf_array[MCD_WF_B] = (val))
#define word_set_C(w, val)        ((w)->wf_array[MCD_WF_C] = (val))
......
......@@ -8,6 +8,7 @@ set(SOURCES src/context.c
src/oracle_parser_arc_eager.c
src/oracle_tagger.c
src/simple_decoder_parser.c
src/simple_decoder_parser_arc_eager.c
src/simple_decoder_forrest.c
src/simple_decoder_tagger.c
src/feat_lib.c
......@@ -46,6 +47,12 @@ target_link_libraries(maca_trans_parser_mcf2cff transparse)
target_link_libraries(maca_trans_parser_mcf2cff maca_common)
install (TARGETS maca_trans_parser_mcf2cff DESTINATION bin)
# maca_trans_parser_arc_eager_mcf2cff: executable built from the single source
# file src/maca_trans_parser_arc_eager_mcf2cff.c, linked against the
# perceptron, transparse and maca_common libraries, and installed into bin.
add_executable(maca_trans_parser_arc_eager_mcf2cff ./src/maca_trans_parser_arc_eager_mcf2cff.c)
target_link_libraries(maca_trans_parser_arc_eager_mcf2cff perceptron)
target_link_libraries(maca_trans_parser_arc_eager_mcf2cff transparse)
target_link_libraries(maca_trans_parser_arc_eager_mcf2cff maca_common)
install (TARGETS maca_trans_parser_arc_eager_mcf2cff DESTINATION bin)
add_executable(maca_trans_parser ./src/maca_trans_parser.c)
target_link_libraries(maca_trans_parser perceptron)
target_link_libraries(maca_trans_parser transparse)
......
......@@ -6,6 +6,7 @@
#include"context.h"
#include"movement_parser.h"
#include"oracle_parser.h"
#include"oracle_parser_arc_eager.h"
#include"feat_fct.h"
#include"feature_table.h"
#include"dico.h"
......
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement_parser_arc_eager.h"
#include"oracle_parser_arc_eager.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
/* Prints the usage text of the program on stderr.
   The general, mode and sentence-number option descriptions come first,
   followed by four titled sections (INPUT, IN TEST MODE, OUTPUT,
   IN TRAIN MODE), each filled in by the matching context_*_help_message
   helper. The vocabulary help is printed under both the test-mode and the
   train-mode titles. */
void maca_trans_parser_mcf2cff_help_message(context *ctx)
{
  /* options common to every invocation */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);

  fputs("IN TEST MODE\n", stderr);
  context_vocabs_help_message(ctx);

  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);

  fputs("IN TRAIN MODE\n", stderr);
  context_vocabs_help_message(ctx);
}
/* Validates the command-line options held in ctx.
   The invocation is accepted only when an input file name was given and
   help was not requested; otherwise the usage text is printed and the
   process exits with status 1.
   The checks on mcd_filename and on cff_filename / fann_filename are
   deliberately not enforced here (they were disabled in the original). */
void maca_trans_parser_mcf2cff_check_options(context *ctx)
{
  /* nothing to report: an input file is present and no help was asked for */
  if(ctx->input_filename && !ctx->help)
    return;

  maca_trans_parser_mcf2cff_help_message(ctx);
  exit(1);
}
/* Generates classifier training examples for the arc-eager parser.

   The input file named by ctx->input_filename is read twice:
     - once in full, into the reference word buffer `ref` (with the gold
       governors and labels),
     - once as a stream feeding the parser configuration `c`, through a copy
       of the mcd from which the GOV and LABEL columns have been removed, so
       that the configuration does not see the gold syntax.

   For every configuration, the feature vector is computed, the oracle is
   asked for the movement to apply, and the movement code followed by the
   feature vector (as formatted by feat_vec_print) is written to output_file.
   The movement is then applied to the configuration, and the reference
   buffer is advanced on RIGHT and SHIFT movements.

   The loop stops when the reference buffer is exhausted, when ctx->sent_nb
   sentences have been processed, or right after the EOS movement of the
   last reference word.

   output_file : stream receiving the examples; it is not closed here.
   ctx         : program context (features model, dictionaries, mode, ...). */
void generate_training_file_stream(FILE *output_file, context *ctx)
{
  config *c;
  int mvt_code;
  char mvt_type;
  int mvt_label;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  int sentence_nb = 0;
  int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label);
  word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
  FILE *mcf_file = myfopen(ctx->input_filename, "r");
  int start_sentence_index = 0;

  /* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */
  /* the idea is to ignore the syntax present in the mcf file that will be read */
  mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct);
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV);
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL);

  c = config_initial_no_dummy_word(mcf_file, mcd_struct_hyp, 5);

  while(!word_buffer_end(ref) && (sentence_nb < ctx->sent_nb)){
    /* features of the current configuration, then the movement the oracle picks for it */
    config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
    mvt_code = oracle_parser_arc_eager(c, ref, start_sentence_index, root_label);
    mvt_type = movement_type(mvt_code);
    mvt_label = movement_label(mvt_code);

    /* one training example: movement code followed by the feature vector */
    fprintf(output_file, "%d", mvt_code);
    feat_vec_print(output_file, fv);

    if(mvt_type == MVT_EOS){
      movement_eos(c, 0);
      sentence_nb++;
      /* the next sentence starts just before the word now at the head of the buffer */
      start_sentence_index = word_get_index(word_buffer_b0(config_get_buffer(c))) - 1;
      if(word_buffer_is_last(ref))
        break;
      continue; /* no other movement applies to an EOS code */
    }
    if(mvt_type == MVT_LEFT){
      movement_left_arc(c, mvt_label, 0);
      continue;
    }
    if(mvt_type == MVT_RIGHT){
      movement_right_arc(c, mvt_label, 0);
      word_buffer_move_right(ref);
      continue;
    }
    if(mvt_type == MVT_REDUCE){
      movement_reduce(c, 0);
      continue;
    }
    if(mvt_type == MVT_ROOT){
      movement_root(c, 0, root_label);
      continue;
    }
    if(mvt_type == MVT_SHIFT){
      movement_shift(c, 1, 0);
      word_buffer_move_right(ref);
      continue;
    }
  }

  /* The stream feeding the configuration was opened here and is not used by
     anything after this point: close it instead of leaking the descriptor. */
  fclose(mcf_file);
}
/* Entry point: converts an annotated input file into classifier examples.

   Steps:
     1. read and validate the command-line options,
     2. read the features model,
     3. obtain the vocabularies: extracted from the corpus in train mode,
        read from ctx->vocabs_filename in test mode,
     4. obtain the feature dictionary: created empty in train mode, fetched
        from the vocabularies in test mode,
     5. write the examples to ctx->cff_filename, or to stdout when no output
        file name was given,
     6. in train mode, save the vocabularies to ctx->vocabs_filename.

   Returns 0 on success, 1 when the label dictionary cannot be found or the
   output file cannot be opened. */
int main(int argc, char *argv[])
{
  context *ctx;
  FILE *output_file;

  ctx = context_read_options(argc, argv);
  maca_trans_parser_mcf2cff_check_options(ctx);

  ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);

  if(ctx->mode == TRAIN_MODE){
    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
  }
  else if(ctx->mode == TEST_MODE){
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
  }

  ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
  if(ctx->dico_labels == NULL){
    fprintf(stderr, "cannot find label names\n");
    return 1;
  }

  /* NOTE(review): this counts one movement plus two per label. If this
     program uses the movement coding with four label-less codes
     (SHIFT, REDUCE, ROOT, EOS), the count may be too small - confirm. */
  ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1;

  feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);

  /* in train mode create the feature dictionary for the perceptron */
  if(ctx->mode == TRAIN_MODE)
    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);

  /* in test mode read the feature dictionary for the perceptron */
  if(ctx->mode == TEST_MODE)
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");

  /* add the feature dictionary to the dico vector */
  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);

  /* open the output file; fall back to stdout when none was requested */
  output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w") : stdout;
  if(output_file == NULL){
    /* myfopen_no_exit presumably returns NULL instead of exiting on failure;
       do not hand a NULL stream to the generator */
    fprintf(stderr, "cannot open output file %s\n", ctx->cff_filename);
    return 1;
  }

  generate_training_file_stream(output_file, ctx);

  if(ctx->mode == TRAIN_MODE)
    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);

  /* only close what was opened here, never stdout */
  if(ctx->cff_filename)
    fclose(output_file);

  context_free(ctx);
  return 0;
}
......@@ -3,8 +3,8 @@
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement_parser_arc_eager.h"
#include"oracle_parser_arc_eager.h"
#include"movement_parser.h"
#include"oracle_parser.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
......@@ -12,7 +12,7 @@
#include"word_emb.h"
#include"config2feat_vec.h"
void maca_trans_parser_mcf2cff_help_message(context *ctx)
void maca_trans_parser_conll2cff_help_message(context *ctx)
{
context_general_help_message(ctx);
context_mode_help_message(ctx);
......@@ -30,14 +30,14 @@ void maca_trans_parser_mcf2cff_help_message(context *ctx)
}
void maca_trans_parser_mcf2cff_check_options(context *ctx)
void maca_trans_parser_conll2cff_check_options(context *ctx)
{
if(!ctx->input_filename
|| ctx->help
/* || !ctx->mcd_filename */
/* || !(ctx->cff_filename || ctx->fann_filename) */
){
maca_trans_parser_mcf2cff_help_message(ctx);
maca_trans_parser_conll2cff_help_message(ctx);
exit(1);
}
}
......@@ -49,86 +49,122 @@ void generate_training_file_stream(FILE *output_file, context *ctx)
char mvt_type;
int mvt_label;
feat_vec *fv = feat_vec_new(feature_types_nb);
sentence *ref = NULL;
int sentence_nb = 0;
/* int root_label = dico_string2int(mcd_get_dico_label(ctx->mcd_struct), (char *) ctx->root_label); */
int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label);
int eos_label = dico_string2int(ctx->dico_labels, "eos");
word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
FILE *mcf_file = myfopen(ctx->input_filename, "r");
int start_sentence_index = 0;
/* create an mcd that corresponds to ctx->mcd_struct, but without gov and label */
/* the idea is to ignore syntax in the mcf file that will be read */
/* it is ugly !!! */
mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct);
mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV);
mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL);
c = config_initial_no_dummy_word(mcf_file, mcd_struct_hyp, 5);
FILE *conll_file = myfopen(ctx->input_filename, "r");
FILE *conll_file_ref = myfopen(ctx->input_filename, "r");
c = config_initial(conll_file, ctx->mcd_struct, 5);
while(!word_buffer_end(ref)){
/*printf("************ REF ************\n");
word_buffer_print(stdout, ref);
printf("*****************************\n");*/
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
/* feat_vec_print(stdout, fv); */
mvt_code = oracle_parser_arc_eager(c, ref, start_sentence_index, root_label);
while((ref = sentence_read(conll_file_ref , ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
/* sentence_print(stdout, ref, ctx->dico_labels); */
while(1){
/* config_print(stdout,c); */
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
mvt_type = movement_type(mvt_code);
mvt_label = movement_label(mvt_code);
/* config_print(stdout,c); */
/* movement_print(stdout, mvt_code, ctx->dico_labels); */
fprintf(output_file, "%d", mvt_code);
feat_vec_print(output_file, fv);
if(mvt_type == MVT_EOS){
/* printf("************BEFORE *****************\n"); */
/* config_print(stdout,c); */
movement_eos(c, 0);
/* feat_vec_print(stdout, fv); */
mvt_code = oracle_parser(c, ref);
mvt_type = movement_type(mvt_code);
mvt_label = movement_label(mvt_code);
/* printf("************AFTER*****************\n"); */
/* config_print(stdout,c); */
start_sentence_index = word_get_index(word_buffer_b0(config_get_buffer(c))) - 1;
/* printf("%d\n", start_sentence_index); */
/* printf("mvt code = %d\n", mvt_code); */
/* movement_print(stdout, mvt_code, ctx->dico_labels); */
fprintf(output_file, "%d", mvt_code);
feat_vec_print(output_file, fv);
if(queue_is_empty(c->bf)) break;
if(word_buffer_is_last(ref)){
/* printf("it is the end\n"); */
if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){ /* sentence is complete */
/* create the root arc */
movement_right_arc(c, mvt_label, 0);
/* shift dummy word in stack */
movement_shift(c, 1, 0);
/* printf("sentence complete config : ");
config_print(stdout,c); */
/* empty depset */
depset_free(c->ds);
c->ds = depset_new();
sentence_free(ref);
sentence_nb++;
c->current_index = queue_renumber_words(c->bf);
break;
}
if(mvt_type == MVT_LEFT){
movement_left_arc(c, mvt_label, 0);
continue;
}
if(mvt_type == MVT_RIGHT){
movement_right_arc(c, mvt_label, 0);
continue;
}
if(mvt_type == MVT_SHIFT){
movement_shift(c, 1, 0);
continue;
}
}
if(mvt_type == MVT_LEFT){
movement_left_arc(c, mvt_label, 0);
continue;
}
if(mvt_type == MVT_RIGHT){
movement_right_arc(c, mvt_label, 0);
word_buffer_move_right(ref);
continue;
}
if(mvt_type == MVT_REDUCE){
movement_reduce(c, 0);
continue;
}
if(mvt_type == MVT_ROOT){
movement_root(c, 0, root_label);
continue;
}
}
}
void generate_training_file_buffer(FILE *output_file, context *ctx)
{
config *c;
int mvt_code;
char mvt_type;
int mvt_label;
feat_vec *fv = feat_vec_new(feature_types_nb);
sentence *ref = NULL;
int sentence_nb = 0;
FILE *conll_file = myfopen(ctx->input_filename, "r");
FILE *conll_file_ref = myfopen(ctx->input_filename, "r");
if(mvt_type == MVT_SHIFT){
movement_shift(c, 1, 0);
word_buffer_move_right(ref);
continue;
c = config_initial(conll_file, ctx->mcd_struct, 0);
while((ref = sentence_read(conll_file_ref, ctx->mcd_struct)) && (sentence_nb < ctx->sent_nb)){
/* sentence_print(stdout, ref, NULL); */
queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);
while(!config_is_terminal(c)){
/* config_print(stdout,c); */
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
mvt_code = oracle_parser(c, ref);
mvt_type = movement_type(mvt_code);
mvt_label = movement_label(mvt_code);
/* printf("mvt type = %d mvt label = %d\n", mvt_type, mvt_label); */
fprintf(output_file, "%d", mvt_code);
feat_vec_print(output_file, fv);
if(mvt_type == MVT_LEFT){
movement_left_arc(c, mvt_label, 0);
continue;
}
if(mvt_type == MVT_RIGHT){
movement_right_arc(c, mvt_label, 0);
continue;
}
if(mvt_type == MVT_SHIFT){
movement_shift(c, 0, 0);
continue;
}
}
config_free(c);
c = config_initial(conll_file, ctx->mcd_struct, 0);
sentence_nb++;
}
}
......@@ -138,7 +174,7 @@ int main(int argc, char *argv[])
FILE *output_file;
ctx = context_read_options(argc, argv);
maca_trans_parser_mcf2cff_check_options(ctx);
maca_trans_parser_conll2cff_check_options(ctx);
ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
......@@ -182,18 +218,12 @@ int main(int argc, char *argv[])
else
output_file = stdout;
generate_training_file_stream(output_file, ctx);
#if 0
=======
if(ctx->stream_mode){
generate_training_file_stream(output_file, ctx);
}
else{
generate_training_file_buffer(output_file, ctx);
}
>>>>>>> master
#endif
if(ctx->mode == TRAIN_MODE){
/* dico_print(ctx->perceptron_features_filename, ctx->d_perceptron_features); */
dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
......
......@@ -8,10 +8,11 @@ void movement_print(FILE *f, int mvt_code, dico *dico_labels){
int mvt_type = movement_type(mvt_code);
int mvt_label = movement_label(mvt_code);
char *label;
if(mvt_type == MVT_SHIFT) {fprintf(f, "SHIFT\n"); return;}
if(mvt_type == MVT_SHIFT) {fprintf(f, "SHIFT\n"); return;}
if(mvt_type == MVT_REDUCE) {fprintf(f, "REDUCE\n"); return;}
if(mvt_type == MVT_ROOT) {fprintf(f, "ROOT\n"); return;}
if(mvt_type == MVT_RIGHT) fprintf(f, "RIGHT");
if(mvt_type == MVT_ROOT) {fprintf(f, "ROOT\n"); return;}
if(mvt_type == MVT_EOS) {fprintf(f, "EOS\n"); return;}
if(mvt_type == MVT_RIGHT) fprintf(f, "RIGHT");
else fprintf(f, "LEFT");
label = dico_int2string(dico_labels, mvt_label);
fprintf(f, " %s\n", label);
......@@ -19,20 +20,20 @@ void movement_print(FILE *f, int mvt_code, dico *dico_labels){
/* Returns the type (MVT_SHIFT, MVT_REDUCE, MVT_ROOT, MVT_EOS, MVT_LEFT or
   MVT_RIGHT) of the movement whose code is mvt.

   The four label-less movements are recognised first; every other code
   denotes a labelled arc whose direction is given by its parity.

   The block previously held two copies of the decision chain, the second
   one unreachable after the first unconditional return; only one is kept.
   The comparisons use the named constants, which are expected to be
   0 (shift), 1 (reduce), 2 (root) and 3 (eos) - TODO confirm in the header. */
int movement_type(int mvt)
{
  if(mvt == MVT_SHIFT)  return MVT_SHIFT;   /* code of shift  */
  if(mvt == MVT_REDUCE) return MVT_REDUCE;  /* code of reduce */
  if(mvt == MVT_ROOT)   return MVT_ROOT;    /* code of root   */
  if(mvt == MVT_EOS)    return MVT_EOS;     /* code of eos    */
  if(mvt % 2 == 0)      return MVT_LEFT;    /* even movements are left movements */
  return MVT_RIGHT;                         /* odd movements are right movements */
}
int movement_label(int mvt)
{
if(mvt == 0) return -1; /* 0 is the code of shift */
if(mvt == 1) return -1; /* 1 is the code of reduce */
if(mvt == 2) return -1; /* 2 is the code of root */
if(mvt == 3) return -1; /* 3 is the code of eos */
if(mvt == MVT_SHIFT) return -1; /* 0 is the code of shift */
if(mvt == MVT_REDUCE) return -1; /* 1 is the code of reduce */
if(mvt == MVT_ROOT) return -1; /* 2 is the code of root */
if(mvt == MVT_EOS) return -1; /* 3 is the code of eos */
if(mvt % 2 == 0) /* even codes correspond to left movements */
return mvt / 2 - 2;
return (mvt - 1) / 2 - 2; /* odd codes correspond to right movements */
......@@ -40,11 +41,18 @@ int movement_label(int mvt)
/* Applies the end-of-sentence movement to configuration c.

   When the stack is empty nothing is done. Otherwise the word on top of
   the stack is flagged as a sentence boundary, movement_reduce is applied
   repeatedly until it reports failure, one more element is popped from the
   stack, and MVT_EOS is recorded in the configuration's movement history.

   score is accepted but not used.
   Always returns 1. */
int movement_eos(config *c, float score)
{
  if(!stack_is_empty(config_get_stack(c))){
    /* the word on top of the stack closes the sentence */
    word_set_sent_seg(stack_top(config_get_stack(c)), 1);

    /* perform every pending reduce: stop at the first one that fails */
    for(;;){
      if(!movement_reduce(c, 0))
        break;
    }

    /* remove the root from the stack */
    stack_pop(config_get_stack(c));

    config_add_mvt(c, MVT_EOS);
  }
  return 1;
}
......@@ -82,7 +90,6 @@ int movement_right_arc(config *c, int label, float score)
/* printf("create right arc %d -> %d dist = %d\n", word_get_index(gov), word_get_index(dep), dist); */
/* create a new dependency */
word_set_gov(dep, dist);
word_set_label(dep, label);
......@@ -118,8 +125,6 @@ int movement_root(config *c, float score, int root_code)
word *b0 = word_buffer_b0(config_get_buffer(c));
word_set_gov(b0, 0);
word_set_label(b0, root_code);
/* stack_push(config_get_stack(c), b0); */
/* word_buffer_move_right(config_get_buffer(c)); */
config_add_mvt(c, MVT_ROOT);
return 1;
}
......
......@@ -4,12 +4,55 @@
#include<unistd.h>
#include<getopt.h>
#include"context.h"
#include"movement_parser_arc_eager.h"
#include"movement_parser.h"
#include"oracle_parser.h"
#include"feat_fct.h"
#include"config2feat_vec.h"
#include"feature_table.h"
#include"dico.h"
void simple_decoder_buffer(context *ctx, FILE *f, feature_table *ft, int root_label)
{
int mvt_code;
int mvt_type;
int mvt_label;
float max;
feat_vec *fv = feat_vec_new(feature_types_nb);
config *c = config_initial(f, ctx->mcd_struct, 0);
/* read a sentence and put it in the buffer */
while(queue_read_sentence(c->bf, f, ctx->mcd_struct) > 1){
while(!config_is_terminal(c)){
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
mvt_code = feature_table_argmax(fv, ft, &max);
mvt_type = movement_type(mvt_code);
mvt_label = movement_label(mvt_code);
if(mvt_type == MVT_LEFT)
if(movement_left_arc(c, mvt_label, max))
continue;
if(mvt_type == MVT_RIGHT)
if(movement_right_arc(c, mvt_label, max))
continue;
movement_shift(c, 0, max);
}
/* config_print(stdout, c); */
config_connect_subtrees(c, root_label);
depset_print2(stdout, c->ds, ctx->dico_labels);