Select Git revision
CITATION.cff
-
Stephane Chavin authored
maca_trans_parser_mcf2cff.c 6.40 KiB
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include"movement_parser.h"
#include"oracle_parser.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
/*
 * Print the usage text of the tool.
 *
 * The generic, mode and sentence-number help is emitted first by the
 * corresponding context_*_help_message() helpers, followed by an INPUT
 * section and an OUTPUT section.  The section banners are written to stderr.
 *
 * ctx: context handed unchanged to every context_*_help_message() helper.
 */
void maca_trans_parser_conll2cff_help_message(context *ctx)
{
  /* options shared by all modes */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);

  /* what the tool reads */
  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);
  fputs("IN TEST MODE\n", stderr);
  context_vocabs_help_message(ctx);

  /* what the tool writes */
  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);
  fputs("IN TRAIN MODE\n", stderr);
  context_vocabs_help_message(ctx);
}
/*
 * Validate the command-line options stored in ctx.
 *
 * The options are accepted when an input file name is present and help was
 * not requested; no other field is checked.  Otherwise the usage text is
 * printed and the process terminates with exit status 1.
 *
 * ctx: context whose input_filename and help fields are inspected.
 */
void maca_trans_parser_conll2cff_check_options(context *ctx)
{
  /* nothing to complain about: return to the caller */
  if(ctx->input_filename && !ctx->help)
    return;

  maca_trans_parser_conll2cff_help_message(ctx);
  exit(1);
}
/*
 * Stream-mode generation of training examples.
 *
 * The file named by ctx->input_filename is opened twice: one handle feeds
 * the parser configuration, the other is read sentence by sentence to get
 * the reference handed to the oracle.  For every configuration visited, the
 * movement code returned by oracle_parser() is written to output_file,
 * immediately followed by the feature vector printed by feat_vec_print().
 * Processing stops when no more reference sentence can be read or once
 * ctx->sent_nb sentences have been completed.
 *
 * output_file: destination stream; it is left open for the caller.
 * ctx:         context providing the input file name, the MCD structure,
 *              the feature model, the label and feature dictionaries, the
 *              mode, the root label and the sentence limit.
 *
 * The configuration, both input handles and every reference sentence are
 * released before returning.
 * NOTE(review): fv is not released here; no release function for feature
 * vectors is visible in this file.
 */
void generate_training_file_stream(FILE *output_file, context *ctx)
{
  config *c;
  int mvt_code;
  char mvt_type;
  int mvt_label;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  sentence *ref = NULL;
  int sentence_nb = 0;
  int root_label = dico_string2int(ctx->dico_labels, (char *) ctx->root_label);
  FILE *conll_file = myfopen(ctx->input_filename, "r");
  FILE *conll_file_ref = myfopen(ctx->input_filename, "r");

  c = config_initial(conll_file, ctx->mcd_struct, 5);

  /* the limit is tested first so that no reference sentence is read (and
     then lost) once enough sentences have been processed */
  while((sentence_nb < ctx->sent_nb)
        && (ref = sentence_read(conll_file_ref, ctx->mcd_struct))){
    while(1){
      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
      mvt_code = oracle_parser(c, ref);
      mvt_type = movement_type(mvt_code);
      mvt_label = movement_label(mvt_code);

      /* one example: movement code followed by the feature vector */
      fprintf(output_file, "%d", mvt_code);
      feat_vec_print(output_file, fv);

      if(queue_is_empty(c->bf)){
        /* leaving this sentence early: its reference is no longer needed */
        sentence_free(ref);
        break;
      }

      if((mvt_type == MVT_RIGHT) && (mvt_label == root_label)){
        /* sentence is complete: create the root arc */
        movement_right_arc(c, mvt_label, 0);
        /* shift dummy word in stack */
        movement_shift(c, 1, 0);
        /* start the next sentence with an empty dependency set */
        depset_free(c->ds);
        c->ds = depset_new();
        sentence_free(ref);
        sentence_nb++;
        c->current_index = queue_renumber_words(c->bf);
        break;
      }

      /* NOTE(review): a movement type other than the three below leaves the
         configuration untouched and the loop repeats -- confirm that the
         oracle can never return one */
      if(mvt_type == MVT_LEFT){
        movement_left_arc(c, mvt_label, 0);
      }
      else if(mvt_type == MVT_RIGHT){
        movement_right_arc(c, mvt_label, 0);
      }
      else if(mvt_type == MVT_SHIFT){
        movement_shift(c, 1, 0);
      }
    }
  }

  /* the configuration is released before the stream it was built on */
  config_free(c);
  if(conll_file_ref){
    fclose(conll_file_ref);
  }
  if(conll_file){
    fclose(conll_file);
  }
}
/*
 * Buffer-mode generation of training examples.
 *
 * The file named by ctx->input_filename is opened twice: one handle fills
 * the configuration queue one sentence at a time, the other yields the
 * reference sentence handed to the oracle.  Until the configuration is
 * terminal, the movement code returned by oracle_parser() is written to
 * output_file, immediately followed by the feature vector printed by
 * feat_vec_print(), and the movement is applied.  A fresh configuration is
 * built for every sentence.  Processing stops when no more reference
 * sentence can be read or after ctx->sent_nb sentences.
 *
 * output_file: destination stream; it is left open for the caller.
 * ctx:         context providing the input file name, the MCD structure,
 *              the feature model, the feature dictionary, the mode and the
 *              sentence limit.
 *
 * Every configuration, every reference sentence and both input handles are
 * released before returning.
 * NOTE(review): fv is not released here; no release function for feature
 * vectors is visible in this file.
 */
void generate_training_file_buffer(FILE *output_file, context *ctx)
{
  config *c;
  int mvt_code;
  char mvt_type;
  int mvt_label;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  sentence *ref = NULL;
  int sentence_nb = 0;
  FILE *conll_file = myfopen(ctx->input_filename, "r");
  FILE *conll_file_ref = myfopen(ctx->input_filename, "r");

  c = config_initial(conll_file, ctx->mcd_struct, 0);

  /* the limit is tested first so that no reference sentence is read (and
     then lost) once enough sentences have been processed */
  while((sentence_nb < ctx->sent_nb)
        && (ref = sentence_read(conll_file_ref, ctx->mcd_struct))){
    queue_read_sentence(c->bf, conll_file, ctx->mcd_struct);

    while(!config_is_terminal(c)){
      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, ctx->mode);
      mvt_code = oracle_parser(c, ref);
      mvt_type = movement_type(mvt_code);
      mvt_label = movement_label(mvt_code);

      /* one example: movement code followed by the feature vector */
      fprintf(output_file, "%d", mvt_code);
      feat_vec_print(output_file, fv);

      /* NOTE(review): a movement type other than the three below leaves the
         configuration untouched and the loop repeats -- confirm that the
         oracle can never return one */
      if(mvt_type == MVT_LEFT){
        movement_left_arc(c, mvt_label, 0);
      }
      else if(mvt_type == MVT_RIGHT){
        movement_right_arc(c, mvt_label, 0);
      }
      else if(mvt_type == MVT_SHIFT){
        movement_shift(c, 0, 0);
      }
    }

    /* this sentence is done: drop its configuration and its reference,
       then start again from a fresh configuration */
    config_free(c);
    sentence_free(ref);
    c = config_initial(conll_file, ctx->mcd_struct, 0);
    sentence_nb++;
  }

  /* release the configuration prepared for a sentence that never came,
     then the streams */
  config_free(c);
  if(conll_file_ref){
    fclose(conll_file_ref);
  }
  if(conll_file){
    fclose(conll_file);
  }
}
/*
 * Program entry point.
 *
 * Reads the options, loads the feature model, prepares the vocabularies
 * (extracted from the corpus in train mode, read from ctx->vocabs_filename
 * in test mode), then writes the training examples either to
 * ctx->cff_filename or to stdout, using the stream or the buffer generator
 * depending on ctx->stream_mode.  In train mode the vocabularies are written
 * to ctx->vocabs_filename afterwards.
 *
 * Returns 0 on success, 1 when the label dictionary cannot be found or when
 * closing the output file fails.
 */
int main(int argc, char *argv[])
{
  context *ctx;
  FILE *output_file;
  int status = 0;

  ctx = context_read_options(argc, argv);
  maca_trans_parser_conll2cff_check_options(ctx);

  ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);

  if(ctx->mode == TRAIN_MODE){
    mcd_extract_dico_from_corpus(ctx->mcd_struct, ctx->input_filename);
    ctx->vocabs = mcd_build_dico_vec(ctx->mcd_struct);
  }
  else if(ctx->mode == TEST_MODE){
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
  }

  ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");

  if(ctx->dico_labels == NULL){
    fprintf(stderr, "cannot find label names\n");
    return 1;
  }

  /* presumably one left-arc and one right-arc movement per label, plus
     shift -- TODO confirm against the movement encoding */
  ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 1;

  feat_model_compute_ranges(ctx->features_model, ctx->mcd_struct, ctx->mvt_nb);

  /* in train mode create feature dictionary for perceptron */
  if(ctx->mode == TRAIN_MODE){
    ctx->d_perceptron_features = dico_new((char *)"d_perceptron_features", 10000000);
  }

  /* in test mode read feature dictionary for perceptron */
  if(ctx->mode == TEST_MODE){
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
  }

  /* add the feature dictionary to the dico vector */
  /* NOTE(review): in test mode this dictionary was just obtained from
     ctx->vocabs, so it is added to the vector it came from -- confirm that
     dico_vec_add() and context_free() cope with that */
  dico_vec_add(ctx->vocabs, ctx->d_perceptron_features);

  /* open output file */
  if(ctx->cff_filename){
    output_file = myfopen(ctx->cff_filename, "w");
  }
  else{
    output_file = stdout;
  }

  if(ctx->stream_mode){
    generate_training_file_stream(output_file, ctx);
  }
  else{
    generate_training_file_buffer(output_file, ctx);
  }

  if(ctx->mode == TRAIN_MODE){
    dico_vec_print(ctx->vocabs_filename, ctx->vocabs);
  }

  /* fclose() flushes the buffered examples: a failure here means the output
     file is incomplete and must not be reported as a success */
  if(ctx->cff_filename){
    if(fclose(output_file) != 0){
      perror(ctx->cff_filename);
      status = 1;
    }
  }

  context_free(ctx);
  return status;
}