Select Git revision
maca_tm_mcf2scf.c
maca_tm_mcf2scf.c 6.80 KiB
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
/* #include"movement_parser_arc_eager.h" */
#include"movements.h"
#include"oracle_parser_arc_eager.h"
#include"oracle_tagger.h"
#include"feat_fct.h"
#include"context.h"
#include"feat_vec.h"
#include"dico_vec.h"
#include"word_emb.h"
#include"config2feat_vec.h"
#include"classifier.h"
/*
 * Oracle for the morphology state.
 *
 * Reads the morphological features of the word returned by word_buffer_b0()
 * on the configuration's buffer and returns the code that `tagset`
 * associates with an MVT_MORPHO movement carrying those features.
 */
int oracle_morpho(config *c, mvt_tagset *tagset)
{
  return mvt_tagset_get_code(tagset,
                             MVT_MORPHO,
                             word_get_feats(word_buffer_b0(config_get_buffer(c))));
}
/*
 * Fill in the signature of the words of `bf` that do not have one yet.
 *
 * The buffer is walked from its last element towards its first one and the
 * walk stops at the first word whose signature is already set (!= -1).
 * For every other word the signature is looked up in `f2p` using the word
 * form; when that lookup yields -1, a second lookup is made with a
 * lower-cased copy of the form.
 *
 * The lower-cased copy lives in a small stack buffer when the form fits and
 * in a heap buffer otherwise, so arbitrarily long forms cannot overflow it.
 * If the heap allocation fails, the word simply keeps the signature -1.
 */
void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
{
  int i;
  word *w;
  char small_buf[100];

  for(i = word_buffer_get_nbelem(bf) - 1; i >= 0; i--){
    char *lower_form;
    size_t form_len;

    w = word_buffer_get_word_n(bf, i);
    if(word_get_signature(w) != -1) break;

    w->signature = form2pos_get_signature(f2p, w->form);
    if(w->signature != -1) continue;

    /* fall back on the lower-cased form */
    form_len = strlen(w->form);
    lower_form = (form_len < sizeof small_buf) ? small_buf : malloc(form_len + 1);
    if(lower_form == NULL) continue; /* out of memory: leave the signature unset */

    memcpy(lower_form, w->form, form_len + 1); /* +1 copies the terminating '\0' */
    to_lower_string(lower_form);
    w->signature = form2pos_get_signature(f2p, lower_form);

    if(lower_form != small_buf) free(lower_form);
  }
}
/*
 * Print the usage message of the program.
 *
 * The section titles are written to stderr; the option descriptions
 * themselves are delegated to the context_*_help_message() helpers.
 */
void maca_tm_mcf2scf_help_message(context *ctx)
{
  /* options that are not tied to a section */
  context_general_help_message(ctx);
  context_mode_help_message(ctx);
  context_sent_nb_help_message(ctx);
  context_tm_help_message(ctx);

  fputs("INPUT\n", stderr);
  context_conll_help_message(ctx);

  fputs("IN TEST MODE\n", stderr);
  context_vocabs_help_message(ctx);

  fputs("OUTPUT\n", stderr);
  context_cff_help_message(ctx);

  fputs("IN TRAIN MODE\n", stderr);
  context_vocabs_help_message(ctx);
}
/*
 * Validate the command line options stored in `ctx`.
 *
 * Returns normally when an input file name was given and help was not
 * requested. Otherwise the help message is printed and the program
 * terminates with exit status 1.
 * No check is made here on the mcd, cff or fann file names.
 */
void maca_tm_mcf2scf_check_options(context *ctx)
{
  if(ctx->input_filename && !ctx->help)
    return;

  maca_tm_mcf2scf_help_message(ctx);
  exit(1);
}
/*
 * Replay the input mcf file through the transition machine, asking at every
 * step the oracle of the current state for the movement to perform, and
 * write one line of output per step.
 *
 * Two views of the input are used:
 *  - `ref`, loaded with the full ctx->mcd_struct, is the reference handed
 *    to the parser oracle;
 *  - the configuration `c` reads the same file through a copy of the mcd
 *    from which the GOV, LABEL and SENT_SEG columns were removed, so that
 *    the syntax present in the file is ignored on that side.
 *
 * For every step the classifier number of the current state, a tab, the
 * movement code and then the feature vector (via feat_vec_print) are
 * written to ctx->cff_filename, or to stdout when no file name was given.
 *
 * The loop stops at the end of the configuration's word buffer or once
 * ctx->sent_nb MVT_EOS movements have been seen. The program exits with
 * status 1 if the output file cannot be opened or if the current state is
 * neither the parser, the morpho nor the tagger state.
 */
void generate_scf_file(context *ctx)
{
  config *c;
  int mvt_code;
  int sentence_nb = 0;
  int root_label = 0;
  word_buffer *ref = word_buffer_load_mcf(ctx->input_filename, ctx->mcd_struct);
  FILE *mcf_file = myfopen(ctx->input_filename, "r");
  tm *machine = ctx->machine;
  tm_state *current_state = NULL;
  int mvt_type;
  dico *d_synt_labels;
  classifier *classif = NULL;
  FILE *output_file;
  int parser_state_nb = tm_get_parser_state(machine);
  int morpho_state_nb = tm_get_morpho_state(machine);
  int tagger_state_nb = tm_get_tagger_state(machine);

  /* build an mcd equivalent to ctx->mcd_struct but without the gov, label */
  /* and sentence segmentation columns: the syntax found in the mcf file   */
  /* must be ignored when it is read into the configuration                */
  mcd *mcd_struct_hyp = mcd_copy(ctx->mcd_struct);
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_GOV);
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_LABEL);
  mcd_remove_wf_column(mcd_struct_hyp, MCD_WF_SENT_SEG);

  /* open output file */
  output_file = (ctx->cff_filename) ? myfopen_no_exit(ctx->cff_filename, "w", ctx->verbose) : stdout;
  if(output_file == NULL){
    /* the "no exit" opener was used: without this test every write below */
    /* would be done on a NULL stream                                      */
    fprintf(stderr, "cannot open output file %s\n", ctx->cff_filename);
    exit(1);
  }

  d_synt_labels = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LABEL");
  if(d_synt_labels == NULL){
    /* not fatal: root_label keeps its initial value 0 */
    fprintf(stderr, "cannot find syntactic label alphabet in transition machine tape dictionaries\n");
  }
  else{
    root_label = dico_string2int(d_synt_labels, (char *) ctx->root_label);
  }

  c = config_new(mcf_file, mcd_struct_hyp, 5);

  while(!word_buffer_end(config_get_buffer(c)) && (sentence_nb < ctx->sent_nb)){
    current_state = machine->state_array[c->current_state_nb];
    classif = current_state->classif;

    if(ctx->f2p)
      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);

    /* pick the oracle that matches the current state of the machine */
    if(c->current_state_nb == parser_state_nb){
      mvt_code = oracle_parser_arc_eager(c, ref, root_label, classifier_get_output_tagset(classif));
    }
    else if(c->current_state_nb == morpho_state_nb){
      mvt_code = oracle_morpho(c, classifier_get_output_tagset(classif));
    }
    else if(c->current_state_nb == tagger_state_nb){
      mvt_code = oracle_tagger(c, classifier_get_output_tagset(classif));
    }
    else{
      fprintf(stderr, "do not know which oracle to use for state %s\n", current_state->name);
      exit(1);
    }

    mvt_type = mvt_tagset_get_type(classifier_get_output_tagset(classif), mvt_code);

    /* the feature vector is computed before the movement is applied */
    config2feat_vec_cff(classif->fm, c, classif->d_features, classif->fv, ctx->mode);

    if(ctx->debug_mode){
      config_print(stdout,c);
      mvt_tagset_print_mvt(stdout, classifier_get_output_tagset(classif), mvt_code);
      fprintf(stdout, "\n");
    }

    movement_apply(c, mvt_code, classifier_get_output_tagset(classif), root_label, machine);

    /* advance head in ref word buffer */
    if((mvt_type == MVT_RIGHT) || (mvt_type == MVT_SHIFT)){
      word_buffer_move_right(ref);
    }

    fprintf(output_file, "%d", current_state->classifier_nb);
    fprintf(output_file, "\t%d", mvt_code);
    feat_vec_print(output_file, classif->fv);

    /* count sentences and report progress every 100 of them */
    if(mvt_type == MVT_EOS){
      if((++sentence_nb % 100) == 0){
        fprintf(stderr, "\rsentence %d", sentence_nb);
      }
    }
  }
  fprintf(stderr, "\n");

  if(ctx->cff_filename){
    /* fclose flushes the stream: a failure here means lost output */
    if(fclose(output_file) != 0)
      fprintf(stderr, "error while closing output file %s\n", ctx->cff_filename);
  }

  /* the configuration is not used past this point, the input can be closed */
  if(mcf_file != NULL)
    fclose(mcf_file);
}
/*
 * Entry point.
 *
 * Reads and checks the options, loads the transition machine named by
 * ctx->tm_filename, links the mcd to the machine's tape dictionaries,
 * gives every classifier of the machine a fresh feature dictionary named
 * after the classifier, then calls generate_scf_file().
 * In TRAIN_MODE the feature dictionaries and the classifier description
 * files are printed afterwards.
 *
 * Returns 0 on success, 1 if the transition machine could not be loaded.
 */
int main(int argc, char *argv[])
{
  classifier *classif = NULL;
  context *ctx = context_read_options(argc, argv);
  int i;
  int name_len;
  tm *machine;
  char string[100];

  maca_tm_mcf2scf_check_options(ctx);

  machine = tm_load(ctx->tm_filename, ctx->maca_data_path, ctx->verbose);
  if(machine == NULL){
    /* machine is dereferenced right below */
    fprintf(stderr, "cannot load transition machine\n");
    return 1;
  }
  ctx->machine = machine;

  mcd_link_to_dico(ctx->mcd_struct, machine->d_tapes, ctx->verbose);

  /* create perceptron features dictionaries for all classifiers of the machine */
  for(i=0; i < machine->classif_vec->nb; i++){
    classif = machine->classif_vec->array[i];
    /* bounded copy: a long classifier name must not overflow the buffer */
    name_len = snprintf(string, sizeof string, "%s", classifier_get_name(classif));
    if(name_len < 0 || (size_t)name_len >= sizeof string)
      fprintf(stderr, "warning: classifier name truncated to \"%s\"\n", string);
    classifier_set_d_features(classif, dico_new(string, 1000000));
  }

  generate_scf_file(ctx);

  /* in train mode print all feature dictionaries that have been created as well as classifiers descriptions */
  if(ctx->mode == TRAIN_MODE){
    for(i=0; i < machine->classif_vec->nb; i++){
      classif = machine->classif_vec->array[i];
      classifier_print_d_features(classif);
      classifier_print_desc_file(classif->filename, classif);
    }
  }

  context_free(ctx);
  return 0;
}