/* maca_tm_decoder.c */
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<unistd.h>
#include<getopt.h>
#include<ctype.h>
#include"context.h"
#include"feat_fct.h"
#include"feature_table.h"
#include"dico.h"
#include"form2pos.h"
#include"simple_decoder_tagger.h"
#include"config2feat_vec.h"
#include"movements.h"
#include"config2feat_vec.h"
#include"dico.h"
#include"mcd.h"
#include"fplm.h"
/*
 * Print the usage text of maca_tm_decoder.
 *
 * The text is assembled by calling the per-topic help printers of the
 * context module one after the other; an "INPUT" heading is written to
 * stderr before the input-related topics.
 */
void maca_tm_decoder_help_message(context *ctx)
{
  /* topics displayed before the INPUT heading */
  context_general_help_message(ctx);
  context_beam_help_message(ctx);
  context_conll_help_message(ctx);

  fputs("INPUT\n", stderr);

  /* topics displayed under the INPUT heading */
  context_input_help_message(ctx);
  context_mcd_help_message(ctx);
  context_model_help_message(ctx);
  context_vocabs_help_message(ctx);
  context_features_model_help_message(ctx);
  context_f2p_filename_help_message(ctx);
}
/*
 * Validate the command line options stored in ctx.
 *
 * Only the help flag is examined: when it is set, the usage text is printed
 * and the process terminates with exit status 1. The checks on the individual
 * resource filenames (conll, perceptron model, mcd, vocabs, features model)
 * are currently disabled.
 */
void maca_tm_decoder_check_options(context *ctx)
{
  if(!ctx->help)
    return;

  maca_tm_decoder_help_message(ctx);
  exit(1);
}
/*
 * Concatenate data_path and filename into a freshly allocated string.
 *
 * The result is sized from its two inputs, so an arbitrarily long data path
 * cannot overflow a fixed-size buffer. The process exits with status 1 when
 * the memory cannot be obtained. The caller owns the returned string.
 */
static char *maca_tm_decoder_default_resource_path(const char *data_path, const char *filename)
{
  size_t dir_len = strlen(data_path);
  size_t name_len = strlen(filename);
  char *path = malloc(dir_len + name_len + 1);

  if(path == NULL){
    fprintf(stderr, "maca_tm_decoder: out of memory while building a resource path\n");
    exit(1);
  }
  memcpy(path, data_path, dir_len);
  memcpy(path + dir_len, filename, name_len + 1); /* copies the terminating '\0' too */
  return path;
}

/*
 * Fill in the resource filenames the user did not supply.
 *
 * Every missing filename (f2p lexicon, transition machine, fplm lexicon) is
 * set to ctx->maca_data_path followed by the matching DEFAULT_* name. When
 * the f2p filename is defaulted, the lexicon is read into ctx->f2p as well.
 * In verbose mode the resulting filenames are traced on stderr.
 *
 * NOTE(review): ctx->f2p is only read here when the default filename is used;
 * presumably a user-supplied f2p file is loaded elsewhere -- confirm.
 */
void maca_tm_decoder_set_linguistic_resources_filenames(context *ctx)
{
  if(!ctx->f2p_filename){
    ctx->f2p_filename = maca_tm_decoder_default_resource_path(ctx->maca_data_path, DEFAULT_F2P_FILENAME);
    ctx->f2p = form2pos_read(ctx->f2p_filename);
  }

  if(!ctx->tm_filename){
    ctx->tm_filename = maca_tm_decoder_default_resource_path(ctx->maca_data_path, DEFAULT_TRANS_MACHINE_TAGPARSER_FILENAME);
  }

  if(!ctx->fplm_filename){
    ctx->fplm_filename = maca_tm_decoder_default_resource_path(ctx->maca_data_path, DEFAULT_FPLM_FILENAME);
  }

  if(ctx->verbose){
    /* mcd_filename and input_filename receive no default in this function,
       and passing a NULL pointer to %s is undefined behaviour */
    fprintf(stderr, "tm_filename = %s\n", ctx->tm_filename);
    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
    fprintf(stderr, "mcf_filename = %s\n", ctx->input_filename ? ctx->input_filename : "(null)");
    fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
  }
}
/*
 * Concatenate data_path and filename into a freshly allocated string.
 *
 * The result is sized from its two inputs, so an arbitrarily long data path
 * cannot overflow a fixed-size buffer. The process exits with status 1 when
 * the memory cannot be obtained. The caller owns the returned string.
 */
static char *maca_tm_decoder_default_resource_path_old(const char *data_path, const char *filename)
{
  size_t dir_len = strlen(data_path);
  size_t name_len = strlen(filename);
  char *path = malloc(dir_len + name_len + 1);

  if(path == NULL){
    fprintf(stderr, "maca_tm_decoder: out of memory while building a resource path\n");
    exit(1);
  }
  memcpy(path, data_path, dir_len);
  memcpy(path + dir_len, filename, name_len + 1); /* copies the terminating '\0' too */
  return path;
}

/*
 * Former version of the resource filename set-up.
 *
 * Every missing filename (perceptron model, vocabularies, features model,
 * f2p lexicon) is set to ctx->maca_data_path followed by the matching
 * DEFAULT_* name. When the f2p filename is defaulted, the lexicon is read
 * into ctx->f2p as well. No default is computed for the mcd filename.
 * In verbose mode the resulting filenames are traced on stderr.
 */
void maca_tm_decoder_set_linguistic_resources_filenames_old(context *ctx)
{
  if(!ctx->perc_model_filename){
    ctx->perc_model_filename = maca_tm_decoder_default_resource_path_old(ctx->maca_data_path, DEFAULT_MODEL_TAGGER_FILENAME);
  }

  if(!ctx->vocabs_filename){
    ctx->vocabs_filename = maca_tm_decoder_default_resource_path_old(ctx->maca_data_path, DEFAULT_VOCABS_TAGGER_FILENAME);
  }

  if(!ctx->features_model_filename){
    ctx->features_model_filename = maca_tm_decoder_default_resource_path_old(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);
  }

  if(!ctx->f2p_filename){
    ctx->f2p_filename = maca_tm_decoder_default_resource_path_old(ctx->maca_data_path, DEFAULT_F2P_FILENAME);
    ctx->f2p = form2pos_read(ctx->f2p_filename);
  }

  if(ctx->verbose){
    fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
    fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
    /* mcd_filename receives no default in this function, and passing a NULL
       pointer to %s is undefined behaviour */
    fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename ? ctx->mcd_filename : "(null)");
    fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
    fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
  }
}
#if 1
/*
 * Give a signature to the words at the end of the buffer that have none yet.
 *
 * The buffer is walked from its last element backwards and the walk stops at
 * the first word whose signature is already different from -1. For each word
 * visited, the signature of its form is looked up in f2p; when that lookup
 * yields -1, the lookup is retried with the form converted by
 * to_lower_string().
 *
 * The converted copy is allocated to the exact size of the form, so a long
 * input token cannot overflow a fixed-size buffer.
 */
void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p)
{
  int i;

  for(i = word_buffer_get_nbelem(bf) - 1; i >= 0; i--){
    word *w = word_buffer_get_word_n(bf, i);
    size_t form_size;
    char *lower_form;

    if(word_get_signature(w) != -1) break;

    w->signature = form2pos_get_signature(f2p, w->form);
    if(w->signature != -1) continue;

    /* second chance: look the form up again after to_lower_string() */
    form_size = strlen(w->form) + 1;
    lower_form = malloc(form_size);
    if(lower_form == NULL){
      /* best effort: the signature of this word stays at -1 */
      fprintf(stderr, "add_signature_to_words_in_word_buffer: out of memory, signature left unset\n");
      continue;
    }
    memcpy(lower_form, w->form, form_size);
    to_lower_string(lower_form);
    w->signature = form2pos_get_signature(f2p, lower_form);
    free(lower_form);
  }
}
#endif
#if 0
/*
 * Variant of add_signature_to_words_in_word_buffer() that also sets the part
 * of speech of non ambiguous words.
 *
 * The buffer is walked from its last element backwards and the walk stops at
 * the first word whose signature is already different from -1. Each visited
 * word receives the signature of its form; when f2p reports the form as non
 * ambiguous, the code of the reported part of speech in dico_pos is stored
 * in the word.
 */
void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p, dico *dico_pos)
{
  int idx = word_buffer_get_nbelem(bf);

  while(--idx >= 0){
    word *current = word_buffer_get_word_n(bf, idx);
    char *unique_pos;

    if(word_get_signature(current) != -1)
      break;

    current->signature = form2pos_get_signature(f2p, current->form);

    if(form2pos_word_is_non_ambiguous(f2p, current->form, &unique_pos))
      word_set_pos(current, dico_string2int(dico_pos, unique_pos));
  }
}
#endif
/*
 * Print one tab-terminated column on stdout: the string associated with code
 * in dictionary d, or "_" when code is -1 or the dictionary has no string
 * for it.
 */
static void print_dico_column_or_placeholder(dico *d, int code)
{
  char *str = NULL;

  if(code != -1)
    str = dico_int2string(d, code);

  if(str != NULL)
    printf("%s\t", str);
  else
    printf("_\t");
}

/*
 * Print every word of the buffer of configuration c on stdout, one word per
 * line, as tab separated columns:
 *   input, pos, morphology, lemma, governor, label, sentence segmentation.
 * Symbolic columns are decoded with the dictionaries given as arguments and
 * shown as "_" when unavailable; the last column is 1 when the sentence
 * segmentation flag of the word equals 1, 0 otherwise.
 */
void print_word_buffer_tagparser(config *c, dico *dico_labels, dico *dico_pos, dico *dico_forms, dico *dico_morpho)
{
  int idx;

  for(idx = 0; idx < config_get_buffer(c)->nbelem; idx++){
    word *w = word_buffer_get_word_n(config_get_buffer(c), idx);

    printf("%s\t", word_get_input(w));
    print_dico_column_or_placeholder(dico_pos, word_get_pos(w));
    print_dico_column_or_placeholder(dico_morpho, word_get_feats(w));
    /* lemmas are decoded with the dictionary of forms */
    print_dico_column_or_placeholder(dico_forms, word_get_lemma(w));
    printf("%d\t", word_get_gov(w));
    print_dico_column_or_placeholder(dico_labels, word_get_label(w));
    printf("%s\n", (word_get_sent_seg(w) == 1) ? "1" : "0");
  }
}
/*
 * Run the transition machine over the input and print the analysed words.
 *
 * The input is read from ctx->input_filename when it is set, from stdin
 * otherwise. Until the configuration is terminal, each iteration:
 *   - adds signatures to the words of the buffer when ctx->f2p is available,
 *   - takes the forced transitions of the current state,
 *   - asks the classifier of the current state for its best movement and
 *     applies it,
 *   - when the configuration is in the tagger state after the movement,
 *     looks up in the fplm lexicon the lemma of the word returned by
 *     word_buffer_bm1() and stores its code in that word,
 *   - falls back to a SHIFT when the movement could not be applied, and
 *     empties the stack with ROOT movements when that SHIFT fails as well.
 * The buffer is finally printed with print_word_buffer_tagparser().
 *
 * NOTE(review): fplm and std_mvt_tagset are not released before returning;
 * the matching release functions are not visible from this file.
 */
void maca_tm_decoder(context *ctx)
{
  FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
  config *c = config_new(f, ctx->mcd_struct, 5);
  int mvt_code;
  int root_label;
  classifier *classif;
  dico *dico_pos = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"POS");
  dico *dico_forms = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"FORM");
  dico *dico_labels = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"LABEL");
  dico *dico_morpho = dico_vec_get_dico(tm_get_d_tapes(ctx->machine), (char *)"FEATS");
  int result;
  fplm_struct *fplm = fplm_load(ctx->fplm_filename, ctx->debug_mode);
  char form_str[200];
  char *pos_str;
  char *lemma_str;
  int lemma_code;
  tm_state *current_state = NULL;
  int parser_state_nb = tm_get_parser_state(ctx->machine);
  int tagger_state_nb = tm_get_tagger_state(ctx->machine);
  mvt_tagset *std_mvt_tagset = mvt_tagset_std();

  /* an unknown root label falls back to code 0 */
  root_label = dico_string2int(dico_labels, ctx->root_label);
  if(root_label == -1) root_label = 0;

  while(!config_is_terminal(c)){
    if(ctx->f2p)
      add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);

    while(tm_state_num_has_forced_transition(ctx->machine, c->current_state_nb)){
      tm_take_forced_transition(c, ctx->machine, std_mvt_tagset);
    }

    /* horrible trick : when at the end of buffer, skip tagger mode stay in parser mode */
    if((c->current_state_nb == tagger_state_nb) && word_buffer_end(config_get_buffer(c)))
      c->current_state_nb = parser_state_nb;

    current_state = ctx->machine->state_array[c->current_state_nb];
    classif = current_state->classif;
    mvt_code = classifier_argmax(classif, c);

    if(ctx->debug_mode){
      fprintf(stdout, "***********************************\n");
      fprintf(stdout, "%s ", ctx->machine->state_array[c->current_state_nb]->name);
      config_print(stdout, c);
      classifier_print_vcode_array(stdout, classif, c, 4);
    }

    result = movement_apply(c, mvt_code, classifier_get_output_tagset(classif), root_label, ctx->machine);

    /* in tagger state , look for lemma */
    if(c->current_state_nb == tagger_state_nb){
      /* it is bm1 rather than b0 because the machine changed state after applying the pos movement */
      word *tagged_word = word_buffer_bm1(config_get_buffer(c));

      word_sprint_col_n(form_str, tagged_word, mcd_get_form_col(ctx->mcd_struct));
      pos_str = dico_int2string(dico_pos, word_get_pos(tagged_word));
      lemma_code = -1;
      if(pos_str){
        lemma_str = fplm_lookup_lemma(fplm, form_str, pos_str, ctx->verbose);
        /* the contract of fplm_lookup_lemma is not visible here: keep the
           code at -1 rather than looking up a NULL string */
        if(lemma_str)
          lemma_code = dico_string2int(dico_forms, lemma_str);
      }
      word_set_lemma(tagged_word, lemma_code);
    }

    if(result == 0){
      if(ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n");
      mvt_code = mvt_tagset_get_code(classifier_get_output_tagset(classif), MVT_SHIFT, 0);
      /* the outcome of the SHIFT must be stored: testing the stale result of
         the failed movement made the test below always true, so the stack
         was emptied even when the SHIFT had succeeded */
      result = movement_apply(c, mvt_code, classifier_get_output_tagset(classif), -1, ctx->machine);
      if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! */
        if(ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n");
        while(!stack_is_empty(config_get_stack(c))){
          mvt_code = mvt_tagset_get_code(classifier_get_output_tagset(classif), MVT_ROOT, 0);
          movement_apply(c, mvt_code, classifier_get_output_tagset(classif), root_label, ctx->machine);
        }
      }
    }
  }

  print_word_buffer_tagparser(c, dico_labels, dico_pos, dico_forms, dico_morpho);

  config_free(c);
  /* stdin is left open, only a file opened here is closed */
  if(ctx->input_filename) fclose(f);
}
/*
 * Entry point: read the options, complete the resource filenames, load the
 * transition machine, link the multi column description to the dictionaries
 * of the machine, then decode the input.
 *
 * Returns 0 on success and 1 when the options or the transition machine are
 * not available.
 */
int main(int argc, char *argv[])
{
  context *ctx = context_read_options(argc, argv);

  /* NOTE(review): whether context_read_options can return NULL is not visible
     here; the guard avoids dereferencing it in that case */
  if(ctx == NULL){
    fprintf(stderr, "maca_tm_decoder: cannot read options\n");
    return 1;
  }

  maca_tm_decoder_check_options(ctx);
  maca_tm_decoder_set_linguistic_resources_filenames(ctx);

  ctx->machine = tm_load(ctx->tm_filename, ctx->maca_data_path, ctx->verbose);
  /* the machine is dereferenced right below and throughout the decoder.
     NOTE(review): whether tm_load reports failure by returning NULL is an
     assumption -- confirm */
  if(ctx->machine == NULL){
    fprintf(stderr, "maca_tm_decoder: cannot load transition machine %s\n", ctx->tm_filename);
    return 1;
  }

  mcd_link_to_dico(ctx->mcd_struct, ctx->machine->d_tapes, ctx->verbose);
  maca_tm_decoder(ctx);

  context_free(ctx);
  return 0;
}