// Select Git revision
// maca_trans_parser_export.cc
// Johannes Heinecke authored
// maca_trans_parser_export.cc 9.56 KiB
#include <cstdio>
#include <cstdlib>
#include <cstring>
//#ifdef __cplusplus
//extern "C"{
//#endif
#include "simple_decoder_parser_arc_eager.h"
#include "movement_parser_arc_eager.h"
#include "feat_fct.h"
#include "config2feat_vec.h"
#include "feature_table.h"
#include "dico.h"
//#ifdef __cplusplus
//}
//#endif
#include "maca_trans_parser_export.h"
/** Initialises the class variables and builds the parser context.
    resultstring: keeps the last result (NULL until the first parse)
    ctx: current context, created as if the command line
         "initParser -L <lg> -C <mcd>" had been given
    initOK: 1 after a successful initialisation, 0 when the "LABEL"
            dictionary could not be found in the vocabularies

    @param lg   value passed to the -L option
    @param mcd  value passed to the -C option
*/
MacaonTransParser::MacaonTransParser(char *lg, char *mcd) {
  resultstring = NULL;
  initOK = 1;

  /* fake command line: 5 arguments followed by a terminating 0 */
  char *argv[] = { (char *)"initParser",
                   (char *)"-L", lg,
                   (char *)"-C", mcd,
                   0
  };
  ctx = context_read_options(5, argv);

  /* fill in the resource filenames that were not given as options */
  set_linguistic_resources_filenames_parser(ctx);

  ctx->features_model = feat_model_read(ctx->features_model_filename, feat_lib_build(), ctx->verbose);
  ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
  mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);

  ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
  if (ctx->dico_labels == NULL) {
    fprintf(stderr, "cannot find label names\n");
    initOK = 0;
  }
  else {
    /* only touch the label dictionary when it exists: the previous code
       dereferenced it unconditionally and crashed right after reporting
       the error */
    ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3;
  }

  /* load models */
  ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
}
/** Releases the buffer holding the last result and then the parser context. */
MacaonTransParser::~MacaonTransParser() {
  /* free() accepts NULL, so no guard is required here */
  free(resultstring);
  resultstring = NULL;

  context_free(ctx);
}
const char *MacaonTransParser::parsemcf(const char *mcf) {
simple_decoder_parser_arc_eager_str(ctx, mcf);
//printf("rrr %s\n", resultstring);
//return "abcdef";
return resultstring;
}
/** Returns a freshly malloc'ed string holding dir followed by name.
    dir may be NULL (treated as empty). Returns NULL if the allocation fails.
    The exact length is allocated, so there is no fixed-size buffer to overflow. */
static char *join_data_path(const char *dir, const char *name) {
  const size_t dir_len = (dir != NULL) ? strlen(dir) : 0;
  const size_t name_len = strlen(name);
  char *path = (char *)malloc(dir_len + name_len + 1);
  if (path == NULL) {
    return NULL;
  }
  if (dir_len > 0) {
    memcpy(path, dir, dir_len);
  }
  memcpy(path + dir_len, name, name_len + 1); /* copies the terminating NUL too */
  return path;
}

/** Makes a possibly NULL string safe to pass to a "%s" conversion. */
static const char *printable_or_null(const char *s) {
  return (s != NULL) ? s : "(null)";
}

/** Adapted from maca_trans_parser.c, since this function is not in the
    libtransparse.a library.

    For the perceptron model, the vocabularies and the features model:
    when no filename is set in ctx, sets it to ctx->maca_data_path (if any)
    followed by the corresponding DEFAULT_*_FILENAME. Filenames that are
    already set are left untouched. The mcd filename gets no default here.
    In verbose mode the four filenames are printed to stderr.

    @param ctx  context whose filename fields are completed in place
*/
void MacaonTransParser::set_linguistic_resources_filenames_parser(context *ctx) {
  const char *data_path = ctx->maca_data_path;

  if (!ctx->perc_model_filename) {
    ctx->perc_model_filename = join_data_path(data_path, DEFAULT_MODEL_FILENAME);
  }

  if (!ctx->vocabs_filename) {
    ctx->vocabs_filename = join_data_path(data_path, DEFAULT_VOCABS_FILENAME);
  }

  /* no default for the mcd file:
     if(!ctx->mcd_filename){
       ctx->mcd_filename = join_data_path(data_path, DEFAULT_MULTI_COL_DESC_FILENAME);
     }*/

  if (!ctx->features_model_filename) {
    ctx->features_model_filename = join_data_path(data_path, DEFAULT_FEATURES_MODEL_FILENAME);
  }

  if (ctx->verbose) {
    /* mcd_filename has no default above and may be NULL; passing NULL to %s is undefined */
    fprintf(stderr, "perc_model_filename = %s\n", printable_or_null(ctx->perc_model_filename));
    fprintf(stderr, "vocabs_filename = %s\n", printable_or_null(ctx->vocabs_filename));
    fprintf(stderr, "mcd_filename = %s\n", printable_or_null(ctx->mcd_filename));
    fprintf(stderr, "perc_features_model_filename = %s\n", printable_or_null(ctx->features_model_filename));
  }
}
/** Taken from simple_decode_parser_arc_eager.c and modified in order to
    take an input string (in mcf format), which is read through a FILE * via fmemopen()
    instead of reading a file or stdin.
    It writes the result to a FILE * opened with open_memstream() in order to get the result in a char *.

    On return resultstring holds the printed word buffer. It is NULL when the
    input string could not be opened for reading or the output stream could not
    be created; the previous result is freed in every case.

    Debug and trace output (ctx->debug_mode / ctx->trace_mode) still goes to stdout.

    NOTE(review): the feature table is loaded from ctx->perc_model_filename and
    freed again on every call; caching it would need a new class member.

    @param ctx        parser context (label dictionary, feature model, mcd description, ...)
    @param mcfString  NUL-terminated input in mcf format
*/
void MacaonTransParser::simple_decoder_parser_arc_eager_str(context *ctx, const char *mcfString) {
  /* the stream is opened read-only, so dropping const here does not lead to writes */
  FILE *f = (mcfString != NULL)
              ? fmemopen(const_cast<char *>(mcfString), strlen(mcfString), "r")
              : NULL;
  if (f == NULL) {
    /* NULL input, or fmemopen() refused the buffer: report it and leave no stale result behind */
    fprintf(stderr, "simple_decoder_parser_arc_eager_str: cannot open input string for reading\n");
    free(resultstring);
    resultstring = NULL;
    return;
  }

  feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
  int root_label;
  int mvt_code;
  int mvt_type;
  int mvt_label;
  float max;
  feat_vec *fv = feat_vec_new(feature_types_nb);
  config *c = NULL;
  int result;
  /* float entropy; */
  /* float delta; */
  int argmax1, argmax2;
  float max1, max2;
  int index;

  /* fall back to label 0 when the root label is not in the dictionary */
  root_label = dico_string2int(ctx->dico_labels, ctx->root_label);
  if(root_label == -1) root_label = 0;

  c = config_new(f, ctx->mcd_struct, 5);

  while(!config_is_terminal(c)){
    if(ctx->debug_mode){
      fprintf(stdout, "***********************************\n");
      config_print(stdout, c);
    }

    /* forced EOS (the element on the top of the stack is eos, but the preceding movement is not MVT_PARSER_EOS */
    /* which means that the top of the stack got its eos status from input */
    /* force the parser to finish parsing the sentence (perform all pending reduce actions) and determine root of the sentence */
    if((word_get_sent_seg(stack_top(config_get_stack(c))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(c))) != MVT_PARSER_EOS)){
      word_set_sent_seg(stack_top(config_get_stack(c)), -1);
      movement_parser_eos(c);
      while(movement_parser_reduce(c));
      while(movement_parser_root(c, root_label));
      if(ctx->debug_mode) printf("force EOS\n");
    }
    /* normal behaviour, ask classifier what is the next movement to do and do it */
    else{
      config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
      mvt_code = feature_table_argmax(fv, ft, &max);

      if(ctx->debug_mode){
        /* print the first three entries of the vcode array with their scores */
        vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
        for(int i=0; i < 3; i++){
          printf("%d\t", i);
          movement_parser_print(stdout, vcode_array[i].class_code, ctx->dico_labels);
          printf("\t%.4f\n", vcode_array[i].score);
        }
        free(vcode_array);
      }

      if(ctx->trace_mode){
        /* one trace line: index of b0, stack, chosen movement, score margin between the two best classes */
        index = word_get_index(word_buffer_b0(config_get_buffer(c)));
        fprintf(stdout, "%d\t", index);
        stack_print(stdout, c->st);
        fprintf(stdout, "\t");
        movement_parser_print(stdout, mvt_code, ctx->dico_labels);
        fprintf(stdout, "\t");
        feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2);
        printf("%f\n", max1 - max2);
      }

      mvt_type = movement_parser_type(mvt_code);
      mvt_label = movement_parser_label(mvt_code);

      /* result stays 0 when the movement type is unknown or the movement fails */
      result = 0;
      switch(mvt_type){
      case MVT_PARSER_LEFT :
        result = movement_parser_left_arc(c, mvt_label);
        break;
      case MVT_PARSER_RIGHT:
        result = movement_parser_right_arc(c, mvt_label);
        break;
      case MVT_PARSER_REDUCE:
        result = movement_parser_reduce(c);
        break;
      case MVT_PARSER_ROOT:
        result = movement_parser_root(c, root_label);
        break;
      case MVT_PARSER_EOS:
        result = movement_parser_eos(c);
        break;
      case MVT_PARSER_SHIFT:
        result = movement_parser_shift(c);
      }

      if(result == 0){
        if(ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n");
        result = movement_parser_shift(c);
        if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! */
          if(ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n");
          while(!stack_is_empty(config_get_stack(c)))
            movement_parser_root(c, root_label);
        }
      }
    }
  }

  /* replace the previous result by the newly printed word buffer */
  free(resultstring);
  resultstring = NULL;

  size_t size = 0;
  FILE *outstream = open_memstream(&resultstring, &size);
  if(outstream == NULL){
    /* cannot build the result: leave resultstring NULL so callers can detect the failure */
    fprintf(stderr, "simple_decoder_parser_arc_eager_str: cannot create output stream\n");
    resultstring = NULL;
  }
  else{
    print_word_buffer_fp(c, ctx->dico_labels, ctx->mcd_struct, outstream);
    /* closing the stream finalises the buffer pointed to by resultstring */
    fclose(outstream);
  }

  config_free(c);
  feat_vec_free(fv);
  feature_table_free(ft);
  fclose(f);
}
/** Taken from simple_decode_parser_arc_eager.c and modified in order to write to any FILE*, not only stdout.

    Prints one line per word of the configuration's word buffer.
    - When the mcd describes neither a governor, a label nor a sentence
      segmentation column (all three lookups yield -1), the line is:
      input, governor, label (or "_"), sentence segmentation flag (1/0),
      each followed by a tab except the flag, which ends the line.
    - Otherwise the tab-separated columns of the word's input are walked:
      the governor, label and segmentation columns are replaced by the parser's
      values, the others are printed with word_print_col_n(). Values whose
      column lies beyond the input columns (or is -1) are appended at the end.

    @param c            configuration whose word buffer is printed
    @param dico_labels  dictionary used to turn label codes into strings
    @param mcd_struct   column description of the input
    @param out          stream written to
*/
void MacaonTransParser::print_word_buffer_fp(config *c, dico *dico_labels, mcd *mcd_struct, FILE *out) {
  /* column positions looked up once; they are compared against -1 below */
  const int gov_col = mcd_get_gov_col(mcd_struct);
  const int label_col = mcd_get_label_col(mcd_struct);
  const int seg_col = mcd_get_sent_seg_col(mcd_struct);
  const bool no_parse_columns = (gov_col == -1) && (label_col == -1) && (seg_col == -1);

  for (int n = 0; n < config_get_buffer(c)->nbelem; n++) {
    word *w = word_buffer_get_word_n(config_get_buffer(c), n);

    /* label string of this word, NULL when the word carries no label (-1) */
    char *label = NULL;
    if (word_get_label(w) != -1) {
      label = dico_int2string(dico_labels, word_get_label(w));
    }
    const bool ends_sentence = (word_get_sent_seg(w) == 1);

    if (no_parse_columns) {
      fprintf(out, "%s\t", word_get_input(w));
      fprintf(out, "%d\t", word_get_gov(w));
      if (label != NULL) {
        fprintf(out, "%s\t", label);
      } else {
        fprintf(out, "_\t");
      }
      if (ends_sentence) {
        fprintf(out, "1\n");
      } else {
        fprintf(out, "0\n");
      }
      continue;
    }

    /* strtok() modifies its argument, so work on a private copy of the input */
    char *buffer = strdup(w->input);
    int col_nb = 0;
    for (char *token = strtok(buffer, "\t"); token != NULL; token = strtok(NULL, "\t"), col_nb++) {
      if (col_nb != 0) {
        fprintf(out, "\t");
      }
      if (col_nb == gov_col) {
        fprintf(out, "%d", word_get_gov(w));
      } else if (col_nb == label_col) {
        if (label != NULL) {
          fprintf(out, "%s", label);
        } else {
          fprintf(out, "_");
        }
      } else if (col_nb == seg_col) {
        if (ends_sentence) {
          fprintf(out, "1");
        } else {
          fprintf(out, "0");
        }
      } else {
        word_print_col_n(out, w, col_nb);
      }
    }

    /* append the values whose column was not reached while walking the input */
    if ((gov_col == -1) || (col_nb <= gov_col)) {
      fprintf(out, "\t%d", word_get_gov(w));
    }
    if ((label_col == -1) || (col_nb <= label_col)) {
      if (label != NULL) {
        fprintf(out, "\t%s", label);
      } else {
        fprintf(out, "\t_");
      }
    }
    if ((seg_col == -1) || (col_nb <= seg_col)) {
      if (ends_sentence) {
        fprintf(out, "\t1");
      } else {
        fprintf(out, "\t0");
      }
    }
    fprintf(out, "\n");
    free(buffer);
  }
}