Skip to content
Snippets Groups Projects

add first version of external-dependency-free graph parser

Merged Alexis Nasr requested to merge maca_graph_parser into master
2 files
+ 33
37
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 223
0
#include <string>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include"simple_parser.h"
extern "C" {
//#include <maca_macaon.h>
#include "maca_common.h"
#include "maca_constants.h"
#include "maca_graph_parser_sentence.h"
#include "maca_graph_parser_decoder1.h"
#include "maca_graph_parser_decoder2.h"
#include "maca_graph_parser_decoder.h"
#include "maca_graph_parser_model.h"
#include "maca_graph_parser_features.h"
#include "maca_graph_parser_dep_count_table.h"
#include "maca_graph_parser_feature_table.h"
}
/**
 * Parse one sentence with the graph parser held in ctx.
 *
 * words, tags and lemmas are parallel vectors (entry i of each describes the
 * same token). An empty lemma string is passed to the parser as code -1.
 *
 * On success one ParsedWord per token is appended to output (ids start at 1)
 * with its dependency label and governor index, and true is returned.
 * false is returned, and nothing is appended, when the three vectors differ
 * in length or when at least one POS tag is missing from the POS alphabet.
 */
bool macaon::Parser::ProcessSentence(const std::vector<std::string> &words,
                                     const std::vector<std::string> &tags,
                                     const std::vector<std::string> &lemmas,
                                     std::vector<ParsedWord>& output)
{
    /* The vectors are indexed in parallel below; refuse mismatched lengths
       instead of reading past the end of the shorter ones. */
    if(tags.size() != words.size() || lemmas.size() != words.size()){
        if(ctx->verbose_flag > 0){
            maca_msg(ctx->module, MACA_WARNING);
            fprintf(stderr, "words, tags and lemmas differ in length, skipping sentence\n");
        }
        return false;
    }

    maca_graph_parser_sentence *maca_s = maca_graph_parser_allocate_sentence(ctx);
    bool ill_formed = false;

    for(size_t i = 0; i < words.size(); i++){
        const int code_postag = maca_alphabet_get_code(ctx->pos_alphabet, const_cast<char *>(tags[i].c_str()));
        if(code_postag == -1){
            if(ctx->verbose_flag > 0){
                maca_msg(ctx->module, MACA_WARNING);
                fprintf(stderr,"pos %s unknown, skipping sentence\n", tags[i].c_str());
            }
            /* keep going so that every unknown tag of the sentence is reported */
            ill_formed = true;
        }
        const int code_lemma = !lemmas[i].empty()
            ? maca_alphabet_add_symbol(ctx->words_alphabet, const_cast<char *>(lemmas[i].c_str()))
            : -1;
        const int code_form = maca_alphabet_add_symbol(ctx->words_alphabet, const_cast<char *>(words[i].c_str()));
        const int code_label = 0;
        maca_graph_parser_sentence_add_word(ctx, maca_s, NULL, code_form, code_lemma, code_postag, -1, code_label, 0, NULL);
    }

    if(ill_formed){
        maca_graph_parser_free_sentence(maca_s);
        return false;
    }

    maca_graph_parser_decoder_parse(ctx, maca_s);

    /* Sentence position i corresponds to input token i-1, hence the loop
       starting at 1. The extra bound on words.size() guarantees the input
       vectors are never indexed out of range, whatever maca_s->l holds. */
    char buffer[128];
    for(size_t i = 1; i < (size_t) maca_s->l && i <= words.size(); i++){
        ParsedWord pw;
        pw.id = i;
        pw.word = words[i-1];
        pw.lemma = lemmas[i-1];
        pw.posTag = tags[i-1];
        /* start from an empty string so a lookup that writes nothing cannot
           leak the previous word's label (or uninitialised bytes) */
        buffer[0] = '\0';
        maca_alphabet_get_symbol(ctx->labels_alphabet, maca_s->label[i], buffer, sizeof(buffer));
        pw.dependencyLabel = buffer;
        pw.dependencyParent = maca_s->gov[i];
        output.push_back(pw);
    }

    maca_graph_parser_free_sentence(maca_s);
    return true;
}
/**
 * Build a parser: create the context from the given file names, load the
 * alphabets, the dependency count table and the model(s), then allocate the
 * working structures used for decoding.
 *
 * Any unrecoverable problem (context creation failure, missing alphabet file,
 * missing mandatory alphabet, model that could not be loaded) prints a
 * message on stderr and terminates the process with exit(1).
 *
 * The `order` argument is not read by this constructor; ctx->order is forced
 * to 2 when the loaded model uses sibling or grandchildren features.
 */
macaon::Parser::Parser(
    const char * cfg, /*!< config/language selected */
    int verbose_flag, /*!< verbose flag */
    const char *model_file_name,
    const char *alphabet_file_name,
    const char *dep_count_file_name,
    int order)
{
    /* The context loader takes a command line, so build an equivalent one. */
    const char* argv[] = {"macaon", "-C", cfg, "-m", model_file_name, "-a", alphabet_file_name, "-d", dep_count_file_name};
    int argc = sizeof(argv) / sizeof(argv[0]);
    ctx = maca_graph_parser_LoadCTX(argc, (char**) argv);
    if (ctx == NULL) {
        /* ctx->module is not available here, so only the plain message is printed */
        fprintf(stderr, "couldn't create the parser context!\n");
        exit(1);
    }
    ctx->verbose_flag = verbose_flag;

    /* alphabets */
    /* load alphabets */
    maca_alphabet_array *alpha_array = maca_alphabet_array_new_from_file(ctx->alphabet_file_name);
    if (alpha_array == NULL) {
        maca_msg(ctx->module, MACA_ERROR);
        fprintf(stderr, "couldn't open the alphabet file!\n");
        exit(1);
    }
    ctx->words_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_WORDS);
    ctx->words_nb = (ctx->words_alphabet != NULL) ? maca_alphabet_size(ctx->words_alphabet) : 0;
    ctx->labels_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_LABELS);
    ctx->labels_nb = (ctx->labels_alphabet != NULL) ? maca_alphabet_size(ctx->labels_alphabet) : 0;
    ctx->pos_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_POS);
    ctx->pos_nb = (ctx->pos_alphabet != NULL) ? maca_alphabet_size(ctx->pos_alphabet) : 0;
    ctx->morpho_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_MORPHO);
    ctx->morpho_nb = (ctx->morpho_alphabet != NULL) ? maca_alphabet_size(ctx->morpho_alphabet) : 0;
    maca_alphabet_array_delete(alpha_array);

    /* store special values in ctx and check that every necessary alphabet is loaded */
    /* NOTE(review): use_full_forms / use_lemmas are tested here before they are
       overwritten from the model further down - confirm this is intended. */
    if (ctx->use_full_forms || ctx->use_lemmas) {
        if (ctx->words_alphabet == NULL) {
            maca_msg(ctx->module, MACA_ERROR);
            fprintf(stderr, "missing the '" MACA_ALPHABET_WORDS "' alphabet in the alphabet file\n");
            exit(1);
        }
        ctx->w_start = maca_alphabet_get_code(ctx->words_alphabet, "__START__");
        ctx->w_end = maca_alphabet_get_code(ctx->words_alphabet, "__END__");
    }
    if (ctx->pos_alphabet == NULL) {
        maca_msg(ctx->module, MACA_ERROR);
        fprintf(stderr, "missing the '" MACA_ALPHABET_POS "' alphabet in the alphabet file\n");
        exit(1);
    }
    ctx->pos_start = maca_alphabet_get_code(ctx->pos_alphabet, "__START__");
    ctx->pos_end = maca_alphabet_get_code(ctx->pos_alphabet, "__END__");
    if (ctx->labels_alphabet == NULL) {
        maca_msg(ctx->module, MACA_ERROR);
        fprintf(stderr, "missing the '" MACA_ALPHABET_LABELS "' alphabet in the alphabet file\n");
        exit(1);
    }
    ctx->fct_joker = maca_alphabet_get_code(ctx->labels_alphabet, "__JOKER__");
    /* end alphabets */

    /* template library allocator needs: words_nb, pos_nb, labels_nb */
    ctx->e = maca_graph_parser_templ_library_allocator(ctx);

    /* load dep_count_table */
    ctx->dep_count_table = maca_graph_parser_dep_count_table_read(ctx, ctx->dep_count_table_file_name);

    /* model */
    //ctx->model = maca_graph_parser_model_mmap(ctx, ctx->model_file_name);
    ctx->model = maca_graph_parser_model_load(ctx, ctx->model_file_name);
    if (ctx->model == NULL) {
        /* the model is dereferenced just below; fail with a message instead of crashing */
        maca_msg(ctx->module, MACA_ERROR);
        fprintf(stderr, "couldn't load the model file!\n");
        exit(1);
    }

    /* model2 */
    if(ctx->model2_file_name != NULL){
        ctx->model2 = maca_graph_parser_model_load(ctx, ctx->model2_file_name);
    } else {
        ctx->model2 = NULL;
    }

    /* set active feature types for the decoder */
    ctx->min_dep_count = ctx->model->min_dep_count;
    ctx->use_lemmas = ctx->model->use_lemmas;
    ctx->use_full_forms = ctx->model->use_full_forms;
    ctx->basic_features = ctx->model->basic_features;
    ctx->first_features = ctx->model->first_features;
    ctx->grandchildren_features = ctx->model->grandchildren_features;
    ctx->sibling_features = ctx->model->sibling_features;
    if(ctx->sibling_features || ctx->grandchildren_features) ctx->order = 2;

    /* allocate feature table */
    if(ctx->store_in_feature_table){
        maca_graph_parser_feature_table_allocator(ctx);
    }

    ctx->s = maca_graph_parser_allocate_sentence(ctx);
    if(ctx->print_ctx) maca_graph_parser_print_ctx(ctx);
    loaded = true;
}
/**
 * Destroy the parser by handing its context to maca_graph_parser_free_all().
 */
macaon::Parser::~Parser()
{
    maca_graph_parser_free_all(ctx);
}
/* C interface to generic parser */
/**
 * C-style factory: heap-allocate a macaon::Parser, forwarding every argument
 * unchanged to its constructor. Release the result with Parser_free().
 */
macaon::Parser* Parser_new(char * cfg, int verbose_flag, char *model_file_name, char *alphabet_file_name, char *dep_count_file_name, int order) {
    macaon::Parser* instance = new macaon::Parser(cfg,
                                                  verbose_flag,
                                                  model_file_name,
                                                  alphabet_file_name,
                                                  dep_count_file_name,
                                                  order);
    return instance;
}
/**
 * C-style counterpart of Parser_new(): destroy a parser created with it.
 * Passing a null pointer is harmless, since `delete` on null does nothing.
 */
void Parser_free(macaon::Parser* parser)
{
    delete parser;
}
/**
 * C-style wrapper around macaon::Parser::ProcessSentence().
 *
 * words, tags and lemmas are arrays of num_words C strings; a NULL entry is
 * treated as an empty string. governors and labels are caller-provided arrays
 * of num_words entries that receive the result.
 *
 * On success governors[i] holds the governor index of token i and labels[i] a
 * strdup()'ed copy of its dependency label, which the caller must free().
 * When the sentence could not be parsed, false is returned and every
 * governors[i] is set to -1 and every labels[i] to NULL, so the caller can
 * free the labels unconditionally.
 */
bool Parser_ProcessSentence(macaon::Parser* parser, int num_words, char** words, char** tags, char** lemmas, int* governors, const char** labels) {
    std::vector<std::string> word_vector;
    std::vector<std::string> tag_vector;
    std::vector<std::string> lemma_vector;
    std::vector<macaon::ParsedWord> output;

    if(num_words > 0) {
        word_vector.reserve(num_words);
        tag_vector.reserve(num_words);
        lemma_vector.reserve(num_words);
    }
    for(int i = 0; i < num_words; i++) {
        /* building a std::string from a null pointer is undefined behaviour */
        word_vector.push_back(words[i] != NULL ? words[i] : "");
        tag_vector.push_back(tags[i] != NULL ? tags[i] : "");
        lemma_vector.push_back(lemmas[i] != NULL ? lemmas[i] : "");
    }

    bool result = parser->ProcessSentence(word_vector, tag_vector, lemma_vector, output);

    /* On failure output is empty, so it must not be indexed; the size check
       also protects against an unexpectedly short result. */
    if(!result || num_words < 0 || output.size() < (size_t) num_words) {
        for(int i = 0; i < num_words; i++) {
            governors[i] = -1;
            labels[i] = NULL;
        }
        return false;
    }

    for(int i = 0; i < num_words; i++) {
        governors[i] = output[i].dependencyParent;
        labels[i] = strdup(output[i].dependencyLabel.c_str()); // do not forget to free
    }
    return result;
}
Loading