Skip to content
Snippets Groups Projects
Commit 903e8c2f authored by Johannes Heinecke's avatar Johannes Heinecke
Browse files

wrapping for usage with python or java

parent 73cea196
No related branches found
No related tags found
1 merge request!5Johannes: wrapping to use macaon_trans_tagger, macaon_trans_lemmatizer and macaon_trans_parser within python or java programmes
cmake_minimum_required(VERSION 2.8.7)
project(macaon2)

# Warnings are enabled for every target of the tree.
# Note for gcc < 5.3: the C dialect used to be forced as well, i.e.
#   add_definitions("-Wall -std=gnu11")
add_definitions(-Wall)

# Optional wrappers for Python/Java (generated with SWIG).
# Enable them with:  cmake -DMACA_EXPORT=TRUE ..
if(MACA_EXPORT)
  # The wrappers are shared libraries, so all C/C++ code they link against
  # has to be compiled as position independent code (-fPIC).
  set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
endif()

# Headers shared by all components.
include_directories(
  maca_common/include
  perceptron/lib/include
)
......@@ -13,4 +23,8 @@ add_subdirectory(perceptron)
# Components that are always built.
add_subdirectory(maca_trans_parser)
add_subdirectory(maca_crf_tagger)
# The Python/Java wrappers are only configured on request
# (cmake -DMACA_EXPORT=TRUE ..).
if(MACA_EXPORT)
add_subdirectory(maca_export)
endif()
#set(CMAKE_INSTALL_PREFIX ../)
......@@ -14,7 +14,10 @@ The basic procedure to build and install macaon from sources is the following.
cmake -DCMAKE_BUILD_TYPE=Debug ..
If you want to install macaon locally, you can specify the install path with :
cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir
cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir ..
If you want to build the wrapper libraries for use from Python or Java (requires SWIG >= 3.0), run:
cmake -DMACA_EXPORT=TRUE ..
- Build the sources with:
make
......
# Builds the SWIG wrappers that expose the macaon tagger, lemmatizer and parser
# to Python (shared module "_Macaon") and to Java (shared library "MacaonJava"
# plus "macaon.jar"). Only active when configured with -DMACA_EXPORT=TRUE.
if(MACA_EXPORT)
  find_package(SWIG 3.0)
  if(SWIG_FOUND)
    # Settings shared by the Python and the Java wrapper. They are defined
    # here (and not inside the Python branch) so that the Java wrapper still
    # builds when the Python development files are missing.
    set(MACA_EXPORT_INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/src/maca_export.i)
    set(MACA_EXPORT_SOURCES
      src/maca_trans_tagger_export.cc
      src/maca_lemmatizer_export.cc
      src/maca_trans_parser_export.cc
    )
    include_directories(../maca_trans_parser/src)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

    # ---------------------------------------------------------------- Python
    find_package(PythonLibs)
    if(PYTHONLIBS_FOUND)
      set(PYTHON_MODULE_NAME Macaon)
      set(PYTHON_WRAPPER ${CMAKE_CURRENT_BINARY_DIR}/maca_export_py.cc)
      include_directories(${PYTHON_INCLUDE_DIRS})

      # Generate the C++ glue code and Macaon.py. Absolute paths are used so
      # that the rule does not depend on where the build directory lives.
      add_custom_command(
        OUTPUT ${PYTHON_WRAPPER}
        COMMAND ${SWIG_EXECUTABLE} -python -c++
                -outdir ${CMAKE_CURRENT_BINARY_DIR}
                -o ${PYTHON_WRAPPER}
                ${MACA_EXPORT_INTERFACE}
        DEPENDS ${MACA_EXPORT_INTERFACE}
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
        COMMENT "Generating python wrapper with swig"
        VERBATIM
      )

      add_library(${PYTHON_MODULE_NAME} SHARED
        ${PYTHON_WRAPPER}
        ${MACA_EXPORT_SOURCES}
      )
      # The generated Macaon.py imports the extension module as "_Macaon".
      set_target_properties(${PYTHON_MODULE_NAME} PROPERTIES PREFIX _)
      target_link_libraries(${PYTHON_MODULE_NAME} transparse maca_common perceptron)
    else()
      message("pythonlibs not installed on your system")
    endif()

    # ------------------------------------------------------------------ Java
    find_package(Java 1.7)
    find_package(JNI)
    if(JNI_FOUND AND Java_FOUND)
      set(JAVA_MODULE_NAME Macaon)
      set(JAVA_LIBRARY MacaonJava)
      set(JAVA_CLASS_TAGGER MacaonTransTagger)
      set(JAVA_CLASS_LEMMATIZER MacaonTransLemmatizer)
      set(JAVA_CLASS_TRANSPARSER MacaonTransParser)
      set(JAVA_PACKAGE lif)
      set(JAR_FILENAME macaon)
      set(JAVA_WRAPPER ${CMAKE_CURRENT_BINARY_DIR}/maca_export_java.cc)
      # Hand-written Java sources can be added to the jar through this list:
      #set(ADDITIONNAL_JAVA_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/MacaonTransParserWrapper.java)

      include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})

      # Generate the JNI glue code and the Java proxy classes.
      add_custom_command(
        OUTPUT ${JAVA_WRAPPER}
        COMMAND ${SWIG_EXECUTABLE} -java -package ${JAVA_PACKAGE} -c++
                -outdir ${CMAKE_CURRENT_BINARY_DIR}
                -o ${JAVA_WRAPPER}
                ${MACA_EXPORT_INTERFACE}
        DEPENDS ${MACA_EXPORT_INTERFACE}
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
        COMMENT "Generating java wrapper with swig"
        VERBATIM
      )

      add_library(${JAVA_LIBRARY} SHARED
        ${JAVA_WRAPPER}
        ${MACA_EXPORT_SOURCES}
      )
      target_link_libraries(${JAVA_LIBRARY} transparse maca_common perceptron)

      # Package name -> directory of the compiled classes (a.b -> a/b).
      string(REGEX REPLACE "[.]" "/" JAVA_PACKAGE_DIR ${JAVA_PACKAGE})

      # Compile the generated Java classes and pack them into the jar once the
      # native library has been built. (The TARGET form of add_custom_command
      # does not accept DEPENDS, so none is given here.)
      add_custom_command(
        TARGET ${JAVA_LIBRARY} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/${JAVA_PACKAGE_DIR}
        COMMAND ${Java_JAVAC_EXECUTABLE} -encoding utf8 -d ${CMAKE_CURRENT_BINARY_DIR}
                ${CMAKE_CURRENT_BINARY_DIR}/${JAVA_MODULE_NAME}JNI.java
                ${CMAKE_CURRENT_BINARY_DIR}/${JAVA_CLASS_TAGGER}.java
                ${CMAKE_CURRENT_BINARY_DIR}/${JAVA_CLASS_LEMMATIZER}.java
                ${CMAKE_CURRENT_BINARY_DIR}/${JAVA_CLASS_TRANSPARSER}.java
                ${CMAKE_CURRENT_BINARY_DIR}/${JAVA_MODULE_NAME}.java
                ${ADDITIONNAL_JAVA_FILES}
        COMMAND ${Java_JAR_EXECUTABLE} -cvf ${JAR_FILENAME}.jar -C ${CMAKE_CURRENT_BINARY_DIR} ${JAVA_PACKAGE_DIR}
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
        COMMENT "Building ${JAR_FILENAME}"
        VERBATIM
      )
    else()
      message("Java JNI support not installed")
    endif()
  else()
    message("swig >= 3.0 not installed on your system")
  endif()
endif()
import lif.*;
/** example to use the macaon parser with java
compile (in maca_data2)
javac -cp ../macaon2/build_debug/maca_export/macaon.jar ../macaon2/maca_export/example/example.java
run
java -cp ../macaon2/build_debug/maca_export/macaon.jar:../macaon2/maca_export/example -Djava.library.path=../macaon2/build_debug/maca_export/ example
*/
/**
 * Minimal demo of the macaon Java wrapper: tags, lemmatizes and parses one
 * French sentence and prints the output of each stage.
 */
public class example {
    public static void main(String []args) {
        // Loads the native wrapper library "MacaonJava"
        // (the file is libMacaonJava.so on Linux).
        System.loadLibrary("MacaonJava");

        // The input is in mcf format with a single column: one token per line.
        // Stages can also be fed pre-annotated input with more columns
        // (form, POS, lemma), e.g. "La D le\npose N pose\nd' P de\n...".
        String[] tokens = {
            "La", "pose", "d'", "un", "panneau", "stop", "paraît", "être",
            "la", "formule", "la", "mieux", "adaptée", "pour", "assurer",
            "la", "sécurité", "des", "usagers", "."
        };
        StringBuilder mcf1 = new StringBuilder();
        for (String token : tokens) {
            mcf1.append(token).append("\n");
        }

        // All three tools use the same language directory and column description.
        final String language = "jh-seq";
        final String mcdFile = "jh-seq/eval/wplgfs.mcd";
        MacaonTransTagger mt = new MacaonTransTagger(language, mcdFile);
        MacaonTransLemmatizer ml = new MacaonTransLemmatizer(language, mcdFile);
        MacaonTransParser mp = new MacaonTransParser(language, mcdFile);

        // Pipeline: each stage consumes the output of the previous one.
        String tags = mt.tagmcf(mcf1.toString());
        System.out.println(tags);

        String lemmas = ml.lemmatizemcf(tags);
        System.out.println(lemmas);

        String parsed = mp.parsemcf(lemmas);
        System.out.println(parsed);
    }
}
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Example for the macaon python wrapper: sets up the module search path,
# creates the three tools and defines some sample inputs in mcf format.
import os
currentdir = os.path.dirname(os.path.abspath(__file__))
import sys
# The wrapper module is looked up in the "build_debug" build directory,
# relative to the location of this script.
sys.path.append(currentdir + "/../../build_debug/maca_export")
import Macaon
# for this example you should be in maca_data2
# (the language directory and the .mcd file below are relative paths)
mt = Macaon.MacaonTransTagger("jh-seq", "jh-seq/eval/wplgfs.mcd")
ml = Macaon.MacaonTransLemmatizer("jh-seq", "jh-seq/eval/wplgfs.mcd")
mp = Macaon.MacaonTransParser("jh-seq", "jh-seq/eval/wplgfs.mcd")
# Raw input: one token per line. This is the input of the pipeline below.
mcf="""La
grosse
souris
verte
a
mangé
le
bon
fromage
hier
soir"""
# Two columns per token: form and part of speech
# (only referenced by a commented-out lemmatizer call).
mcf0 ="""La D
pose N
d' P
un D
panneau N
stop N
paraît V"""
# Three columns per token: form, part of speech and lemma
# (only referenced by a commented-out parser call).
mcf1 = """La D le
pose N pos
d' P de
un D un
panneau N panneau
stop N stop
paraît V paraître
être V être
la D le
formule N formule
la D le
mieux ADV mieux
adaptée A adapté
pour P pour
assurer V assurer
la D le
sécurité N sécurité
des P+D de
usagers N usager
. PONCT ."""
# Second three-column sample (form, part of speech, lemma),
# also only referenced by a commented-out parser call.
mcf2 = """Une D un
réflexion N réflexion
commune A commun
est V être
menée V mener
avec P avec
les D le
enseignants N enseignant
et C et
les D le
délégués N délégué
de P de
parents N parent
d' P de
élèves N élève
, PONCT ,
sous P sous
la D le
conduite N conduite
du P+D de
CAUE N CAUE
. PONCT ."""
# Single stages can be tried on the pre-annotated samples:
#print(mp.parsemcf(mcf1))
#print(mp.parsemcf(mcf2))
#print(ml.lemmatizemcf(mcf0))

# Full pipeline: tagger -> lemmatizer -> parser, each stage consuming the
# output of the previous one. print() with a single argument behaves the same
# on Python 2 and Python 3, whereas the print statement only works on Python 2.
tags = mt.tagmcf(mcf)
print(tags)
lemmas = ml.lemmatizemcf(tags)
print(lemmas)
print(mp.parsemcf(lemmas))
# Drop the references so that the wrapped objects can be released.
del mp
del ml
del mt
/* SWIG interface of the macaon wrappers. The module is called "Macaon" and
   exposes the public part of the three export classes. */
%module Macaon
/* The headers below are copied verbatim into the generated wrapper code. */
%{
#include <maca_trans_tagger_export.h>
#include <maca_lemmatizer_export.h>
#include <maca_trans_parser_export.h>
%}
/* Part-of-speech tagger: lg is the language, mcd the column description file.
   tagmcf() takes and returns text in mcf format. */
class MacaonTransTagger {
public:
MacaonTransTagger(char *lg, char *mcd);
~MacaonTransTagger();
const char *tagmcf(const char *mcf);
};
/* Lemmatizer: same constructor arguments as the tagger.
   lemmatizemcf() takes and returns text in mcf format. */
class MacaonTransLemmatizer {
public:
MacaonTransLemmatizer(char *lg, char *mcd);
~MacaonTransLemmatizer();
const char *lemmatizemcf(const char *mcf);
};
/* Dependency parser: same constructor arguments as the tagger.
   parsemcf() takes and returns text in mcf format. */
class MacaonTransParser {
public:
MacaonTransParser(char *lg, char *mcd);
~MacaonTransParser();
const char *parsemcf(const char *mcf);
};
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "context.h"
#include "feat_fct.h"
#include "config2feat_vec.h"
#include "feature_table.h"
#include "dico.h"
#ifdef __cplusplus
}
#endif
#include "maca_lemmatizer_export.h"
/** Creates a lemmatizer.
    The arguments are handed to context_read_options() as if the program had
    been started with "-L <lg> -C <mcd>"; afterwards the form/pos/lemma (fplm)
    lexicon is loaded into form_pos_ht and lemma_array.
    @param lg language to be used
    @param mcd the filename of the mcd definitions
*/
MacaonTransLemmatizer::MacaonTransLemmatizer(char *lg, char *mcd) {
    /* initialise the plain members first: initOK is public and must never be
       read uninitialised, and the destructor relies on the other two */
    initOK = 1;
    resultstring = NULL;
    lemma_array_size = 0;

    char *argv[] = { (char *)"initParser",
                     (char *)"-L", lg,
                     (char *)"-C", mcd,
                     0
    };
    ctx = context_read_options(5, argv);

    form_pos_ht = hash_new(1000000);

    //maca_lemmatizer_check_options(ctx);
    maca_lemmatizer_set_linguistic_resources_filenames(ctx);
    lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode, &lemma_array_size);
    if (lemma_array == NULL) {
        /* without the lexicon the object is not usable */
        initOK = 0;
    }
}
/** Releases the hash table, every lemma string together with the array that
    holds them, the context and the last result string. */
MacaonTransLemmatizer::~MacaonTransLemmatizer() {
    hash_free(form_pos_ht);

    /* free(NULL) is a no-op, so unused (NULL) slots need no special case */
    int remaining = lemma_array_size;
    while (remaining > 0) {
        --remaining;
        free(lemma_array[remaining]);
    }
    free(lemma_array);

    context_free(ctx);

    free(resultstring);
    resultstring = NULL;
}
/** Lemmatizes a text given in mcf format.
    The input string is read through a FILE * created with fmemopen() and the
    output is collected with open_memstream(). For every word the lemma found
    in the input is kept; if it is empty or "_" the lemma is looked up with
    the form and the part of speech.
    @param mcfString the text in mcf format
    @return the lemmatized text; the string is owned by this object and stays
            valid until the next call or until the object is destroyed.
            NULL if mcfString is NULL or if the memory streams cannot be created.
*/
const char *MacaonTransLemmatizer::lemmatizemcf(const char *mcfString) {
    /* forget the previous result first, a failing call must not return it */
    if (resultstring != NULL) {
        free(resultstring);
        resultstring = NULL;
    }
    if (mcfString == NULL) return NULL;

    FILE *f = fmemopen((void *)mcfString, strlen(mcfString), "r");
    if (f == NULL) return NULL; /* e.g. some C libraries refuse a buffer of size 0 */

    size_t size;
    FILE *outstream = open_memstream(&resultstring, &size);
    if (outstream == NULL) {
        fclose(f);
        resultstring = NULL;
        return NULL;
    }

    config *c = config_new(f, ctx->mcd_struct, 5);
    char lemma[200];
    char form[200];
    char pos[200];

    while (!config_is_terminal(c)) {
        word *b0 = word_buffer_b0(c->bf);
        word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
        word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
        word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));

        /* if lemma is not specified in input it is looked up */
        if (strlen(lemma) && strcmp(lemma, "_"))
            print_word(b0, ctx->mcd_struct, lemma, outstream);
        else
            print_word(b0, ctx->mcd_struct, lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose), outstream);

        word_buffer_move_right(c->bf);
    }

    /* closing the memory stream finalises resultstring */
    fclose(outstream);
    /* the configuration reads from f, so it is released before f is closed */
    config_free(c);
    fclose(f);
    return resultstring;
}
/** Writes one word to outstream with its lemma column set to lemma.
    Without a lemma column in the mcd, the lemma is appended to the unchanged
    input line. Otherwise the tab separated columns of the input line are
    written one by one, the lemma column being replaced (or appended if the
    input line has fewer columns). */
void MacaonTransLemmatizer::print_word(word *w, mcd *mcd_struct, char *lemma, FILE *outstream) {
    const int lemma_col = mcd_get_lemma_col(mcd_struct);

    if (lemma_col == -1) {
        fprintf(outstream, "%s\t%s\n", w->input, lemma);
        return;
    }

    /* strtok() modifies its argument, so the columns are counted on a copy */
    char *line_copy = strdup(w->input);
    int column = 0;
    for (char *field = strtok(line_copy, "\t"); field != NULL; field = strtok(NULL, "\t")) {
        if (column > 0) fprintf(outstream, "\t");

        if (column == lemma_col)
            fprintf(outstream, "%s", lemma);
        else
            word_print_col_n(outstream, w, column);

        ++column;
    }

    /* the input line ended before the lemma column */
    if (column <= lemma_col)
        fprintf(outstream, "\t%s", lemma);

    fprintf(outstream, "\n");
    free(line_copy);
}
// adapted from maca_lemmatizer.c
/** Sets ctx->fplm_filename to <maca_data_path><DEFAULT_FPLM_FILENAME> unless
    a filename has already been set. The path is built in a buffer of exactly
    the needed size (no fixed-size buffer that a long data path could
    overflow); a missing maca_data_path is treated as an empty prefix. */
void MacaonTransLemmatizer::maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) {
    if (!ctx->fplm_filename) {
        const char *data_dir = ctx->maca_data_path ? ctx->maca_data_path : "";
        const size_t dir_len = strlen(data_dir);
        const size_t name_len = strlen(DEFAULT_FPLM_FILENAME);

        char *joined = (char *)malloc(dir_len + name_len + 1);
        if (joined != NULL) {
            memcpy(joined, data_dir, dir_len);
            /* name_len + 1: copies the terminating '\0' as well */
            memcpy(joined + dir_len, DEFAULT_FPLM_FILENAME, name_len + 1);
        }
        ctx->fplm_filename = joined;
    }

    if (ctx->verbose) {
        fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename ? ctx->fplm_filename : "(null)");
    }
}
/** Loads a form/pos/lemma/morpho (fplm) lexicon.
    Every valid line (four tab separated fields) gets the next free index:
    the key "form/pos" is added to form_pos_ht with that index and the lemma
    is stored at that index of the returned array. Invalid lines are skipped.
    @param fplm_filename the lexicon file
    @param form_pos_ht hash table receiving the "form/pos" keys
    @param debug_mode if set, skipped lines are reported on stderr
    @param lemma_array_size receives the allocated size (not the number of
           entries) of the returned array
    @return the array of lemmas; unused slots are NULL, so that every slot can
            be passed to free()
*/
char **MacaonTransLemmatizer::read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size) {
    char form[1000];
    char pos[1000];
    char lemma[1000];
    char morpho[1000];
    char key[2000]; /* form + '/' + pos + '\0': at most 999 + 1 + 999 + 1 */
    char buffer[10000];
    int num = 0;

    *lemma_array_size = 10000;
    FILE *f = myfopen(fplm_filename, "r");
    char **lemma_array = (char **)memalloc((*lemma_array_size) * sizeof(char *));

    /* the destructor frees every slot up to lemma_array_size, so slots which
       are never filled must be NULL and not uninitialised memory */
    for (int i = 0; i < *lemma_array_size; ++i) {
        lemma_array[i] = NULL;
    }
    if (f == NULL) return lemma_array;

    while (fgets(buffer, sizeof(buffer), f)) {
        /* start from empty fields, a short line is printed in debug mode */
        form[0] = pos[0] = lemma[0] = morpho[0] = '\0';
        /* the field widths keep sscanf() inside the 1000 byte buffers */
        int fields_nb = sscanf(buffer, "%999[^\t]\t%999s\t%999[^\t]\t%999s\n", form, pos, lemma, morpho);
        if (fields_nb != 4) {
            if (debug_mode) {
                fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
                fprintf(stderr, "incorrect fplm entry, skipping it\n");
            }
            continue;
        }

        /* grow the array before the entry is registered in the hash table,
           so that the table never refers to a slot which does not exist */
        if (num >= *lemma_array_size) {
            const int new_size = 2 * (*lemma_array_size) + 1;
            char **grown = (char **)realloc(lemma_array, new_size * sizeof(char *));
            if (grown == NULL) {
                fprintf(stderr, "out of memory while reading %s, %d entries loaded\n", fplm_filename, num);
                break;
            }
            lemma_array = grown;
            // initialize in order to be able to free correctly at the end
            for (int i = num; i < new_size; ++i) {
                lemma_array[i] = NULL;
            }
            *lemma_array_size = new_size;
        }

        snprintf(key, sizeof(key), "%s/%s", form, pos);
        // TODO: memory leak: if the key is already in the hash, it is not added and the memory
        // allocated by strdup() is leaked
        // solutions: hash_add does the strdup() if necessary (check elsewhere !)
        // or return code to indicate whether the key has been added or not
        hash_add(form_pos_ht, strdup(key), num);

        lemma_array[num] = strdup(lemma);
        num++;
    }
    fclose(f);
    return lemma_array;
}
/** Looks up the lemma of a form with a given part of speech.
    The key "form/pos" is tried first with the form as given, then with the
    form changed to lower case.
    @return the lemma stored in lemma_array, or form itself if neither key is
            in the hash table
*/
char *MacaonTransLemmatizer::lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose) {
    char key[1000];

    /* attempt 0: form as given, attempt 1: form in lower case */
    for (int attempt = 0; attempt < 2; ++attempt) {
        strcpy(key, form);
        if (attempt == 1)
            to_lower_string(key);
        strcat(key, "/");
        strcat(key, pos);

        const int entry = hash_get_val(form_pos_ht, key);
        if (entry != HASH_INVALID_VAL)
            return lemma_array[entry];
    }

    /* no entry at all: the form is used as lemma */
    if (verbose)
        fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
    return form;
}
#include "context.h"
/** Lemmatizer wrapper exported to python/java via swig.
    Holds the macaon context, the form/pos/lemma lexicon and the last result. */
class MacaonTransLemmatizer {
public:
/**
creates instance; according to the original note this assumes that the
environment variable MACAON_DIR is defined (not checked in this class)
@param lg language to be used (in the sense of sub-dir in MACAON_DIR)
@param mcd the filename of the mcd definitions
*/
MacaonTransLemmatizer(char *lg, char *mcd);
~MacaonTransLemmatizer();
/** call lemmatizer
@param mcfString a string containing the sentence to be analysed in mcf format
(form and pos are used for the lookup; a lemma already present in the input is kept)
@return the lemmatizer output in mcf format; the string is owned by this object
and is freed by the next call and by the destructor
*/
const char *lemmatizemcf(const char *mcfString);
/// initialisation status flag
int initOK;
private:
/// keeps lemmatizer context
context *ctx;
/// keeps last result (or NULL)
char *resultstring;
/// maps a "form/pos" key to an index of lemma_array
hash *form_pos_ht = NULL;
/// lemmas read from the fplm file
char **lemma_array = NULL;
/// allocated size of lemma_array (not the number of lemmas stored in it)
int lemma_array_size;
/// sets the default fplm filename in ctx if none is given
void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx);
/// reads the fplm file, fills form_pos_ht and returns the lemma array
char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size);
/// writes word w to stream, with the given lemma in the lemma column
void print_word(word *w, mcd *mcd_struct, char *lemma, FILE *stream);
/// returns the lemma for form/pos, or form itself if there is no entry
char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose);
};
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "context.h"
#include "simple_decoder_parser_arc_eager.h"
#include "movement_parser_arc_eager.h"
#include "feat_fct.h"
#include "config2feat_vec.h"
#include "feature_table.h"
#include "dico.h"
#ifdef __cplusplus
}
#endif
#include "maca_trans_parser_export.h"
/** initialises class variables
    resultstring: which keeps last result
    ctx: current context, created as if the program had been started with
         "-L <lg> -C <mcd>"; the features model, the vocabularies and the
         dependency labels are loaded into it
    initOK: set to 0 if the dependency labels cannot be found
    @param lg language to be used
    @param mcd the filename of the mcd definitions
*/
MacaonTransParser::MacaonTransParser(char *lg, char *mcd) {
    resultstring = NULL;
    initOK = 1;

    char *argv[] = { (char *)"initParser",
                     (char *)"-L", lg,
                     (char *)"-C", mcd,
                     0
    };
    ctx = context_read_options(5, argv);

    set_linguistic_resources_filenames_parser(ctx);
    ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);

    ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
    if (ctx->dico_labels == NULL) {
        fprintf(stderr, "cannot find label names\n");
        initOK = 0;
    }
    else {
        /* only computed when the labels exist: dereferencing dico_labels
           after the failed lookup would crash instead of reporting it */
        ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3;
    }

    /* load models */
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
}
/** Releases the last result string and the parser context. */
MacaonTransParser::~MacaonTransParser() {
    /* free(NULL) is a no-op, no test needed */
    free(resultstring);
    resultstring = NULL;

    context_free(ctx);
}
const char *MacaonTransParser::parsemcf(const char *mcf) {
simple_decoder_parser_arc_eager_str(ctx, mcf);
//printf("rrr %s\n", resultstring);
//return "abcdef";
return resultstring;
}
/** Returns a newly allocated string "<dir><name>" (NULL if out of memory).
    A NULL dir is treated as an empty prefix. */
static char *maca_export_parser_data_path(const char *dir, const char *name) {
    if (dir == NULL) dir = "";
    const size_t dir_len = strlen(dir);
    const size_t name_len = strlen(name);

    char *path = (char *)malloc(dir_len + name_len + 1);
    if (path != NULL) {
        memcpy(path, dir, dir_len);
        /* name_len + 1: copies the terminating '\0' as well */
        memcpy(path + dir_len, name, name_len + 1);
    }
    return path;
}

/** adapted from maca_trans_parser.c, since this function is not in the
    libtransparse.a library.
    Every filename (perceptron model, vocabularies, features model) which has
    not been set in ctx gets its default name, prefixed with
    ctx->maca_data_path. The names are built in buffers of exactly the needed
    size, so a long data path cannot overflow a fixed-size buffer. */
void MacaonTransParser::set_linguistic_resources_filenames_parser(context *ctx) {
    if (!ctx->perc_model_filename) {
        ctx->perc_model_filename = maca_export_parser_data_path(ctx->maca_data_path, DEFAULT_MODEL_FILENAME);
    }

    if (!ctx->vocabs_filename) {
        ctx->vocabs_filename = maca_export_parser_data_path(ctx->maca_data_path, DEFAULT_VOCABS_FILENAME);
    }

    /* the mcd filename is given to the constructor, no default is set:
    if(!ctx->mcd_filename){
      ctx->mcd_filename = maca_export_parser_data_path(ctx->maca_data_path, DEFAULT_MULTI_COL_DESC_FILENAME);
    }*/

    if (!ctx->features_model_filename) {
        ctx->features_model_filename = maca_export_parser_data_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_FILENAME);
    }

    if (ctx->verbose) {
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
    }
}
/** taken from simple_decode_parser_arc_eager.c and modified in order to
    take an input string (in mcf format) which is read through a FILE * via fmemopen()
    instead of reading a file or stdin.
    It writes the result to a FILE * opened with open_memstream() in order to get
    the result in a char * (resultstring).
    The previous result is dropped at the beginning: if the input is NULL or if
    a memory stream cannot be created, resultstring is NULL afterwards.
    Debug and trace output still goes to stdout.
*/
void MacaonTransParser::simple_decoder_parser_arc_eager_str(context *ctx, const char *mcfString) {
    /* forget the previous result first, a failing call must not keep it */
    if (resultstring != NULL) {
        free(resultstring);
        resultstring = NULL;
    }
    if (mcfString == NULL) return;

    FILE *f = fmemopen((void *)mcfString, strlen(mcfString), "r");
    if (f == NULL) { /* e.g. some C libraries refuse a buffer of size 0 */
        fprintf(stderr, "cannot read the input string\n");
        return;
    }

    feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
    feat_vec *fv = feat_vec_new(feature_types_nb);
    int mvt_code;
    int mvt_type;
    int mvt_label;
    float max;
    int result;
    int argmax1, argmax2;
    float max1, max2;
    int index;

    int root_label = dico_string2int(ctx->dico_labels, ctx->root_label);
    if (root_label == -1) root_label = 0;

    config *c = config_new(f, ctx->mcd_struct, 5);

    while (!config_is_terminal(c)) {
        if (ctx->debug_mode) {
            fprintf(stdout, "***********************************\n");
            config_print(stdout, c);
        }

        /* forced EOS (the element on the top of the stack is eos, but the preceding movement is not MVT_PARSER_EOS */
        /* which means that the top of the stack got its eos status from input */
        /* force the parser to finish parsing the sentence (perform all pending reduce actions) and determine root of the sentence */
        if ((word_get_sent_seg(stack_top(config_get_stack(c))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(c))) != MVT_PARSER_EOS)) {
            word_set_sent_seg(stack_top(config_get_stack(c)), -1);
            movement_parser_eos(c);
            while (movement_parser_reduce(c));
            while (movement_parser_root(c, root_label));
            if (ctx->debug_mode) printf("force EOS\n");
        }
        /* normal behaviour, ask classifier what is the next movement to do and do it */
        else {
            config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
            mvt_code = feature_table_argmax(fv, ft, &max);

            if (ctx->debug_mode) {
                /* the three first entries of the score array */
                vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
                for (int i = 0; i < 3; i++) {
                    printf("%d\t", i);
                    movement_parser_print(stdout, vcode_array[i].class_code, ctx->dico_labels);
                    printf("\t%.4f\n", vcode_array[i].score);
                }
                free(vcode_array);
            }

            if (ctx->trace_mode) {
                index = word_get_index(word_buffer_b0(config_get_buffer(c)));
                fprintf(stdout, "%d\t", index);
                stack_print(stdout, c->st);
                fprintf(stdout, "\t");
                movement_parser_print(stdout, mvt_code, ctx->dico_labels);
                fprintf(stdout, "\t");
                feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2);
                printf("%f\n", max1 - max2);
            }

            mvt_type = movement_parser_type(mvt_code);
            mvt_label = movement_parser_label(mvt_code);

            /* result stays 0 if the movement type is none of the cases below */
            result = 0;
            switch (mvt_type) {
            case MVT_PARSER_LEFT:
                result = movement_parser_left_arc(c, mvt_label);
                break;
            case MVT_PARSER_RIGHT:
                result = movement_parser_right_arc(c, mvt_label);
                break;
            case MVT_PARSER_REDUCE:
                result = movement_parser_reduce(c);
                break;
            case MVT_PARSER_ROOT:
                result = movement_parser_root(c, root_label);
                break;
            case MVT_PARSER_EOS:
                result = movement_parser_eos(c);
                break;
            case MVT_PARSER_SHIFT:
                result = movement_parser_shift(c);
            }

            if (result == 0) {
                if (ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n");
                result = movement_parser_shift(c);
                if (result == 0) { /* SHIFT failed no more words to read, let's get out of here ! */
                    if (ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n");
                    while (!stack_is_empty(config_get_stack(c)))
                        movement_parser_root(c, root_label);
                }
            }
        }
    }

    /* collect the output in resultstring; closing the stream finalises it */
    size_t size;
    FILE *outstream = open_memstream(&resultstring, &size);
    if (outstream != NULL) {
        print_word_buffer_fp(c, ctx->dico_labels, ctx->mcd_struct, outstream);
        fclose(outstream);
    }
    else {
        resultstring = NULL;
    }

    config_free(c);
    feat_vec_free(fv);
    feature_table_free(ft);
    fclose(f);
}
/** taken from simple_decode_parser_arc_eager.c and modified in order to write to any FILE* not only stdout.
    Writes every word of the buffer of c on one line.
    If the mcd has neither a governor, a label nor a sentence segmentation
    column, the three values are appended to the unchanged input line.
    Otherwise the tab separated columns of the input line are written one by
    one; the governor, label and sentence segmentation columns are replaced
    by the parser's values, and values whose column is missing in the input
    line (or not described in the mcd) are appended. */
void MacaonTransParser::print_word_buffer_fp(config *c, dico *dico_labels, mcd *mcd_struct, FILE *out) {
    const int gov_col = mcd_get_gov_col(mcd_struct);
    const int label_col = mcd_get_label_col(mcd_struct);
    const int seg_col = mcd_get_sent_seg_col(mcd_struct);
    const bool no_parse_column = (gov_col == -1) && (label_col == -1) && (seg_col == -1);

    for (int n = 0; n < config_get_buffer(c)->nbelem; n++) {
        word *w = word_buffer_get_word_n(config_get_buffer(c), n);

        /* the three values the parser contributes for this word */
        const int gov = word_get_gov(w);
        const char *label_text = "_"; /* used if there is no label */
        if (word_get_label(w) != -1) {
            char *label = dico_int2string(dico_labels, word_get_label(w));
            if (label != NULL) label_text = label;
        }
        const char *seg_text = (word_get_sent_seg(w) == 1) ? "1" : "0";

        if (no_parse_column) {
            fprintf(out, "%s\t%d\t%s\t%s\n", word_get_input(w), gov, label_text, seg_text);
            continue;
        }

        /* strtok() modifies its argument, so the columns are counted on a copy */
        char *line_copy = strdup(w->input);
        int column = 0;
        for (char *field = strtok(line_copy, "\t"); field != NULL; field = strtok(NULL, "\t")) {
            if (column != 0) fprintf(out, "\t");

            if (column == gov_col)
                fprintf(out, "%d", gov);
            else if (column == label_col)
                fprintf(out, "%s", label_text);
            else if (column == seg_col)
                fprintf(out, "%s", seg_text);
            else
                word_print_col_n(out, w, column);

            column++;
        }

        /* values whose column was not part of the input line */
        if ((column <= gov_col) || (gov_col == -1))
            fprintf(out, "\t%d", gov);
        if ((column <= label_col) || (label_col == -1))
            fprintf(out, "\t%s", label_text);
        if ((column <= seg_col) || (seg_col == -1))
            fprintf(out, "\t%s", seg_text);

        fprintf(out, "\n");
        free(line_copy);
    }
}
#include "context.h"
/** Dependency parser wrapper exported to python/java via swig.
    Holds the macaon context and the last result. */
class MacaonTransParser {
public:
/**
creates instance of MacaonParser; according to the original note this assumes
that the environment variable MACAON_DIR is defined (not checked in this class)
@param lg language to be used (in the sense of sub-dir in MACAON_DIR)
@param mcd the filename of the mcd definitions
*/
MacaonTransParser(char *lg, char *mcd);
~MacaonTransParser();
/** call dependency parser
@param mcfString a string containing the sentence to be analysed in mcf format
(at least the columns form, pos, lemma must be present)
@return the parser output in mcf format; the string is owned by this object
and is freed by the next call and by the destructor
*/
const char *parsemcf(const char *mcfString);
/// 1 after a successful initialisation, 0 if the dependency labels were not found
int initOK;
private:
/// keeps parser context
context *ctx;
/// keeps last result (or NULL)
char *resultstring;
/// sets the default model, vocabulary and features model filenames in ctx
void set_linguistic_resources_filenames_parser(context *ctx);
/// writes the words of the buffer of c, with the parser's values, to out
void print_word_buffer_fp(config *c, dico *dico_labels, mcd *mcd_struct, FILE *out);
/// parses mcfString and stores the output in resultstring
void simple_decoder_parser_arc_eager_str(context *ctx, const char *mcfString);
};
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "context.h"
#include "feat_fct.h"
#include "config2feat_vec.h"
#include "feature_table.h"
#include "dico.h"
#include "movement_tagger.h"
#ifdef __cplusplus
}
#endif
#include "maca_trans_tagger_export.h"
/** Creates a tagger.
    The context is created as if the program had been started with
    "-L <lg> -C <mcd>"; the features model and the vocabularies are loaded
    into it. initOK is set to 0 if the vocabularies contain no "POS"
    dictionary, which the decoder needs to write the tags.
    @param lg language to be used
    @param mcd the filename of the mcd definitions
*/
MacaonTransTagger::MacaonTransTagger(char *lg, char *mcd) {
    /* initOK is public and must never be read uninitialised */
    initOK = 1;
    resultstring = NULL;

    char *argv[] = { (char *)"initParser",
                     (char *)"-L", lg,
                     (char *)"-C", mcd,
                     0
    };
    ctx = context_read_options(5, argv);

    decode_tagger_set_linguistic_resources_filenames(ctx);
    ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);

    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");

    if (dico_vec_get_dico(ctx->vocabs, (char *)"POS") == NULL) {
        fprintf(stderr, "cannot find POS names\n");
        initOK = 0;
    }
}
/** Releases the tagger context and the last result string. */
MacaonTransTagger::~MacaonTransTagger() {
    context_free(ctx);

    /* free(NULL) is a no-op, no test needed */
    free(resultstring);
    resultstring = NULL;
}
/** Tags a text given in mcf format.
    Only the simple decoder (beam width 1) is available in this wrapper.
    @param mcfString the text to be tagged
    @return the tagger output; the string is owned by this object and stays
            valid until the next call or until the object is destroyed.
            NULL if mcfString is NULL or if the beam width is not 1.
*/
const char *MacaonTransTagger::tagmcf(const char *mcfString) {
    /* forget the previous result first, a failing call must not return it */
    if (resultstring != NULL) {
        free(resultstring);
        resultstring = NULL;
    }
    if (mcfString == NULL) return NULL;

    if (ctx->beam_width == 1) {
        simple_decoder_tagger(ctx, mcfString);
    }
    else {
        /* say why there is no result instead of silently returning NULL */
        fprintf(stderr, "beam width %d is not supported by the tagger wrapper, only 1 is\n", ctx->beam_width);
    }
    return resultstring;
}
/** Returns a newly allocated string "<dir><name>" (NULL if out of memory).
    A NULL dir is treated as an empty prefix. */
static char *maca_export_tagger_data_path(const char *dir, const char *name) {
    if (dir == NULL) dir = "";
    const size_t dir_len = strlen(dir);
    const size_t name_len = strlen(name);

    char *path = (char *)malloc(dir_len + name_len + 1);
    if (path != NULL) {
        memcpy(path, dir, dir_len);
        /* name_len + 1: copies the terminating '\0' as well */
        memcpy(path + dir_len, name, name_len + 1);
    }
    return path;
}

// adapted from the tagger's command line program
/** Every filename (perceptron model, vocabularies, features model, form2pos
    lexicon) which has not been set in ctx gets its default name, prefixed
    with ctx->maca_data_path. The names are built in buffers of exactly the
    needed size, so a long data path cannot overflow a fixed-size buffer, and
    a missing data path is treated as an empty prefix.
    As before, the form2pos lexicon is read here only when its default
    filename had to be used. */
void MacaonTransTagger::decode_tagger_set_linguistic_resources_filenames(context *ctx) {
    if (!ctx->perc_model_filename) {
        ctx->perc_model_filename = maca_export_tagger_data_path(ctx->maca_data_path, DEFAULT_MODEL_TAGGER_FILENAME);
    }

    if (!ctx->vocabs_filename) {
        ctx->vocabs_filename = maca_export_tagger_data_path(ctx->maca_data_path, DEFAULT_VOCABS_TAGGER_FILENAME);
    }

    /* the mcd filename is given to the constructor, no default is set:
    if(!ctx->mcd_filename){
      ctx->mcd_filename = maca_export_tagger_data_path(ctx->maca_data_path, DEFAULT_MULTI_COL_DESC_TAGGER_FILENAME);
    }*/

    if (!ctx->features_model_filename) {
        ctx->features_model_filename = maca_export_tagger_data_path(ctx->maca_data_path, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);
    }

    if (!ctx->f2p_filename) {
        ctx->f2p_filename = maca_export_tagger_data_path(ctx->maca_data_path, DEFAULT_F2P_FILENAME);
        if (ctx->f2p_filename != NULL) {
            ctx->f2p = form2pos_read(ctx->f2p_filename);
        }
    }

    if (ctx->verbose) {
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
        fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
    }
}
// taken from simple_decoder_tagger.c and modified to read from string
/** Tags the text in mcfString and stores the output in resultstring.
    The input is read through a FILE * created with fmemopen(), the output is
    collected with open_memstream(). For every word the part of speech of the
    input is kept; if there is none (-1) it is predicted by the classifier.
    The previous result is dropped at the beginning: if the input is NULL or
    if a memory stream cannot be created, resultstring is NULL afterwards. */
void MacaonTransTagger::simple_decoder_tagger(context *ctx, const char *mcfString) {
    /* forget the previous result first, a failing call must not keep it */
    if (resultstring != NULL) {
        free(resultstring);
        resultstring = NULL;
    }
    if (mcfString == NULL) return;

    FILE *f = fmemopen((void *)mcfString, strlen(mcfString), "r");
    if (f == NULL) { /* e.g. some C libraries refuse a buffer of size 0 */
        fprintf(stderr, "cannot read the input string\n");
        return;
    }

    size_t size;
    FILE *outstream = open_memstream(&resultstring, &size);
    if (outstream == NULL) {
        fprintf(stderr, "cannot create the output string\n");
        resultstring = NULL;
        fclose(f);
        return;
    }

    feat_vec *fv = feat_vec_new(feature_types_nb);
    feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
    dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
    config *c = config_new(f, ctx->mcd_struct, 5);
    float max;

    while (!config_is_terminal(c)) {
        if (ctx->f2p)
            add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);

        word *b0 = word_buffer_b0(c->bf);
        int postag = word_get_pos(b0);

        if (ctx->debug_mode) {
            fprintf(stderr, "***********************************\n");
            config_print(stderr, c);
        }

        /* if postag is not specified in input it is predicted */
        if (postag == -1) {
            config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
            postag = feature_table_argmax(fv, ft, &max);

            if (ctx->debug_mode) {
                /* the three first entries of the score array */
                vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
                for (int i = 0; i < 3; i++) {
                    fprintf(stderr, "%d\t", i);
                    fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
                }
                free(vcode_array);
            }
        }

        print_word(b0, ctx->mcd_struct, dico_pos, postag, outstream);
        movement_tagger(c, postag);
    }

    /* closing the memory stream finalises resultstring */
    fclose(outstream);
    feat_vec_free(fv);
    feature_table_free(ft);
    config_free(c);
    fclose(f);
}
/** Writes one word to outstream with its part of speech column set to postag.
    Without a pos column in the mcd, the tag is appended to the unchanged
    input line. Otherwise the tab separated columns of the input line are
    written one by one, the pos column being replaced (or appended if the
    input line has fewer columns).
    @param w the word to be written
    @param mcd_struct the column description
    @param dico_pos dictionary used to get the name of the tag
    @param postag code of the tag
    @param outstream where the line is written; nothing is written elsewhere
*/
void MacaonTransTagger::print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag, FILE *outstream) {
    char *buffer = NULL;
    char *token = NULL;
    int col_nb = 0;

    if (mcd_get_pos_col(mcd_struct) == -1) {
        fprintf(outstream, "%s\t%s\n", w->input, dico_int2string(dico_pos, postag));
    }
    else {
        /* strtok() modifies its argument, so the columns are counted on a copy */
        buffer = strdup(w->input);
        token = strtok(buffer, "\t");
        col_nb = 0;
        while (token) {
            /* the separator belongs to the result: it has to go to outstream
               (it used to be written to stdout, which glued the columns of
               the result together) */
            if (col_nb != 0) fprintf(outstream, "\t");

            if (col_nb == mcd_get_pos_col(mcd_struct))
                fprintf(outstream, "%s", dico_int2string(dico_pos, postag));
            else
                word_print_col_n(outstream, w, col_nb);

            col_nb++;
            token = strtok(NULL, "\t");
        }

        /* the input line ended before the pos column */
        if (col_nb <= mcd_get_pos_col(mcd_struct))
            fprintf(outstream, "\t%s", dico_int2string(dico_pos, postag));

        fprintf(outstream, "\n");
        free(buffer);
    }
}
/** Sets the signature of the words of the buffer which have none yet.
    The buffer is visited from its last word backwards and the visit stops at
    the first word which already has a signature (different from -1). The
    signature is looked up in f2p with the form as given and, if that fails,
    with the form changed to lower case. */
void MacaonTransTagger::add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p) {
    for (int i = word_buffer_get_nbelem(bf) - 1; i >= 0; i--) {
        word *w = word_buffer_get_word_n(bf, i);
        if (word_get_signature(w) != -1) break;

        w->signature = form2pos_get_signature(f2p, w->form);
        if (w->signature == -1) {
            /* the lower case copy is allocated with the size of the form: a
               fixed-size buffer would be overrun by a long token of the input */
            char *lower_form = strdup(w->form);
            if (lower_form == NULL) continue; /* out of memory: the signature stays -1 */
            to_lower_string(lower_form);
            w->signature = form2pos_get_signature(f2p, lower_form);
            free(lower_form);
        }
    }
}
#include "context.h"
/** Part of speech tagger wrapper exported to python/java via swig.
    Holds the macaon context and the last result. */
class MacaonTransTagger {
public:
/**
creates instance; according to the original note this assumes that the
environment variable MACAON_DIR is defined (not checked in this class)
@param lg language to be used (in the sense of sub-dir in MACAON_DIR)
@param mcd the filename of the mcd definitions
*/
MacaonTransTagger(char *lg, char *mcd);
~MacaonTransTagger();
/** call tagger
@param mcfString a string containing the sentence to be analysed in mcf format
(a part of speech already present in the input is kept, otherwise it is predicted)
@return the tagger output in mcf format, or NULL if the beam width of the context
is not 1; the string is owned by this object and is freed by the next call
and by the destructor
*/
const char *tagmcf(const char *mcfString);
/// initialisation status flag
int initOK;
private:
/// keeps tagger context
context *ctx;
/// keeps last result (or NULL)
char *resultstring;
/// sets the default model, vocabulary, features model and form2pos filenames in ctx
void decode_tagger_set_linguistic_resources_filenames(context *ctx);
/// tags mcf and stores the output in resultstring
void simple_decoder_tagger(context *ctx, const char *mcf);
/// writes word w to stream, with the given tag in the pos column
void print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag, FILE *stream);
/// sets the signature of the words of bf which have none yet
void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p);
};
//extern void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p);
......@@ -136,5 +136,5 @@ void simple_decoder_tagger(context *ctx)
feat_vec_free(fv);
feature_table_free(ft);
config_free(c);
fclose(f);
if (ctx->input_filename) fclose(f);
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment