Skip to content
Snippets Groups Projects
Commit 0dbcd01b authored by Alexis Nasr's avatar Alexis Nasr
Browse files

solved conflict in CMakelists.txt

parents 44a96605 79dc9d86
Branches
No related tags found
No related merge requests found
Showing with 1175 additions and 4 deletions
......@@ -2,7 +2,17 @@ cmake_minimum_required(VERSION 2.8.7)
project(macaon2)
add_definitions("-Wall")
find_package(FLEX)
# for gcc < 5.3
#add_definitions("-Wall -std=gnu11" )
add_definitions("-Wall" )
# activate with cmake -DMACA_EXPORT=TRUE
# to use macaon in python/java (with swig)
if(MACA_EXPORT)
# swig needs that c/c++ code is compiled with -fPIC
set (CMAKE_POSITION_INDEPENDENT_CODE TRUE)
endif()
include_directories(maca_common/include)
include_directories(perceptron/lib/include)
......@@ -16,4 +26,8 @@ add_subdirectory(maca_trans_parser)
add_subdirectory(maca_crf_tagger)
add_subdirectory(maca_graph_parser)
if(MACA_EXPORT)
add_subdirectory(maca_export)
endif()
#set(CMAKE_INSTALL_PREFIX ../)
......@@ -14,7 +14,10 @@ The basic procedure to build and install macaon from sources is the following.
cmake -DCMAKE_BUILD_TYPE=Debug ..
If you want to install macaon locally, you can specify the install path with :
cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir
cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir ..
If you want to build the SWIG bindings to use macaon from Python or Java, run:
cmake -DMACA_EXPORT=TRUE ..
- Build the sources with:
make
......
# Optional SWIG bindings for macaon: a Python extension module (_Macaon) and a
# Java/JNI library (MacaonJava) packaged together with a jar of the generated
# Java classes. Enabled with: cmake -DMACA_EXPORT=TRUE ..
if(MACA_EXPORT)
  find_package(SWIG 3.0)
  if(SWIG_FOUND)
    # Inputs shared by both bindings. They are defined before the per-language
    # checks so that the Java binding still has its dependency on the interface
    # file and its include directories when the Python libraries are missing.
    set(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/maca_export.i)
    set(MACA_EXPORT_WRAPPER_SOURCES
      src/maca_trans_tagger_export.cc
      src/maca_lemmatizer_export.cc
      src/maca_trans_parser_export.cc
    )
    include_directories(../maca_trans_parser/src)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

    # ---------------------------------------------------------------- Python
    find_package(PythonLibs)
    if(PYTHONLIBS_FOUND)
      set(PYTHON_MODULE_NAME Macaon)
      include_directories(${PYTHON_INCLUDE_PATH})

      # Generate the Python wrapper. Absolute paths are used so the rule does
      # not depend on where the build directory lives relative to the sources.
      add_custom_command(
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/maca_export_py.cc
        COMMAND ${SWIG_EXECUTABLE} -python -c++
                -outdir ${CMAKE_CURRENT_BINARY_DIR}
                -o ${CMAKE_CURRENT_BINARY_DIR}/maca_export_py.cc
                ${SOURCES}
        DEPENDS ${SOURCES}
        COMMENT "Generating SWIG Python wrapper"
        VERBATIM
      )

      add_library(${PYTHON_MODULE_NAME} SHARED
        ${CMAKE_CURRENT_BINARY_DIR}/maca_export_py.cc
        ${MACA_EXPORT_WRAPPER_SOURCES}
      )
      # "_" replaces the default "lib" prefix so the library is named _Macaon.
      set_target_properties(${PYTHON_MODULE_NAME} PROPERTIES PREFIX _)
      target_link_libraries(${PYTHON_MODULE_NAME} transparse maca_common perceptron)
    else()
      message("pythonlibs not installed on your system")
    endif()

    # ------------------------------------------------------------------ Java
    find_package(Java 1.7)
    find_package(JNI)
    if(JNI_FOUND AND Java_FOUND)
      set(JAVA_MODULE_NAME Macaon)
      set(JAVA_LIBRARY MacaonJava)
      set(JAVA_CLASS_TAGGER MacaonTransTagger)
      set(JAVA_CLASS_LEMMATIZER MacaonTransLemmatizer)
      set(JAVA_CLASS_TRANSPARSER MacaonTransParser)
      set(JAVA_PACKAGE lif)
      set(JAR_FILENAME macaon)
      # Extra hand-written Java sources can be listed in ADDITIONNAL_JAVA_FILES;
      # it is empty by default.

      include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})

      # Generate the JNI wrapper and the Java proxy classes (in the binary dir).
      add_custom_command(
        OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/maca_export_java.cc
        COMMAND ${SWIG_EXECUTABLE} -java -package ${JAVA_PACKAGE} -c++
                -outdir ${CMAKE_CURRENT_BINARY_DIR}
                -o ${CMAKE_CURRENT_BINARY_DIR}/maca_export_java.cc
                ${SOURCES}
        DEPENDS ${SOURCES}
        COMMENT "Generating SWIG Java wrapper"
        VERBATIM
      )

      add_library(${JAVA_LIBRARY} SHARED
        ${CMAKE_CURRENT_BINARY_DIR}/maca_export_java.cc
        ${MACA_EXPORT_WRAPPER_SOURCES}
      )
      target_link_libraries(${JAVA_LIBRARY} transparse maca_common perceptron)

      # Package name -> directory name (a.b.c -> a/b/c).
      string(REGEX REPLACE "[.]" "/" JAVA_PACKAGE_DIR ${JAVA_PACKAGE})

      # Compile the generated Java classes once the native library is built.
      # The TARGET form of add_custom_command does not accept DEPENDS, so the
      # commands simply run after every (re)build of the library.
      add_custom_command(
        TARGET ${JAVA_LIBRARY} POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${JAVA_PACKAGE_DIR}
        COMMAND ${Java_JAVAC_EXECUTABLE} -encoding utf8 -d ${CMAKE_CURRENT_BINARY_DIR}
                ${JAVA_MODULE_NAME}JNI.java
                ${JAVA_CLASS_TAGGER}.java
                ${JAVA_CLASS_LEMMATIZER}.java
                ${JAVA_CLASS_TRANSPARSER}.java
                ${JAVA_MODULE_NAME}.java
                ${ADDITIONNAL_JAVA_FILES}
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
        VERBATIM
      )

      # Bundle the compiled classes of the package into the jar.
      add_custom_command(
        TARGET ${JAVA_LIBRARY} POST_BUILD
        COMMAND ${Java_JAR_EXECUTABLE} -cvf ${JAR_FILENAME}.jar -C ${CMAKE_CURRENT_BINARY_DIR} ${JAVA_PACKAGE_DIR}
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
        COMMENT "Building ${JAR_FILENAME}"
        VERBATIM
      )
    else()
      message("Java JNI support not installed")
    endif()
  else()
    message("swig >= 3.0 not installed on your system")
  endif()
endif()
import lif.*;
/** example to use the macaon parser with java
compile (in maca_data2)
javac -cp ../macaon2/build_debug/maca_export/macaon.jar ../macaon2/maca_export/example/example.java
run
java -cp ../macaon2/build_debug/maca_export/macaon.jar:../macaon2/maca_export/example -Djava.library.path=../macaon2/build_debug/maca_export/ example
*/
public class example {
    /**
     * Tags, lemmatizes and parses one French sentence and prints the result of
     * each step. Must be run from a directory containing the "jh-seq" model
     * directory (see the compile/run instructions above).
     */
    public static void main(String []args) {
        // Loads the native JNI library (libMacaonJava.so on Linux).
        System.loadLibrary("MacaonJava");

        // One token per line (mcf format with only the form column).
        // The input may also carry POS and lemma columns, e.g. "La D le",
        // in which case the corresponding steps keep the given values.
        String[] forms = {
            "La", "pose", "d'", "un", "panneau", "stop", "paraît", "être",
            "la", "formule", "la", "mieux", "adaptée", "pour", "assurer",
            "la", "sécurité", "des", "usagers", "."
        };
        StringBuilder mcf1 = new StringBuilder();
        for (String form : forms) {
            mcf1.append(form).append("\n");
        }

        String language = "jh-seq";
        String mcdFile = "jh-seq/eval/wplgfs.mcd";
        MacaonTransTagger mt = new MacaonTransTagger(language, mcdFile);
        MacaonTransLemmatizer ml = new MacaonTransLemmatizer(language, mcdFile);
        MacaonTransParser mp = new MacaonTransParser(language, mcdFile);

        // Each step consumes the output of the previous one.
        String tags = mt.tagmcf(mcf1.toString());
        System.out.println(tags);

        String lemmas = ml.lemmatizemcf(tags);
        System.out.println(lemmas);

        String parsed = mp.parsemcf(lemmas);
        System.out.println(parsed);
    }
}
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Example: tag, lemmatize and parse a sentence with the macaon SWIG module.

Run it from maca_data2 (the directory that contains the "jh-seq" models).
The single-argument print() calls work with both Python 2 and Python 3, so the
script runs with whichever interpreter the Macaon module was built for.
"""
import os
currentdir = os.path.dirname(os.path.abspath(__file__))
import sys
# make the freshly built _Macaon library / Macaon.py importable
sys.path.append(currentdir + "/../../build_debug/maca_export")
import Macaon
# for this example you should be in maca_data2
mt = Macaon.MacaonTransTagger("jh-seq", "jh-seq/eval/wplgfs.mcd")
ml = Macaon.MacaonTransLemmatizer("jh-seq", "jh-seq/eval/wplgfs.mcd")
mp = Macaon.MacaonTransParser("jh-seq", "jh-seq/eval/wplgfs.mcd")
# forms only: input for the tagger
mcf="""La
grosse
souris
verte
a
mangé
le
bon
fromage
hier
soir"""
# forms + POS: input for the lemmatizer
mcf0 ="""La D
pose N
d' P
un D
panneau N
stop N
paraît V"""
# forms + POS + lemma: input for the parser
mcf1 = """La D le
pose N pose
d' P de
un D un
panneau N panneau
stop N stop
paraît V paraître
être V être
la D le
formule N formule
la D le
mieux ADV mieux
adaptée A adapté
pour P pour
assurer V assurer
la D le
sécurité N sécurité
des P+D de
usagers N usager
. PONCT ."""
mcf2 = """Une D un
réflexion N réflexion
commune A commun
est V être
menée V mener
avec P avec
les D le
enseignants N enseignant
et C et
les D le
délégués N délégué
de P de
parents N parent
d' P de
élèves N élève
, PONCT ,
sous P sous
la D le
conduite N conduite
du P+D de
CAUE N CAUE
. PONCT ."""
# Alternative entry points, using the pre-annotated inputs above:
#print(mp.parsemcf(mcf1))
#print(mp.parsemcf(mcf2))
#print(ml.lemmatizemcf(mcf0))
# full pipeline: each step consumes the output of the previous one
tags = mt.tagmcf(mcf)
print(tags)
lemmas = ml.lemmatizemcf(tags)
print(lemmas)
print(mp.parsemcf(lemmas))
# release the native objects explicitly
del mp
del ml
del mt
// SWIG interface of the "Macaon" module: exposes the tagger, lemmatizer and
// parser wrapper classes to the target language (Python / Java).
%module Macaon
// Headers copied verbatim into the generated wrapper so that it compiles
// against the real class definitions.
%{
#include <maca_trans_tagger_export.h>
#include <maca_lemmatizer_export.h>
#include <maca_trans_parser_export.h>
%}
// Only the public part of each class is declared here; private members of the
// C++ classes are deliberately not exposed.

// POS tagger: tagmcf() takes a sentence in mcf format and returns the tagged
// sentence as a string.
class MacaonTransTagger {
public:
MacaonTransTagger(char *lg, char *mcd);
~MacaonTransTagger();
const char *tagmcf(const char *mcf);
};
// Lemmatizer: lemmatizemcf() takes a sentence in mcf format and returns it
// with the lemma column filled in.
class MacaonTransLemmatizer {
public:
MacaonTransLemmatizer(char *lg, char *mcd);
~MacaonTransLemmatizer();
const char *lemmatizemcf(const char *mcf);
};
// Dependency parser: parsemcf() takes a sentence in mcf format and returns
// the parsed sentence as a string.
class MacaonTransParser {
public:
MacaonTransParser(char *lg, char *mcd);
~MacaonTransParser();
const char *parsemcf(const char *mcf);
};
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "context.h"
#include "feat_fct.h"
#include "config2feat_vec.h"
#include "feature_table.h"
#include "dico.h"
#ifdef __cplusplus
}
#endif
#include "maca_lemmatizer_export.h"
/** Builds a lemmatizer for language `lg` using the column description `mcd`.
    The two arguments are forwarded to the context as the "-L" and "-C"
    command line options; the form/pos -> lemma table is then loaded from the
    fplm file named by the context.
*/
MacaonTransLemmatizer::MacaonTransLemmatizer(char *lg, char *mcd) {
    /* give every plain member a defined value before anything can read it;
       initOK is public and was previously left uninitialised */
    initOK = 1;
    resultstring = NULL;
    lemma_array_size = 0;

    /* fake command line: argv[0] plus two option/value pairs, NULL terminated */
    char * argv[] = { (char *)"initParser",
                      (char *)"-L", lg,
                      (char *)"-C", mcd,
                      0
    };
    ctx = context_read_options(5, argv);

    form_pos_ht = hash_new(1000000);

    //maca_lemmatizer_check_options(ctx);
    maca_lemmatizer_set_linguistic_resources_filenames(ctx);
    lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode, &lemma_array_size);
}
/** Releases the hash table, every stored lemma, the lemma array itself,
    the context and the last result string. */
MacaonTransLemmatizer::~MacaonTransLemmatizer() {
    hash_free(form_pos_ht);

    /* free(NULL) is a no-op, so empty slots need no special casing */
    int slot = 0;
    while(slot < lemma_array_size){
        free(lemma_array[slot]);
        ++slot;
    }
    free(lemma_array);

    context_free(ctx);

    free(resultstring);
    resultstring = NULL;
}
/** Lemmatizes a sentence given in mcf format.
    The input is read through an in-memory FILE (fmemopen) and the output is
    collected in `resultstring` through open_memstream.
    @param mcfString sentence in mcf format
    @return the lemmatized sentence; the buffer is owned by this object and is
            released by the next call or by the destructor. NULL is returned
            when mcfString is NULL or when a memory stream cannot be opened.
*/
const char *MacaonTransLemmatizer::lemmatizemcf(const char *mcfString) {
    /* the previous result is no longer needed */
    if (resultstring != NULL) {
        free(resultstring);
        resultstring = NULL;
    }

    if (mcfString == NULL) return NULL;

    FILE *f = fmemopen ((void *)mcfString, strlen(mcfString), "r");
    if (f == NULL) return NULL; /* e.g. empty buffer rejected by the C library */

    size_t size;
    FILE *outstream = open_memstream (&resultstring, &size);
    if (outstream == NULL) {
        fclose(f);
        resultstring = NULL;
        return NULL;
    }

    config *c = config_new(f, ctx->mcd_struct, 5);
    word *b0;

    while(!config_is_terminal(c)){
        b0 = word_buffer_b0(c->bf);
        /* copy the columns of the current word into the member buffers */
        word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
        word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
        word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));

        /* if lemma is not specified in input it is looked up */
        if(strlen(lemma) && strcmp(lemma, "_"))
            print_word(b0, ctx->mcd_struct, lemma, outstream);
        else
            print_word(b0, ctx->mcd_struct, lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose), outstream);

        word_buffer_move_right(c->bf);
    }

    fclose(f);
    config_free(c);
    /* closing the memstream finalises resultstring */
    fclose(outstream);
    return resultstring;
}
/** Writes one word to `outstream`, with `lemma` placed in the lemma column.
    Adapted from maca_trans_lemmatizer: the only change is the FILE *outstream
    parameter that receives the output.
*/
void MacaonTransLemmatizer::print_word(word *w, mcd *mcd_struct, char *lemma, FILE *outstream) {
    /* no lemma column declared: original line followed by the lemma */
    if(mcd_get_lemma_col(mcd_struct) == -1){
        fprintf(outstream, "%s\t%s\n", w->input, lemma);
        return;
    }

    /* strtok modifies its argument, so the columns are counted on a copy */
    char *input_copy = strdup(w->input);
    int column = 0;
    for(char *field = strtok(input_copy, "\t"); field; field = strtok(NULL, "\t")){
        if(column != 0)
            fprintf(outstream, "\t");
        if(column == mcd_get_lemma_col(mcd_struct))
            fprintf(outstream, "%s", lemma);
        else
            word_print_col_n(outstream, w, column);
        column++;
    }

    /* the input had fewer columns than the lemma column index: append it */
    if(column <= mcd_get_lemma_col(mcd_struct))
        fprintf(outstream, "\t%s", lemma);
    fprintf(outstream, "\n");

    free(input_copy);
}
/** Fills in ctx->fplm_filename with <maca_data_path><DEFAULT_FPLM_FILENAME>
    when no fplm file name has been set.
    Derived from maca_lemmatizer.c (that function is not part of
    libtransparse.a); unlike the original, the name is built in a buffer of the
    exact size and a NULL maca_data_path is treated as an empty prefix.
*/
void MacaonTransLemmatizer::maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) {
    if(!ctx->fplm_filename){
        const char *data_path = ctx->maca_data_path ? ctx->maca_data_path : "";
        size_t needed = strlen(data_path) + strlen(DEFAULT_FPLM_FILENAME) + 1;
        char *absolute_filename = (char *)malloc(needed);

        if(absolute_filename != NULL)
            snprintf(absolute_filename, needed, "%s%s", data_path, DEFAULT_FPLM_FILENAME);
        /* heap allocated, like the strdup() result it replaces */
        ctx->fplm_filename = absolute_filename;
    }

    if(ctx->verbose){
        fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename ? ctx->fplm_filename : "(null)");
    }
}
/** Loads an fplm file (one "form<TAB>pos<TAB>lemma<TAB>morpho" entry per line).
    For every valid entry the key "form/pos" is added to `form_pos_ht` with the
    entry index as value, and the lemma is stored at that index in the
    returned array.
    Derived from maca_trans_lemmatizer.c (not part of libtransparse.a), with
    bounded field parsing and fully initialised array slots.
    @param fplm_filename     file to read
    @param form_pos_ht       hash table receiving the "form/pos" keys
    @param debug_mode        when non-zero, malformed lines are reported on stderr
    @param lemma_array_size  out: number of allocated slots in the returned array
    @return array of lemmas; unused slots are NULL
*/
char **MacaonTransLemmatizer::read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size) {
    char form[1000];
    char pos[1000];
    char lemma[1000];
    char morpho[1000];
    char key[2000];     /* "form/pos": large enough for two full fields */
    char buffer[10000];
    int num = 0;
    int fields_nb;
    char **lemma_array;

    FILE *f = myfopen(fplm_filename, "r");

    *lemma_array_size = 10000;
    lemma_array = (char **)memalloc((*lemma_array_size) * sizeof(char *));
    /* the destructor tests every slot, so none may be left uninitialised */
    for(int i = 0; i < *lemma_array_size; ++i) {
        lemma_array[i] = NULL;
    }

    if(f == NULL)
        return lemma_array;

    while(fgets(buffer, sizeof(buffer), f)){
        form[0] = pos[0] = lemma[0] = morpho[0] = '\0';
        /* field widths keep each field inside its 1000 byte buffer */
        fields_nb = sscanf(buffer, "%999[^\t]\t%999s\t%999[^\t]\t%999s\n", form, pos, lemma, morpho);
        if(fields_nb != 4){
            if(debug_mode){
                fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
                fprintf(stderr, "incorrect fplm entry, skipping it\n");
            }
            continue;
        }

        snprintf(key, sizeof(key), "%s/%s", form, pos);

        /* grow the array before the entry is registered anywhere */
        if(num >= *lemma_array_size){
            int new_size = 2 * (*lemma_array_size) + 1;
            char **grown = (char **)realloc(lemma_array, new_size * sizeof(char *));
            if(grown == NULL){
                fprintf(stderr, "out of memory while reading %s, remaining entries ignored\n", fplm_filename);
                break;
            }
            lemma_array = grown;
            // initialize in order to be able to free correctly at the end
            for(int i = num; i < new_size; ++i) {
                lemma_array[i] = NULL;
            }
            *lemma_array_size = new_size;
        }

        // TODO: memory leak: if the key is already in the hash, it is not added and the memory
        // allocated by strdup() is leaked
        // solutions: hash_add does the strdup() if necessary (check else where !)
        // or return code to indicate whether the key has been added or not
        hash_add(form_pos_ht, strdup(key), num);

        lemma_array[num] = strdup(lemma);
        num++;
    }

    fclose(f);
    return lemma_array;
}
/** Returns the lemma stored for the couple form/pos.
    The key "form/pos" is tried first with the form as given, then with the
    form converted to lower case. When neither key is in the hash table the
    form itself is returned as lemma.
    Same logic as in maca_trans_lemmatizer.c (not part of libtransparse.a);
    the key is built in the member buffer form_pos.
*/
char *MacaonTransLemmatizer::lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose) {
    for(int attempt = 0; attempt < 2; ++attempt){
        strcpy(form_pos, form);
        if(attempt == 1)
            to_lower_string(form_pos); /* second try: lower-cased form */
        strcat(form_pos, "/");
        strcat(form_pos, pos);

        int found_index = hash_get_val(form_pos_ht, form_pos);
        if(found_index != HASH_INVALID_VAL)
            return lemma_array[found_index];
    }

    /* even in lower case couple form/pos is not found, return the form as lemma */
    if(verbose)
        fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
    return form;
}
#include "context.h"
/** Lemmatizer wrapper exported through SWIG: reads a sentence in mcf format
    from a string and returns the lemmatized sentence as a string. */
class MacaonTransLemmatizer {
public:
/**
creates instance, assumes that the environment variable MACAON_DIR
is defined. One instance for each thread has to be created (this means loading resources for each thread).
@param lg language to be used (in the sense of sub-dir in MACAON_DIR)
@param mcd the filename of the mcd definitions
*/
MacaonTransLemmatizer(char *lg, char *mcd);
~MacaonTransLemmatizer();
/** call lemmatizer
@param mcfString a string containing the sentence to be analysed in mcf format
(at least the columns form and pos must be present)
@return the lemmatizer output. The string is owned by this object: it is
released by the next call to lemmatizemcf() and by the destructor.
*/
const char *lemmatizemcf(const char *mcfString);
/// status flag, presumably 1 when initialisation succeeded
/// NOTE(review): confirm that the constructor assigns it
int initOK;
private:
/// keeps lemmatizer context
context *ctx;
/// keeps last result (or NULL)
char *resultstring;
/// variables used during lemmatization
/// form_pos holds the "form/pos" key built for hash look-ups
char form_pos[1000];
/// columns of the word currently being processed
char lemma[200];
char form[200];
char pos[200];
/// variables to stock data
/// maps "form/pos" keys to an index in lemma_array
hash *form_pos_ht = NULL;
/// lemmas, indexed by the values stored in form_pos_ht
char **lemma_array = NULL;
/// number of allocated slots in lemma_array
int lemma_array_size;
// helper functions copied/adapted from the lemmatizer sources
void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx);
char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size);
void print_word(word *w, mcd *mcd_struct, char *lemma, FILE *stream);
char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose);
};
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "context.h"
#include "simple_decoder_parser_arc_eager.h"
#include "movement_parser_arc_eager.h"
#include "feat_fct.h"
#include "config2feat_vec.h"
#include "feature_table.h"
#include "dico.h"
#ifdef __cplusplus
}
#endif
#include "maca_trans_parser_export.h"
/** initialises class variables
    resultstring: which keeps last result
    ctx: current context
    initOK: 1 on success, 0 when the label dictionary cannot be found

    `lg` and `mcd` are forwarded to the context as the "-L" and "-C" command
    line options; the feature model and the vocabularies are then loaded.
*/
MacaonTransParser::MacaonTransParser(char *lg, char *mcd) {
    resultstring = NULL;
    initOK = 1;

    /* fake command line: argv[0] plus two option/value pairs, NULL terminated */
    char * argv[] = { (char *)"initParser",
                      (char *)"-L", lg,
                      (char *)"-C", mcd,
                      0
    };
    ctx = context_read_options(5, argv);

    set_linguistic_resources_filenames_parser(ctx);
    ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);

    ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
    if(ctx->dico_labels == NULL){
        /* report the failure instead of dereferencing the NULL dictionary */
        fprintf(stderr, "cannot find label names\n");
        initOK = 0;
    }
    else{
        ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3;
    }

    /* load models */
    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
}
/** Releases the last result string and the parser context. */
MacaonTransParser::~MacaonTransParser() {
    /* free(NULL) is a no-op, so no test is needed */
    free(resultstring);
    resultstring = NULL;

    context_free(ctx);
}
const char *MacaonTransParser::parsemcf(const char *mcf) {
simple_decoder_parser_arc_eager_str(ctx, mcf);
//printf("rrr %s\n", resultstring);
//return "abcdef";
return resultstring;
}
/** Returns a newly allocated string holding `dir` immediately followed by
    `filename` (NULL if the allocation fails). */
static char *maca_export_parser_concat(const char *dir, const char *filename) {
    size_t needed = strlen(dir) + strlen(filename) + 1;
    char *joined = (char *)malloc(needed);
    if(joined != NULL)
        snprintf(joined, needed, "%s%s", dir, filename);
    return joined;
}

/** Fills in the model, vocabulary and feature-model file names of the context
    that have not been set, using <maca_data_path><default name>.
    Derived from maca_trans_parser.c, since this function is not in the
    libtransparse.a library; names are built in exactly sized buffers instead
    of fixed 500 byte arrays. */
void MacaonTransParser::set_linguistic_resources_filenames_parser(context *ctx) {
    /* a missing data path is treated as an empty prefix */
    const char *absolute_path = ctx->maca_data_path ? ctx->maca_data_path : "";

    if(!ctx->perc_model_filename)
        ctx->perc_model_filename = maca_export_parser_concat(absolute_path, DEFAULT_MODEL_FILENAME);

    if(!ctx->vocabs_filename)
        ctx->vocabs_filename = maca_export_parser_concat(absolute_path, DEFAULT_VOCABS_FILENAME);

    /* the mcd file name always comes from the constructor argument ("-C"),
       so no default (DEFAULT_MULTI_COL_DESC_FILENAME) is applied here */

    if(!ctx->features_model_filename)
        ctx->features_model_filename = maca_export_parser_concat(absolute_path, DEFAULT_FEATURES_MODEL_FILENAME);

    if(ctx->verbose){
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
    }
}
/** Taken from simple_decode_parser_arc_eager.c and modified in order to
take an input string (in mcf format), which is read through a FILE * via fmemopen()
instead of reading a file or stdin.
It writes the result to a FILE * opened with open_memstream() in order to get the result in a char *
(the member resultstring).
@param ctx parser context (models, dictionaries, debug/trace flags)
@param mcfString sentence to parse, in mcf format
*/
void MacaonTransParser::simple_decoder_parser_arc_eager_str(context *ctx, const char *mcfString) {
// NOTE(review): the return values of fmemopen() and open_memstream() are not checked
FILE *f = fmemopen ((void *)mcfString, strlen(mcfString), "r");
// the model is loaded here on every call and freed again before returning
feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
int root_label;
int mvt_code;
int mvt_type;
int mvt_label;
float max;
feat_vec *fv = feat_vec_new(feature_types_nb);
config *c = NULL;
int result;
/* float entropy; */
/* float delta; */
int argmax1, argmax2;
float max1, max2;
int index;
// code of the root label; falls back to 0 when the label is unknown
root_label = dico_string2int(ctx->dico_labels, ctx->root_label);
if(root_label == -1) root_label = 0;
c = config_new(f, ctx->mcd_struct, 5);
// main loop: one movement per iteration until the configuration is terminal
while(!config_is_terminal(c)){
if(ctx->debug_mode){
fprintf(stdout, "***********************************\n");
config_print(stdout, c);
}
/* forced EOS (the element on the top of the stack is eos, but the preceding movement is not MVT_PARSER_EOS */
/* which means that the top of the stack got its eos status from input */
/* force the parser to finish parsing the sentence (perform all pending reduce actions) and determine root of the sentence */
if((word_get_sent_seg(stack_top(config_get_stack(c))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(c))) != MVT_PARSER_EOS)){
word_set_sent_seg(stack_top(config_get_stack(c)), -1);
movement_parser_eos(c);
// repeat reduce, then root, until each of them returns 0
while(movement_parser_reduce(c));
while(movement_parser_root(c, root_label));
if(ctx->debug_mode) printf("force EOS\n");
}
/* normal behaviour, ask classifier what is the next movement to do and do it */
else{
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
mvt_code = feature_table_argmax(fv, ft, &max);
if(ctx->debug_mode){
// debug: print the first three entries of the vcode array
vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
for(int i=0; i < 3; i++){
printf("%d\t", i);
movement_parser_print(stdout, vcode_array[i].class_code, ctx->dico_labels);
printf("\t%.4f\n", vcode_array[i].score);
}
free(vcode_array);
}
if(ctx->trace_mode){
// trace: word index, stack, chosen movement and margin between the two best scores
index = word_get_index(word_buffer_b0(config_get_buffer(c)));
fprintf(stdout, "%d\t", index);
stack_print(stdout, c->st);
fprintf(stdout, "\t");
movement_parser_print(stdout, mvt_code, ctx->dico_labels);
fprintf(stdout, "\t");
feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2);
printf("%f\n", max1 - max2);
}
// decode the movement code into a type and a label, then apply it
mvt_type = movement_parser_type(mvt_code);
mvt_label = movement_parser_label(mvt_code);
result = 0;
switch(mvt_type){
case MVT_PARSER_LEFT :
result = movement_parser_left_arc(c, mvt_label);
break;
case MVT_PARSER_RIGHT:
result = movement_parser_right_arc(c, mvt_label);
break;
case MVT_PARSER_REDUCE:
result = movement_parser_reduce(c);
break;
case MVT_PARSER_ROOT:
result = movement_parser_root(c, root_label);
break;
case MVT_PARSER_EOS:
result = movement_parser_eos(c);
break;
case MVT_PARSER_SHIFT:
result = movement_parser_shift(c);
}
// result == 0: the movement could not be applied (or its type was unknown)
if(result == 0){
if(ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n");
result = movement_parser_shift(c);
if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! */
if(ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n");
while(!stack_is_empty(config_get_stack(c)))
movement_parser_root(c, root_label);
}
}
}
}
//if(!ctx->trace_mode) {
// drop the previous result, then write the parsed words into a fresh memory stream
size_t size;
if (resultstring != NULL) {
free(resultstring);
resultstring = NULL;
}
FILE *outstream = open_memstream (&resultstring, &size);
print_word_buffer_fp(c, ctx->dico_labels, ctx->mcd_struct, outstream);
// closing the stream finalises resultstring
fclose(outstream);
config_free(c);
feat_vec_free(fv);
feature_table_free(ft);
fclose(f);
}
/** Prints every word of the configuration's word buffer to `out`, one word per
line, with governor, dependency label and sentence-segmentation flag.
Taken from simple_decode_parser_arc_eager.c and modified in order to write to any FILE* not only stdout.
@param c configuration whose word buffer is printed
@param dico_labels dictionary used to turn label codes into strings
@param mcd_struct column description; tells in which columns gov, label and sent_seg go
@param out stream receiving the output
*/
void MacaonTransParser::print_word_buffer_fp(config *c, dico *dico_labels, mcd *mcd_struct, FILE *out) {
int i;
word *w;
char *label;
char *buffer = NULL;
char *token = NULL;
int col_nb = 0;
for(i=0; i < config_get_buffer(c)->nbelem; i++){
w = word_buffer_get_word_n(config_get_buffer(c), i);
// case 1: none of the three columns is declared in the mcd:
// print the input line followed by gov, label and sent_seg
if((mcd_get_gov_col(mcd_struct) == -1)
&& (mcd_get_label_col(mcd_struct) == -1)
&& (mcd_get_sent_seg_col(mcd_struct) == -1)){
fprintf(out, "%s\t", word_get_input(w));
fprintf(out, "%d\t", word_get_gov(w));
// a label code of -1 is printed as "_"
label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
if(label != NULL)
fprintf(out, "%s\t", label) ;
else
fprintf(out, "_\t");
if(word_get_sent_seg(w) == 1)
fprintf(out, "1\n") ;
else
fprintf(out, "0\n");
}
// case 2: at least one column is declared: rewrite the input column by column
else{
// strtok modifies its argument, so the columns are walked on a copy
buffer = strdup(w->input);
token = strtok(buffer, "\t");
col_nb = 0;
while(token){
if(col_nb != 0) fprintf(out, "\t");
if(col_nb == mcd_get_gov_col(mcd_struct)){
fprintf(out, "%d", word_get_gov(w));
}
else
if(col_nb == mcd_get_label_col(mcd_struct)){
label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
if(label != NULL)
fprintf(out, "%s", label) ;
else
fprintf(out, "_");
}
else
if(col_nb == mcd_get_sent_seg_col(mcd_struct)){
if(word_get_sent_seg(w) == 1)
fprintf(out, "1") ;
else
fprintf(out, "0");
}
else{
// any other column is copied from the word unchanged
word_print_col_n(out, w, col_nb);
}
col_nb++;
token = strtok(NULL, "\t");
}
// append each value whose column lies beyond the input columns or is not declared
if((col_nb <= mcd_get_gov_col(mcd_struct)) || (mcd_get_gov_col(mcd_struct) == -1)){
fprintf(out, "\t%d", word_get_gov(w));
}
if((col_nb <= mcd_get_label_col(mcd_struct)) || (mcd_get_label_col(mcd_struct) == -1)){
label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w));
if(label != NULL)
fprintf(out, "\t%s", label) ;
else
fprintf(out, "\t_");
}
if((col_nb <= mcd_get_sent_seg_col(mcd_struct)) || (mcd_get_sent_seg_col(mcd_struct) == -1)){
if(word_get_sent_seg(w) == 1)
fprintf(out, "\t1") ;
else
fprintf(out, "\t0");
}
fprintf(out, "\n");
free(buffer);
}
}
}
#include "context.h"
/** Dependency parser wrapper exported through SWIG: reads a sentence in mcf
    format from a string and returns the parsed sentence as a string. */
class MacaonTransParser {
public:
/**
creates instance of MacaonParser, assumes that the environment variable MACAON_DIR
is defined
@param lg language to be used (in the sense of sub-dir in MACAON_DIR)
@param mcd the filename of the mcd definitions
*/
MacaonTransParser(char *lg, char *mcd);
~MacaonTransParser();
/** call dependency parser
@param mcfString a string containing the sentence to be analysed in mcf format
(at least the columns form, pos, lemma must be present)
@return the parser output. The string is owned by this object: it is
released by the next call to parsemcf() and by the destructor.
*/
const char *parsemcf(const char *mcfString);
/// 1 after a successful initialisation, 0 if the label dictionary was not found
int initOK;
private:
/// keeps parser context
context *ctx;
/// keeps last result (or NULL)
char *resultstring;
// functions copied/adapted from the parser sources (not available in
// libtransparse.a, or modified to read from / write to strings)
void set_linguistic_resources_filenames_parser(context *ctx);
void print_word_buffer_fp(config *c, dico *dico_labels, mcd *mcd_struct, FILE *out);
void simple_decoder_parser_arc_eager_str(context *ctx, const char *mcfString);
};
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C"{
#endif
#include "context.h"
#include "config2feat_vec.h"
#include "movement_tagger.h"
#include "simple_decoder_tagger.h"
#ifdef __cplusplus
}
#endif
#include "maca_trans_tagger_export.h"
/** Builds a tagger for language `lg` using the column description `mcd`.
    The two arguments are forwarded to the context as the "-L" and "-C"
    command line options; the feature model and the vocabularies are then
    loaded.
*/
MacaonTransTagger::MacaonTransTagger(char *lg, char *mcd) {
    /* give every plain member a defined value before anything can read it;
       initOK is public and was previously left uninitialised */
    initOK = 1;
    resultstring = NULL;

    /* fake command line: argv[0] plus two option/value pairs, NULL terminated */
    char * argv[] = { (char *)"initParser",
                      (char *)"-L", lg,
                      (char *)"-C", mcd,
                      0
    };
    ctx = context_read_options(5, argv);

    decode_tagger_set_linguistic_resources_filenames(ctx);
    ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);

    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
}
/** Releases the tagger context and the last result string. */
MacaonTransTagger::~MacaonTransTagger() {
    context_free(ctx);

    /* free(NULL) is a no-op, so no test is needed */
    free(resultstring);
    resultstring = NULL;
}
/** Tags a sentence given in mcf format.
    @param mcfString sentence in mcf format
    @return the tagger output; the buffer is owned by this object and is
            released by the next call or by the destructor. NULL is returned
            when mcfString is NULL or when the context asks for a beam width
            other than 1 (only the simple decoder is available here).
*/
const char *MacaonTransTagger::tagmcf(const char *mcfString) {
    /* the previous result is no longer needed */
    if (resultstring != NULL) {
        free(resultstring);
        resultstring = NULL;
    }

    if(mcfString == NULL)
        return NULL;

    if(ctx->beam_width == 1)
        simple_decoder_tagger(ctx, mcfString);
    else
        fprintf(stderr, "MacaonTransTagger: beam width %d is not supported, nothing tagged\n", ctx->beam_width);

    return resultstring;
}
/** Returns a newly allocated string holding `dir` immediately followed by
    `filename` (NULL if the allocation fails). */
static char *maca_export_tagger_concat(const char *dir, const char *filename) {
    size_t needed = strlen(dir) + strlen(filename) + 1;
    char *joined = (char *)malloc(needed);
    if(joined != NULL)
        snprintf(joined, needed, "%s%s", dir, filename);
    return joined;
}

/** Fills in the tagger model, vocabulary, feature-model and f2p file names of
    the context that have not been set, using <maca_data_path><default name>.
    Derived from maca_trans_tagger.c; names are built in exactly sized buffers
    instead of a fixed 500 byte array, and a NULL maca_data_path is treated as
    an empty prefix. */
void MacaonTransTagger::decode_tagger_set_linguistic_resources_filenames(context *ctx) {
    const char *data_path = ctx->maca_data_path ? ctx->maca_data_path : "";

    if(!ctx->perc_model_filename)
        ctx->perc_model_filename = maca_export_tagger_concat(data_path, DEFAULT_MODEL_TAGGER_FILENAME);

    if(!ctx->vocabs_filename)
        ctx->vocabs_filename = maca_export_tagger_concat(data_path, DEFAULT_VOCABS_TAGGER_FILENAME);

    /* the mcd file name always comes from the constructor argument ("-C"),
       so no default (DEFAULT_MULTI_COL_DESC_TAGGER_FILENAME) is applied here */

    if(!ctx->features_model_filename)
        ctx->features_model_filename = maca_export_tagger_concat(data_path, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME);

    if(!ctx->f2p_filename){
        ctx->f2p_filename = maca_export_tagger_concat(data_path, DEFAULT_F2P_FILENAME);
        /* NOTE(review): the f2p table is only read here when the default name
           is used; confirm that an explicitly given f2p file is loaded elsewhere */
        ctx->f2p = form2pos_read(ctx->f2p_filename);
    }

    if(ctx->verbose){
        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
        fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename);
    }
}
/** Tags every word of the input: a POS given in the input is kept, a missing
one is predicted by the classifier. The result is written to the member resultstring.
Taken from simple_decoder_tagger.c and modified to read from string and write to string.
@param ctx tagger context (models, dictionaries, debug flag)
@param mcfString sentence to tag, in mcf format
*/
void MacaonTransTagger::simple_decoder_tagger(context *ctx, const char *mcfString) {
config *c;
feat_vec *fv = feat_vec_new(feature_types_nb);
//FILE *f = (ctx->input_filename)? myfopen(ctx->input_filename, "r") : stdin;
// NOTE(review): the return values of fmemopen() and open_memstream() are not checked
FILE *f = fmemopen ((void *)mcfString, strlen(mcfString), "r");
// the model is loaded here on every call and freed again before returning
feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose);
int postag;
float max;
word *b0;
dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS");
c = config_new(f, ctx->mcd_struct, 5);
// drop the previous result and collect the new one in a memory stream
size_t size;
if (resultstring != NULL) {
free(resultstring);
resultstring = NULL;
}
FILE *outstream = open_memstream (&resultstring, &size);
// one word per iteration until the configuration is terminal
while(!config_is_terminal(c)){
if(ctx->f2p)
/* add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */
add_signature_to_words_in_word_buffer(c->bf, ctx->f2p);
b0 = word_buffer_b0(c->bf);
postag = word_get_pos(b0);
if(ctx->debug_mode){
fprintf(stderr, "***********************************\n");
config_print(stderr, c);
}
/* if postag is not specified in input it is predicted */
if(postag == -1){
/* config_print(stdout, c); */
config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE);
/* feat_vec_print(stdout, fv); */
postag = feature_table_argmax(fv, ft, &max);
/* printf("postag = %d\n", postag); */
if(ctx->debug_mode){
// debug: print the first three entries of the vcode array
vcode *vcode_array = feature_table_get_vcode_array(fv, ft);
for(int i=0; i < 3; i++){
fprintf(stderr, "%d\t", i);
fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score);
}
free(vcode_array);
}
}
// write the word with its tag, then apply the tagging movement to the configuration
print_word(b0, ctx->mcd_struct, dico_pos, postag, outstream);
movement_tagger(c, postag);
}
// closing the stream finalises resultstring
fclose(outstream);
/* config_print(stdout, c); */
feat_vec_free(fv);
feature_table_free(ft);
config_free(c);
fclose(f);
}
/** Writes one word to `outstream`, with the string of `postag` placed in the
    POS column.
    Adapted from simple_decoder_tagger.c (added parameter FILE *outstream).
    @param w          word to print
    @param mcd_struct column description; tells which column holds the POS
    @param dico_pos   dictionary used to turn the POS code into a string
    @param postag     POS code of the word
    @param outstream  stream receiving the output
*/
void MacaonTransTagger::print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag, FILE *outstream) {
    char *buffer = NULL;
    char *token = NULL;
    int col_nb = 0;

    if(mcd_get_pos_col(mcd_struct) == -1){
        /* no POS column declared: original line followed by the tag */
        fprintf(outstream, "%s\t%s\n", w->input, dico_int2string(dico_pos, postag));
    }
    else{
        /* strtok modifies its argument, so the columns are counted on a copy */
        buffer = strdup(w->input);
        token = strtok(buffer, "\t");
        col_nb = 0;
        while(token){
            /* the separator must go to outstream like everything else;
               it used to be written to stdout, corrupting the result */
            if(col_nb != 0) fprintf(outstream, "\t");
            if(col_nb == mcd_get_pos_col(mcd_struct))
                fprintf(outstream, "%s", dico_int2string(dico_pos, postag));
            else
                word_print_col_n(outstream, w, col_nb);
            col_nb++;
            token = strtok(NULL, "\t");
        }
        /* the input had fewer columns than the POS column index: append the tag */
        if(col_nb <= mcd_get_pos_col(mcd_struct))
            fprintf(outstream, "\t%s", dico_int2string(dico_pos, postag));
        fprintf(outstream, "\n");
        free(buffer);
    }
}
#include "context.h"
/** POS tagger wrapper exported through SWIG: reads a sentence in mcf format
    from a string and returns the tagged sentence as a string. */
class MacaonTransTagger {
public:
/**
creates instance, assumes that the environment variable MACAON_DIR
is defined. One instance for each thread has to be created (this means loading resources for each thread).
@param lg language to be used (in the sense of sub-dir in MACAON_DIR)
@param mcd the filename of the mcd definitions
*/
MacaonTransTagger(char *lg, char *mcd);
~MacaonTransTagger();
/** call tagger
@param mcfString a string containing the sentence to be analysed in mcf format.
At least the column form must be present
@return the tagger output. The string is owned by this object: it is
released by the next call to tagmcf() and by the destructor.
*/
const char *tagmcf(const char *mcfString);
/// status flag, presumably 1 when initialisation succeeded
/// NOTE(review): confirm that the constructor assigns it
int initOK;
private:
/// keeps tagger context
context *ctx;
/// keeps last result (or NULL)
char *resultstring;
// import functions which are not available in libtransparse.a or are modified
void decode_tagger_set_linguistic_resources_filenames(context *ctx);
void simple_decoder_tagger(context *ctx, const char *mcf);
void print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag, FILE *stream);
};
......@@ -137,7 +137,7 @@ void simple_decoder_tagger(context *ctx)
feat_vec_free(fv);
feature_table_free(ft);
config_free(c);
fclose(f);
if (ctx->input_filename) fclose(f);
}
#endif
......
#ifndef __SIMPLE_DECODER_TAGGER__
#define __SIMPLE_DECODER_TAGGER__
void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p);
void simple_decoder_tagger(context *ctx);
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment