diff --git a/CMakeLists.txt b/CMakeLists.txt index bd2ed1c48dec24215fc2d5d5afab5ca11ea4884e..6c42da14c375b30c2d46764114c58e8ddd6c6c37 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,17 @@ cmake_minimum_required(VERSION 2.8.7) project(macaon2) -add_definitions("-Wall") + +# for gcc < 5.3 +#add_definitions("-Wall -std=gnu11" ) +add_definitions("-Wall" ) + +# activate with cmake -DMACA_EXPORT=TRUE +# to use macaon in python/java (with swig) +if(MACA_EXPORT) + # swig requires the C/C++ code to be compiled with -fPIC (this variable is only honoured by CMake >= 2.8.9) + set (CMAKE_POSITION_INDEPENDENT_CODE TRUE) +endif() include_directories(maca_common/include) include_directories(perceptron/lib/include) @@ -14,4 +24,8 @@ add_subdirectory(maca_trans_parser) add_subdirectory(maca_crf_tagger) add_subdirectory(maca_graph_parser) +if(MACA_EXPORT) + add_subdirectory(maca_export) +endif() + #set(CMAKE_INSTALL_PREFIX ../) diff --git a/INSTALL b/INSTALL index b6e0cf27d4a76ae5457feb20ce44741336ee5860..f031c1e9cb5a162913d80aa8cdbc7cf557486a6c 100644 --- a/INSTALL +++ b/INSTALL @@ -14,7 +14,10 @@ The basic procedure to build and install macaon from sources is the following. cmake -DCMAKE_BUILD_TYPE=Debug .. If you want to install macaon locally, you can specify the install path with : - cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir + cmake -DCMAKE_INSTALL_PREFIX:PATH=/absolute/path/to/macaon_install_dir .. + + If you want to build the libraries needed to use macaon from Python or Java (requires swig >= 3.0) : + cmake -DMACA_EXPORT=TRUE .. 
- Build the sources with: make
diff --git a/maca_export/CMakeLists.txt b/maca_export/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d9a505059c5c5def347a4001b7625983c9c95707
--- /dev/null
+++ b/maca_export/CMakeLists.txt
@@ -0,0 +1,99 @@
+# Builds the SWIG bindings of the macaon tagger, lemmatizer and parser:
+#   - a Python extension module (_Macaon), when the Python libraries are found
+#   - a JNI library (MacaonJava) and macaon.jar, when Java and JNI are found
+# Both bindings are generated from src/maca_export.i.
+if(MACA_EXPORT)
+  FIND_PACKAGE(SWIG 3.0)
+  if (SWIG_FOUND)
+    # Settings shared by both bindings. They are defined before the
+    # language-specific sections so that the Java binding can be built
+    # even when the Python libraries are missing.
+    set(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/maca_export.i)
+    include_directories(../maca_trans_parser/src)
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
+
+    FIND_PACKAGE(PythonLibs)
+
+    if(PYTHONLIBS_FOUND)
+      set(PYTHON_MODULE_NAME Macaon)
+
+      include_directories(${PYTHON_INCLUDE_PATH})
+
+      # Generate the Python wrapper. The swig executable located by
+      # FIND_PACKAGE(SWIG) is used, and the interface file is addressed
+      # through the source directory so that any build directory works.
+      add_custom_command(
+        DEPENDS ${SOURCES}
+        OUTPUT maca_export_py.cc
+        COMMAND ${SWIG_EXECUTABLE} -python -c++ -o maca_export_py.cc ${SOURCES}
+      )
+
+      # Specify the lib
+      add_library(${PYTHON_MODULE_NAME} SHARED
+        maca_export_py.cc
+        src/maca_trans_tagger_export.cc
+        src/maca_lemmatizer_export.cc
+        src/maca_trans_parser_export.cc
+      )
+
+      # the Python extension module has to be named _Macaon
+      set_target_properties(${PYTHON_MODULE_NAME} PROPERTIES PREFIX _)
+      target_link_libraries(${PYTHON_MODULE_NAME} transparse maca_common perceptron)
+    else()
+      message("pythonlibs not installed on your system")
+    endif()
+
+
+
+    FIND_PACKAGE(Java 1.7)
+    FIND_PACKAGE(JNI)
+    if (JNI_FOUND AND Java_FOUND)
+      set(JAVA_MODULE_NAME Macaon)
+      set(JAVA_LIBRARY MacaonJava)
+      set(JAVA_CLASS_TAGGER MacaonTransTagger)
+      set(JAVA_CLASS_LEMMATIZER MacaonTransLemmatizer)
+      set(JAVA_CLASS_TRANSPARSER MacaonTransParser)
+      set(JAVA_PACKAGE lif)
+      set(JAR_FILENAME macaon)
+
+      #set(ADDITIONNAL_JAVA_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/MacaonTransParserWrapper.java)
+
+      # Add include directories
+      include_directories(${JAVA_INCLUDE_PATH} ${JAVA_INCLUDE_PATH2})
+
+      # Run swig executable to generate java code
+      add_custom_command(
+        DEPENDS ${SOURCES}
+        OUTPUT maca_export_java.cc
+        COMMAND ${SWIG_EXECUTABLE} -java -package ${JAVA_PACKAGE} -c++ -o maca_export_java.cc ${SOURCES}
+      )
+
+      add_library(${JAVA_LIBRARY} SHARED
+        maca_export_java.cc
+        src/maca_trans_tagger_export.cc
+        src/maca_lemmatizer_export.cc
+        src/maca_trans_parser_export.cc
+      )
+
+      target_link_libraries(${JAVA_LIBRARY} transparse maca_common perceptron)
+
+      # Compile the generated Java classes once the JNI library is built.
+      # DEPENDS is not a keyword of the TARGET signature of
+      # add_custom_command(), so it is not used here.
+      string(REGEX REPLACE "[.]" "/" JAVA_PACKAGE_DIR ${JAVA_PACKAGE})
+      add_custom_command(
+        TARGET ${JAVA_LIBRARY} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${JAVA_PACKAGE_DIR}
+        COMMAND ${Java_JAVAC_EXECUTABLE} -encoding utf8 -d ${CMAKE_CURRENT_BINARY_DIR}
+          ${JAVA_MODULE_NAME}JNI.java
+          ${JAVA_CLASS_TAGGER}.java
+          ${JAVA_CLASS_LEMMATIZER}.java
+          ${JAVA_CLASS_TRANSPARSER}.java
+          ${JAVA_MODULE_NAME}.java
+          ${ADDITIONNAL_JAVA_FILES}
+      )
+      # Pack the compiled classes of the package into ${JAR_FILENAME}.jar
+      add_custom_command(
+        TARGET ${JAVA_LIBRARY} POST_BUILD
+        COMMAND ${Java_JAR_EXECUTABLE} -cvf ${JAR_FILENAME}.jar -C ${CMAKE_CURRENT_BINARY_DIR} ${JAVA_PACKAGE_DIR}
+        #WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        COMMENT "Building ${JAR_FILENAME}"
+        VERBATIM
+      )
+    else()
+      message("Java JNI support not installed")
+    endif()
+  else()
+    message("swig >= 3.0 not installed on your system")
+  endif()
+endif()
+
+
diff --git a/maca_export/example/example.java b/maca_export/example/example.java
new file mode 100644
index 0000000000000000000000000000000000000000..25816d5a00dac7c03acc2ec6be4ea03f30df66e7
--- /dev/null
+++ b/maca_export/example/example.java
@@ -0,0 +1,65 @@
+import lif.*;
+/** example to use the macaon parser with java
+    compile (in maca_data2)
+    javac -cp ../macaon2/build_debug/maca_export/macaon.jar ../macaon2/maca_export/example/example.java
+    run
+    java -cp 
../macaon2/build_debug/maca_export/macaon.jar:../macaon2/maca_export/example -Djava.library.path=../macaon2/build_debug/maca_export/ example
+ */
+public class example {
+    public static void main(String []args) {
+        System.loadLibrary("MacaonJava"); // loads the JNI library "MacaonJava" (libMacaonJava.so on Linux), searched in java.library.path
+
+        /* alternative input which already contains form, part of speech and lemma
+        StringBuilder mcf1 = new StringBuilder("La D le\n");
+        mcf1.append("pose N pose\n");
+        mcf1.append("d' P de\n");
+        mcf1.append("un D un\n");
+        mcf1.append("panneau N panneau\n");
+        mcf1.append("stop N stop\n");
+        mcf1.append("paraît V paraître\n");
+        mcf1.append("être V être\n");
+        mcf1.append("la D le\n");
+        mcf1.append("formule N formule\n");
+        mcf1.append("la D le\n");
+        mcf1.append("mieux ADV mieux\n");
+        mcf1.append("adaptée A adapté\n");
+        mcf1.append("pour P pour\n");
+        mcf1.append("assurer V assurer\n");
+        mcf1.append("la D le\n");
+        mcf1.append("sécurité N sécurité\n");
+        mcf1.append("des P+D de\n");
+        mcf1.append("usagers N usager\n");
+        mcf1.append(". PONCT .\n");*/
+
+
+        // input sentence: one word form per line
+        StringBuilder mcf1 = new StringBuilder("La\n");
+        mcf1.append("pose\n");
+        mcf1.append("d'\n");
+        mcf1.append("un\n");
+        mcf1.append("panneau\n");
+        mcf1.append("stop\n");
+        mcf1.append("paraît\n");
+        mcf1.append("être\n");
+        mcf1.append("la\n");
+        mcf1.append("formule\n");
+        mcf1.append("la\n");
+        mcf1.append("mieux\n");
+        mcf1.append("adaptée\n");
+        mcf1.append("pour\n");
+        mcf1.append("assurer\n");
+        mcf1.append("la\n");
+        mcf1.append("sécurité\n");
+        mcf1.append("des\n");
+        mcf1.append("usagers\n");
+        mcf1.append(".\n");
+
+        // one instance per tool, all created with the same language ("jh-seq") and mcd file
+        MacaonTransTagger mt = new MacaonTransTagger("jh-seq", "jh-seq/eval/wplgfs.mcd");
+        MacaonTransLemmatizer ml = new MacaonTransLemmatizer("jh-seq", "jh-seq/eval/wplgfs.mcd");
+        MacaonTransParser mp = new MacaonTransParser("jh-seq", "jh-seq/eval/wplgfs.mcd");
+
+        // the tools are chained: the tagger output is the lemmatizer input,
+        // and the lemmatizer output is the parser input
+        String tags = mt.tagmcf(mcf1.toString());
+        System.out.println(tags);
+        String lemmas = ml.lemmatizemcf(tags);
+        System.out.println(lemmas);
+        System.out.println(mp.parsemcf(lemmas));
+    }
+}
diff --git 
a/maca_export/example/example.py b/maca_export/example/example.py
new file mode 100755
index 0000000000000000000000000000000000000000..53216d40d64f6d9932d47c5fb9cea98e279c47af
--- /dev/null
+++ b/maca_export/example/example.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+
+import os
+currentdir = os.path.dirname(os.path.abspath(__file__))
+
+# make the SWIG-generated module importable; this expects the build
+# directory to be <macaon2>/build_debug
+import sys
+sys.path.append(currentdir + "/../../build_debug/maca_export")
+
+
+import Macaon
+
+# for this example you should be in maca_data2
+# (the language and mcd arguments below are given relative to that directory)
+mt = Macaon.MacaonTransTagger("jh-seq", "jh-seq/eval/wplgfs.mcd")
+ml = Macaon.MacaonTransLemmatizer("jh-seq", "jh-seq/eval/wplgfs.mcd")
+mp = Macaon.MacaonTransParser("jh-seq", "jh-seq/eval/wplgfs.mcd")
+
+# sentence with word forms only (one per line)
+mcf="""La
+grosse
+souris
+verte
+a
+mangé
+le
+bon
+fromage
+hier
+soir"""
+
+# sentence with form and part of speech
+mcf0 ="""La D
+pose N
+d' P
+un D
+panneau N
+stop N
+paraît V"""
+
+# sentences with form, part of speech and lemma
+mcf1 = """La D le
+pose N pos
+d' P de
+un D un
+panneau N panneau
+stop N stop
+paraît V paraître
+être V être
+la D le
+formule N formule
+la D le
+mieux ADV mieux
+adaptée A adapté
+pour P pour
+assurer V assurer
+la D le
+sécurité N sécurité
+des P+D de
+usagers N usager
+. PONCT ."""
+
+
+mcf2 = """Une D un
+réflexion N réflexion
+commune A commun
+est V être
+menée V mener
+avec P avec
+les D le
+enseignants N enseignant
+et C et
+les D le
+délégués N délégué
+de P de
+parents N parent
+d' P de
+élèves N élève
+, PONCT ,
+sous P sous
+la D le
+conduite N conduite
+du P+D de
+CAUE N CAUE
+. 
PONCT ."""
+
+#print mp.parsemcf(mcf1)
+
+#print mp.parsemcf(mcf2)
+#print ml.lemmatizemcf(mcf0)
+
+# chain the three tools; print() with a single argument behaves the same
+# under Python 2 and Python 3
+tags = mt.tagmcf(mcf)
+print(tags)
+lemmas = ml.lemmatizemcf(tags)
+print(lemmas)
+print(mp.parsemcf(lemmas))
+
+del mp
+del ml
+del mt
+
+
diff --git a/maca_export/src/maca_export.i b/maca_export/src/maca_export.i
new file mode 100644
index 0000000000000000000000000000000000000000..0e425ac6e17b5d2d13232a6e1bffe2a9197921b1
--- /dev/null
+++ b/maca_export/src/maca_export.i
@@ -0,0 +1,30 @@
+%module Macaon
+
+%{
+#include <maca_trans_tagger_export.h>
+#include <maca_lemmatizer_export.h>
+#include <maca_trans_parser_export.h>
+%}
+
+class MacaonTransTagger {
+ public:
+    MacaonTransTagger(char *lg, char *mcd);
+    ~MacaonTransTagger();
+    const char *tagmcf(const char *mcf);
+};
+
+
+class MacaonTransLemmatizer {
+ public:
+    MacaonTransLemmatizer(char *lg, char *mcd);
+    ~MacaonTransLemmatizer();
+    const char *lemmatizemcf(const char *mcf);
+};
+
+
+class MacaonTransParser {
+ public:
+    MacaonTransParser(char *lg, char *mcd);
+    ~MacaonTransParser();
+    const char *parsemcf(const char *mcf);
+};
diff --git a/maca_export/src/maca_lemmatizer_export.cc b/maca_export/src/maca_lemmatizer_export.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af4ee502b7fa7448bd08d05829a0045de8663d32
--- /dev/null
+++ b/maca_export/src/maca_lemmatizer_export.cc
@@ -0,0 +1,213 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "context.h"
+#include "feat_fct.h"
+#include "config2feat_vec.h"
+#include "feature_table.h"
+#include "dico.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#include "maca_lemmatizer_export.h"
+
+/** builds the context from the options "-L lg -C mcd" and loads the
+    form/pos -> lemma table (fplm file) into form_pos_ht and lemma_array
+    @param lg language to be used
+    @param mcd the filename of the mcd definitions
+ */
+MacaonTransLemmatizer::MacaonTransLemmatizer(char *lg, char *mcd) {
+    char * argv[] = { (char *)"initParser",
+                      (char *)"-L", lg,
+                      (char *)"-C", mcd,
+                      0
+    };
+
+    // initOK is a public member: give it a defined value
+    initOK = 1;
+    resultstring = NULL;
+
+    ctx = context_read_options(5, argv);
+    form_pos_ht = hash_new(1000000);
+
+    //maca_lemmatizer_check_options(ctx);
+    maca_lemmatizer_set_linguistic_resources_filenames(ctx);
+
+    lemma_array = read_fplm_file(ctx->fplm_filename, form_pos_ht, ctx->debug_mode, &lemma_array_size);
+}
+
+/** releases the hash table, the lemma table, the context and the last result */
+MacaonTransLemmatizer::~MacaonTransLemmatizer() {
+    hash_free(form_pos_ht);
+
+    for(int i=0; i<lemma_array_size; ++i) {
+        if (lemma_array[i]) free(lemma_array[i]);
+    }
+    free(lemma_array);
+    context_free(ctx);
+    if (resultstring != NULL) {
+        free(resultstring);
+        resultstring = NULL;
+    }
+}
+
+/** lemmatizes a sentence given in mcf format
+    @param mcfString the sentence, read through a FILE * via fmemopen()
+    @return the lemmatized sentence; the buffer belongs to this instance and is
+    released by the next call or by the destructor. NULL is returned when
+    mcfString is NULL or when a memory stream cannot be opened
+ */
+const char *MacaonTransLemmatizer::lemmatizemcf(const char *mcfString) {
+    /* forget the previous result first, so that a failure below can never
+       hand out the answer of an earlier call */
+    if (resultstring != NULL) {
+        free(resultstring);
+        resultstring = NULL;
+    }
+
+    if (mcfString == NULL) return NULL;
+
+    /* fmemopen() is allowed to fail on a zero-length buffer (glibc before
+       2.22 does): an empty input simply gives an empty result */
+    if (mcfString[0] == '\0') {
+        resultstring = strdup("");
+        return resultstring;
+    }
+
+    FILE *f = fmemopen (const_cast<char *>(mcfString), strlen(mcfString), "r");
+    if (f == NULL) return NULL;
+
+    size_t size;
+    FILE *outstream = open_memstream (&resultstring, &size);
+    if (outstream == NULL) {
+        fclose(f);
+        resultstring = NULL;
+        return NULL;
+    }
+
+    config *c = config_new(f, ctx->mcd_struct, 5);
+    while(!config_is_terminal(c)){
+        word *b0 = word_buffer_b0(c->bf);
+        word_sprint_col_n(lemma, b0, mcd_get_lemma_col(ctx->mcd_struct));
+        word_sprint_col_n(form, b0, mcd_get_form_col(ctx->mcd_struct));
+        word_sprint_col_n(pos, b0, mcd_get_pos_col(ctx->mcd_struct));
+
+        /* if lemma is not specified in input it is looked up */
+        if(strlen(lemma) && strcmp(lemma, "_"))
+            print_word(b0, ctx->mcd_struct, lemma, outstream);
+        else
+            print_word(b0, ctx->mcd_struct, lookup_lemma(form, pos, form_pos_ht, lemma_array, ctx->verbose), outstream);
+
+        word_buffer_move_right(c->bf);
+    }
+    fclose(f);
+    config_free(c);
+    /* closing the memory stream makes resultstring point to the final buffer */
+    fclose(outstream);
+    return resultstring;
+}
+/** taken from maca_trans_lemmatizer and added FILE * outstream to write the result to
+    Writes the columns of w to outstream; the lemma column (when the mcd
+    defines one) is replaced by lemma, otherwise lemma is appended.
+ */
+void MacaonTransLemmatizer::print_word(word *w, mcd *mcd_struct, char *lemma, FILE *outstream) {
+    char *buffer = NULL;
+    char *token = NULL;
+    char *saveptr = NULL;
+    int col_nb = 0;
+
+    if(mcd_get_lemma_col(mcd_struct) == -1){
+        fprintf(outstream, "%s\t%s\n", w->input, lemma);
+    }
+    else{
+        buffer = strdup(w->input);
+        /* strtok_r() instead of strtok(): strtok() keeps its position in
+           hidden global state, which is not safe when several instances
+           run in different threads */
+        token = strtok_r(buffer, "\t", &saveptr);
+        col_nb = 0;
+        while(token){
+            if(col_nb != 0) fprintf(outstream, "\t");
+            if(col_nb == mcd_get_lemma_col(mcd_struct))
+                fprintf(outstream, "%s", lemma);
+            else
+                word_print_col_n(outstream, w, col_nb);
+            col_nb++;
+            token = strtok_r(NULL, "\t", &saveptr);
+        }
+        /* the input has fewer columns than the position of the lemma column */
+        if(col_nb <= mcd_get_lemma_col(mcd_struct))
+            fprintf(outstream, "\t%s", lemma);
+        fprintf(outstream, "\n");
+        free(buffer);
+    }
+}
+
+
+/** taken from maca_lemmatizer.c since it is not included in libtransparse.a
+    Sets ctx->fplm_filename to <maca_data_path> + DEFAULT_FPLM_FILENAME when no
+    fplm file has been given. The name is built with snprintf(), so it is
+    cut rather than written past the buffer, and a missing maca_data_path is
+    treated as an empty prefix.
+ */
+void MacaonTransLemmatizer::maca_lemmatizer_set_linguistic_resources_filenames(context *ctx) {
+    char absolute_filename[500];
+
+    if(!ctx->fplm_filename){
+        snprintf(absolute_filename, sizeof(absolute_filename), "%s%s",
+                 ctx->maca_data_path ? ctx->maca_data_path : "", DEFAULT_FPLM_FILENAME);
+        ctx->fplm_filename = strdup(absolute_filename);
+    }
+
+    if(ctx->verbose){
+        fprintf(stderr, "fplm_filename = %s\n", ctx->fplm_filename);
+    }
+}
+
+/** taken from maca_trans_lemmatizer.c since it is not included in libtransparse.a
+    Reads the fplm file (form, pos, lemma, morpho separated by tabs). Entry
+    number n is stored as "form/pos" -> n in form_pos_ht and its lemma in
+    the returned array at index n.
+    @param lemma_array_size receives the number of slots of the returned
+    array; unused slots are NULL
+    @return the lemma array
+ */
+char **MacaonTransLemmatizer::read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size) {
+    char form[1000];
+    char pos[1000];
+    char lemma[1000];
+    char morpho[1000];
+    char form_pos_key[2000]; /* form + "/" + pos */
+    int num = 0;
+    char **lemma_array;
+    char buffer[10000];
+    int fields_nb;
+    FILE *f= myfopen(fplm_filename, "r");
+
+    *lemma_array_size = 10000;
+    lemma_array = (char **)memalloc((*lemma_array_size) * sizeof(char *));
+    /* the destructor frees every non NULL slot: no slot may stay uninitialized */
+    for(int i=0; i<*lemma_array_size; ++i) {
+        lemma_array[i] = NULL;
+    }
+
+    while(fgets(buffer, 10000, f)){
+        /* defined content for the debug message below, even if sscanf()
+           stops before filling all fields */
+        form[0] = pos[0] = lemma[0] = morpho[0] = '\0';
+        /* field widths keep overlong fields inside the 1000 byte buffers */
+        fields_nb = sscanf(buffer, "%999[^\t]\t%999s\t%999[^\t]\t%999s\n", form, pos, lemma, morpho);
+        /* if(!strcmp(form, "d")) */
+        /* fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma); */
+        if(fields_nb != 4){
+            if(debug_mode){
+                fprintf(stderr, "form = %s pos = %s lemma = %s\n", form, pos, lemma);
+                fprintf(stderr, "incorrect fplm entry, skipping it\n");
+            }
+            continue;
+        }
+
+        /* grow the array before the entry is registered, so that a failed
+           realloc() never leaves a hash entry without its lemma */
+        if(num >= *lemma_array_size){
+            int new_size = 2 * (*lemma_array_size) + 1;
+            char **grown = (char **)realloc(lemma_array, new_size * sizeof(char *));
+            if(grown == NULL){
+                fprintf(stderr, "out of memory while reading %s, only %d entries loaded\n", fplm_filename, num);
+                break;
+            }
+            lemma_array = grown;
+            // initialize in order to be able to free correctly at the end
+            for(int i=num; i<new_size; ++i) {
+                lemma_array[i] = NULL;
+            }
+            *lemma_array_size = new_size;
+        }
+
+        snprintf(form_pos_key, sizeof(form_pos_key), "%s/%s", form, pos);
+        // TODO: memory leak: if form is already in the hash, it is not added and the memory
+        // allocated by strdup() is leaked
+        // solutions: hash_add does the strdup() if necessary (check else where !)
+        // or return code to indicate whether form has been added or not
+        hash_add(form_pos_ht, strdup(form_pos_key), num);
+
+        /* if(lemma_array[num] == NULL) */
+        lemma_array[num] = strdup(lemma);
+        num++;
+    }
+    /* fprintf(stderr, "%d entries loaded\n", num); */
+    fclose(f);
+    return lemma_array;
+}
+/** taken from maca_trans_lemmatizer.c since it is not included in libtransparse.a
+    Looks up "form/pos" in the hash table, then the same key with the form in
+    lower case.
+    @return the lemma found, or form itself when there is no entry
+ */
+char *MacaonTransLemmatizer::lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose) {
+    //char form_pos[1000];
+    int index_form_pos;
+
+    /* form_pos is a member buffer; snprintf() keeps the key inside it */
+    snprintf(form_pos, sizeof(form_pos), "%s/%s", form, pos);
+    index_form_pos = hash_get_val(form_pos_ht, form_pos);
+
+
+    if(index_form_pos != HASH_INVALID_VAL) /* couple form/pos found in the hash table */
+        return lemma_array[index_form_pos];
+
+    snprintf(form_pos, sizeof(form_pos), "%s", form);
+    to_lower_string(form_pos); /* change form to lower case and look it up again */
+    size_t form_len = strlen(form_pos);
+    snprintf(form_pos + form_len, sizeof(form_pos) - form_len, "/%s", pos);
+    index_form_pos = hash_get_val(form_pos_ht, form_pos);
+    if(index_form_pos != HASH_INVALID_VAL)
+        return lemma_array[index_form_pos];
+
+    /* even in lower case couple form/pos is not found, return the form as lemma */
+    if(verbose)
+        fprintf(stderr, "cannot find an entry for %s %s\n", form, pos);
+
+    return form;
+}
diff --git a/maca_export/src/maca_lemmatizer_export.h b/maca_export/src/maca_lemmatizer_export.h
new file mode 100644
index 0000000000000000000000000000000000000000..afa63bc1071a69a8e52df08ba1985cb540ef83c7
--- /dev/null
+++ b/maca_export/src/maca_lemmatizer_export.h
@@ -0,0 +1,49 @@
+#include "context.h"
+
+#ifndef MACA_LEMMATIZER_EXPORT_H
+#define MACA_LEMMATIZER_EXPORT_H
+
+/** Lemmatizer of macaon, usable from the swig generated bindings.
+    An instance owns its context, its form/pos -> lemma table and the buffer of
+    the last result, so it cannot be copied.
+ */
+class MacaonTransLemmatizer {
+ public:
+    /**
+       creates instance, assumes that the environment variable MACAON_DIR
+       is defined. One instance for each thread has to be created (this means loading resources for each thread).
+       @param lg language to be used (in the sense of sub-dir in MACAON_DIR)
+       @param mcd the filename of the mcd definitions
+     */
+    MacaonTransLemmatizer(char *lg, char *mcd);
+
+    ~MacaonTransLemmatizer();
+
+    /** call lemmatizer
+        @param mcfString a string containing the sentence to be analysed in mcf format
+        (at least the columns form, pos must be present)
+        @return the lemmatizer output. The string is kept by this instance
+        ("keeps last result"), so it must be copied if it is needed after the
+        next call or after the instance is deleted
+     */
+    const char *lemmatizemcf(const char *mcfString);
+
+    /// initialisation status, 1 unless set otherwise
+    int initOK = 1;
+
+ private:
+    /// the members below are owning raw pointers: a copy would free them twice
+    MacaonTransLemmatizer(const MacaonTransLemmatizer &) = delete;
+    MacaonTransLemmatizer &operator=(const MacaonTransLemmatizer &) = delete;
+
+    /// keeps parser context
+    context *ctx = NULL;
+    /// keeps last result (or NULL)
+    char *resultstring = NULL;
+
+    /// variables used during lemmatization
+    char form_pos[1000];
+    char lemma[200];
+    char form[200];
+    char pos[200];
+
+    /// variables to stock data
+    hash *form_pos_ht = NULL;
+    char **lemma_array = NULL;
+    /// number of slots of lemma_array
+    int lemma_array_size = 0;
+
+    void maca_lemmatizer_set_linguistic_resources_filenames(context *ctx);
+    char **read_fplm_file(char *fplm_filename, hash *form_pos_ht, int debug_mode, int *lemma_array_size);
+    void print_word(word *w, mcd *mcd_struct, char *lemma, FILE *stream);
+    char *lookup_lemma(char *form, char *pos, hash *form_pos_ht, char **lemma_array, int verbose);
+};
+
+#endif /* MACA_LEMMATIZER_EXPORT_H */
+
+
+
diff --git a/maca_export/src/maca_trans_parser_export.cc b/maca_export/src/maca_trans_parser_export.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7749f7033456741d6655966048904231010294a9
--- /dev/null
+++ b/maca_export/src/maca_trans_parser_export.cc
@@ -0,0 +1,330 @@
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "context.h"
+#include "simple_decoder_parser_arc_eager.h"
+#include "movement_parser_arc_eager.h"
+#include "feat_fct.h"
+#include "config2feat_vec.h"
+#include "feature_table.h"
+#include "dico.h"
+
+#ifdef 
__cplusplus
+}
+#endif
+
+#include "maca_trans_parser_export.h"
+
+/** initialises class variables
+    resultstring: which keeps last result
+    ctx: current context, built from the options "-L lg -C mcd"
+    initOK: set to 0 when the label dictionary "LABEL" cannot be found;
+    parsemcf() refuses to work in that case
+ */
+MacaonTransParser::MacaonTransParser(char *lg, char *mcd) {
+    resultstring = NULL;
+    initOK = 1;
+    char * argv[] = { (char *)"initParser",
+                      (char *)"-L", lg,
+                      (char *)"-C", mcd,
+                      0
+    };
+
+    ctx = context_read_options(5, argv);
+
+
+    set_linguistic_resources_filenames_parser(ctx);
+    ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose);
+    ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio);
+
+    mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose);
+    ctx->dico_labels = dico_vec_get_dico(ctx->vocabs, (char *)"LABEL");
+
+    if(ctx->dico_labels == NULL){
+        fprintf(stderr, "cannot find label names\n");
+        initOK = 0;
+    }
+    else{
+        /* only computed when the dictionary exists: it was dereferenced
+           unconditionally before, right after having been found NULL */
+        ctx->mvt_nb = ctx->dico_labels->nbelem * 2 + 3;
+    }
+
+    /* load models */
+    ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features");
+}
+
+/** releases the last result and the context */
+MacaonTransParser::~MacaonTransParser() {
+    if (resultstring != NULL) {
+        free(resultstring);
+        resultstring = NULL;
+    }
+    context_free(ctx);
+}
+
+
+/** parses a sentence given in mcf format
+    @return the parser output, kept by this instance until the next call or
+    the destruction of the instance; NULL when the instance could not be
+    initialised (initOK == 0) or when mcf is NULL
+ */
+const char *MacaonTransParser::parsemcf(const char *mcf) {
+    if (!initOK || mcf == NULL) {
+        /* do not hand out the result of an earlier call */
+        if (resultstring != NULL) {
+            free(resultstring);
+            resultstring = NULL;
+        }
+        return NULL;
+    }
+    simple_decoder_parser_arc_eager_str(ctx, mcf);
+    return resultstring;
+}
+
+
+/** taken from maca_trans_parser.c, since this function is not in the
+    libtransparse.a library
+    Gives every resource filename which has not been set its default value,
+    <maca_data_path> + default name. The names are built with snprintf(), so
+    an overlong path is cut rather than written past the buffers.
+ */
+void MacaonTransParser::set_linguistic_resources_filenames_parser(context *ctx) {
+    char absolute_filename[500];
+    /* prefix of all default names, empty when no data path is known */
+    const char *absolute_path = ctx->maca_data_path ? ctx->maca_data_path : "";
+
+    if(!ctx->perc_model_filename){
+        snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", absolute_path, DEFAULT_MODEL_FILENAME);
+        ctx->perc_model_filename = strdup(absolute_filename);
+    }
+
+    if(!ctx->vocabs_filename){
+        snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", absolute_path, DEFAULT_VOCABS_FILENAME);
+        ctx->vocabs_filename = strdup(absolute_filename);
+    }
+
+    /*  if(!ctx->mcd_filename){
+        strcpy(absolute_filename, absolute_path);
+        strcat(absolute_filename, DEFAULT_MULTI_COL_DESC_FILENAME);
+        ctx->mcd_filename = strdup(absolute_filename);
+        }*/
+
+    if(!ctx->features_model_filename){
+        snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", absolute_path, DEFAULT_FEATURES_MODEL_FILENAME);
+        ctx->features_model_filename = strdup(absolute_filename);
+    }
+
+    if(ctx->verbose){
+        fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename);
+        fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename);
+        fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename);
+        fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename);
+    }
+}
+
+
+/** taken from simple_decode_parser_arc_eager.c and modified in order to
+    take an input string (in mcf format) which is read through a FILE * via fmemopen()
+    instead of reading a file or stdin. 
+ It writes the result to a FILE * opened with open_memstream() in order to get the result in a char * + */ +void MacaonTransParser::simple_decoder_parser_arc_eager_str(context *ctx, const char *mcfString) { + FILE *f = fmemopen ((void *)mcfString, strlen(mcfString), "r"); + + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int root_label; + int mvt_code; + int mvt_type; + int mvt_label; + float max; + feat_vec *fv = feat_vec_new(feature_types_nb); + config *c = NULL; + int result; + /* float entropy; */ + /* float delta; */ + int argmax1, argmax2; + float max1, max2; + int index; + + root_label = dico_string2int(ctx->dico_labels, ctx->root_label); + if(root_label == -1) root_label = 0; + + c = config_new(f, ctx->mcd_struct, 5); + while(!config_is_terminal(c)){ + + if(ctx->debug_mode){ + fprintf(stdout, "***********************************\n"); + config_print(stdout, c); + } + /* forced EOS (the element on the top of the stack is eos, but the preceding movement is not MVT_PARSER_EOS */ + /* which means that the top of the stack got its eos status from input */ + /* force the parser to finish parsing the sentence (perform all pending reduce actions) and determine root of the sentence */ + + if((word_get_sent_seg(stack_top(config_get_stack(c))) == 1) && (mvt_get_type(mvt_stack_top(config_get_history(c))) != MVT_PARSER_EOS)){ + word_set_sent_seg(stack_top(config_get_stack(c)), -1); + movement_parser_eos(c); + while(movement_parser_reduce(c)); + while(movement_parser_root(c, root_label)); + if(ctx->debug_mode) printf("force EOS\n"); + } + + /* normal behavious, ask classifier what is the next movement to do and do it */ + else{ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + mvt_code = feature_table_argmax(fv, ft, &max); + + if(ctx->debug_mode){ + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + for(int i=0; i < 3; i++){ + printf("%d\t", i); + movement_parser_print(stdout, 
vcode_array[i].class_code, ctx->dico_labels); + printf("\t%.4f\n", vcode_array[i].score); + } + free(vcode_array); + } + + if(ctx->trace_mode){ + index = word_get_index(word_buffer_b0(config_get_buffer(c))); + fprintf(stdout, "%d\t", index); + + stack_print(stdout, c->st); + fprintf(stdout, "\t"); + + movement_parser_print(stdout, mvt_code, ctx->dico_labels); + fprintf(stdout, "\t"); + feature_table_argmax_1_2(fv, ft, &argmax1, &max1, &argmax2, &max2); + printf("%f\n", max1 - max2); + } + + mvt_type = movement_parser_type(mvt_code); + mvt_label = movement_parser_label(mvt_code); + + result = 0; + switch(mvt_type){ + case MVT_PARSER_LEFT : + result = movement_parser_left_arc(c, mvt_label); + break; + case MVT_PARSER_RIGHT: + result = movement_parser_right_arc(c, mvt_label); + break; + case MVT_PARSER_REDUCE: + result = movement_parser_reduce(c); + break; + case MVT_PARSER_ROOT: + result = movement_parser_root(c, root_label); + break; + case MVT_PARSER_EOS: + result = movement_parser_eos(c); + break; + case MVT_PARSER_SHIFT: + result = movement_parser_shift(c); + } + + if(result == 0){ + if(ctx->debug_mode) fprintf(stdout, "WARNING : movement cannot be executed doing a SHIFT instead !\n"); + result = movement_parser_shift(c); + if(result == 0){ /* SHIFT failed no more words to read, let's get out of here ! 
*/ + if(ctx->debug_mode) fprintf(stdout, "WARNING : cannot exectue a SHIFT emptying stack !\n"); + while(!stack_is_empty(config_get_stack(c))) + movement_parser_root(c, root_label); + } + } + } + } + + //if(!ctx->trace_mode) { + size_t size; + if (resultstring != NULL) { + free(resultstring); + resultstring = NULL; + } + FILE *outstream = open_memstream (&resultstring, &size); + print_word_buffer_fp(c, ctx->dico_labels, ctx->mcd_struct, outstream); + fclose(outstream); + + config_free(c); + feat_vec_free(fv); + feature_table_free(ft); + fclose(f); +} + + +/** taken from simple_decode_parser_arc_eager.c and modified in order to write to any FILE* not only stdout */ +void MacaonTransParser::print_word_buffer_fp(config *c, dico *dico_labels, mcd *mcd_struct, FILE *out) { + int i; + word *w; + char *label; + char *buffer = NULL; + char *token = NULL; + int col_nb = 0; + + + for(i=0; i < config_get_buffer(c)->nbelem; i++){ + w = word_buffer_get_word_n(config_get_buffer(c), i); + + if((mcd_get_gov_col(mcd_struct) == -1) + && (mcd_get_label_col(mcd_struct) == -1) + && (mcd_get_sent_seg_col(mcd_struct) == -1)){ + fprintf(out, "%s\t", word_get_input(w)); + fprintf(out, "%d\t", word_get_gov(w)); + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + fprintf(out, "%s\t", label) ; + else + fprintf(out, "_\t"); + if(word_get_sent_seg(w) == 1) + fprintf(out, "1\n") ; + else + fprintf(out, "0\n"); + } + else{ + buffer = strdup(w->input); + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) fprintf(out, "\t"); + if(col_nb == mcd_get_gov_col(mcd_struct)){ + fprintf(out, "%d", word_get_gov(w)); + } + else + if(col_nb == mcd_get_label_col(mcd_struct)){ + label = (word_get_label(w) == -1)? 
NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + fprintf(out, "%s", label) ; + else + fprintf(out, "_"); + } + else + if(col_nb == mcd_get_sent_seg_col(mcd_struct)){ + if(word_get_sent_seg(w) == 1) + fprintf(out, "1") ; + else + fprintf(out, "0"); + } + else{ + word_print_col_n(out, w, col_nb); + } + col_nb++; + token = strtok(NULL, "\t"); + } + if((col_nb <= mcd_get_gov_col(mcd_struct)) || (mcd_get_gov_col(mcd_struct) == -1)){ + fprintf(out, "\t%d", word_get_gov(w)); + } + if((col_nb <= mcd_get_label_col(mcd_struct)) || (mcd_get_label_col(mcd_struct) == -1)){ + label = (word_get_label(w) == -1)? NULL : dico_int2string(dico_labels, word_get_label(w)); + if(label != NULL) + fprintf(out, "\t%s", label) ; + else + fprintf(out, "\t_"); + } + if((col_nb <= mcd_get_sent_seg_col(mcd_struct)) || (mcd_get_sent_seg_col(mcd_struct) == -1)){ + if(word_get_sent_seg(w) == 1) + fprintf(out, "\t1") ; + else + fprintf(out, "\t0"); + } + fprintf(out, "\n"); + free(buffer); + } + } +} + + + + + + + + diff --git a/maca_export/src/maca_trans_parser_export.h b/maca_export/src/maca_trans_parser_export.h new file mode 100644 index 0000000000000000000000000000000000000000..ddf0c1b41692c3fd5721ef74c2d8764419a00244 --- /dev/null +++ b/maca_export/src/maca_trans_parser_export.h @@ -0,0 +1,37 @@ +#include "context.h" + + +class MacaonTransParser { + public: + /** + creates instance of MacaonParser, assumes that the environment variable MACAON_DIR + is defined + @param lg language to be used (in the sense of sub-dir in MACAON_DIR) + @param mcd the filename of the mcd definitions + */ + MacaonTransParser(char *lg, char *mcd); + ~MacaonTransParser(); + /** call dependency parser + @param mcfString a string containing the sentence to be analysed in mcf format + (at least the columns form, pos, lemma must be present + @return the parser output + */ + const char *parsemcf(const char *mcfString); + + int initOK; + + private: + /// keeps parser context + context *ctx; + 
/// keeps last result (or NULL) + char *resultstring; + + + void set_linguistic_resources_filenames_parser(context *ctx); + void print_word_buffer_fp(config *c, dico *dico_labels, mcd *mcd_struct, FILE *out); + void simple_decoder_parser_arc_eager_str(context *ctx, const char *mcfString); +}; + + + + diff --git a/maca_export/src/maca_trans_tagger_export.cc b/maca_export/src/maca_trans_tagger_export.cc new file mode 100644 index 0000000000000000000000000000000000000000..a791372ddf7d6977ed6b34418e0a7fa84514ac5b --- /dev/null +++ b/maca_export/src/maca_trans_tagger_export.cc @@ -0,0 +1,195 @@ +#include <stdio.h> +#include <string.h> + + +#ifdef __cplusplus +extern "C"{ +#endif + +#include "context.h" +#include "config2feat_vec.h" +#include "movement_tagger.h" +#include "simple_decoder_tagger.h" +#ifdef __cplusplus +} +#endif + +#include "maca_trans_tagger_export.h" + +/** Builds the tagger context from the options -L lg -C mcd, fills in the default resource filenames and reads the feature model and the vocabularies. initOK is set to 1 when the feature model, the vocabularies and the d_perceptron_features dictionary are all non-NULL, to 0 otherwise */ +MacaonTransTagger::MacaonTransTagger(char *lg, char *mcd) { + char * argv[] = { (char *)"initParser", + (char *)"-L", lg, + (char *)"-C", mcd, + 0 + }; + + ctx = context_read_options(5, argv); + + decode_tagger_set_linguistic_resources_filenames(ctx); + ctx->features_model = feat_model_read(ctx->features_model_filename, ctx->verbose); + ctx->vocabs = dico_vec_read(ctx->vocabs_filename, ctx->hash_ratio); + mcd_link_to_dico(ctx->mcd_struct, ctx->vocabs, ctx->verbose); + ctx->d_perceptron_features = dico_vec_get_dico(ctx->vocabs, (char *)"d_perceptron_features"); + resultstring = NULL; + initOK = (ctx->features_model != NULL && ctx->vocabs != NULL && ctx->d_perceptron_features != NULL) ? 1 : 0; /* the public flag was left unassigned, so callers read an indeterminate value */ +} + +MacaonTransTagger::~MacaonTransTagger() { + context_free(ctx); + if (resultstring != NULL) { + free(resultstring); + resultstring = NULL; + } +} + +const char *MacaonTransTagger::tagmcf(const char *mcfString) { + if (resultstring != NULL) { + free(resultstring); + resultstring = NULL; + } + if(mcfString != NULL && ctx->beam_width == 1) /* a NULL input would crash in strlen() inside the decoder; NULL is returned instead */ + simple_decoder_tagger(ctx, mcfString); + /* the result stays owned by this object: it is freed by the next call and by the destructor */ + return resultstring; +} + + +/** taken as is from maca_trans_tagger.c */ +void MacaonTransTagger::decode_tagger_set_linguistic_resources_filenames(context 
*ctx) { + /* Fills in every resource filename that is still unset with + maca_data_path followed by the default name of that resource. + The path is built with snprintf so that an overlong maca_data_path is truncated + instead of overflowing absolute_filename (the strcpy and strcat it replaces were unbounded) */ + char absolute_filename[500]; + + if(!ctx->perc_model_filename){ + snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", ctx->maca_data_path, DEFAULT_MODEL_TAGGER_FILENAME); + ctx->perc_model_filename = strdup(absolute_filename); + } + + if(!ctx->vocabs_filename){ + snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", ctx->maca_data_path, DEFAULT_VOCABS_TAGGER_FILENAME); + ctx->vocabs_filename = strdup(absolute_filename); + } + + /* if(!ctx->mcd_filename){ + strcpy(absolute_filename, ctx->maca_data_path); + strcat(absolute_filename, DEFAULT_MULTI_COL_DESC_TAGGER_FILENAME); + ctx->mcd_filename = strdup(absolute_filename); + }*/ + + if(!ctx->features_model_filename){ + snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", ctx->maca_data_path, DEFAULT_FEATURES_MODEL_TAGGER_FILENAME); + ctx->features_model_filename = strdup(absolute_filename); + } + + if(!ctx->f2p_filename){ + snprintf(absolute_filename, sizeof(absolute_filename), "%s%s", ctx->maca_data_path, DEFAULT_F2P_FILENAME); + ctx->f2p_filename = strdup(absolute_filename); + ctx->f2p = form2pos_read(ctx->f2p_filename); + } + + if(ctx->verbose){ + fprintf(stderr, "perc_model_filename = %s\n", ctx->perc_model_filename); + fprintf(stderr, "vocabs_filename = %s\n", ctx->vocabs_filename); + fprintf(stderr, "mcd_filename = %s\n", ctx->mcd_filename); + fprintf(stderr, "perc_features_model_filename = %s\n", ctx->features_model_filename); + fprintf(stderr, "f2p_filename = %s\n", ctx->f2p_filename); + } +} + +/** taken from simple_decoder_tagger.c and modified to read from string and write to string */ +void MacaonTransTagger::simple_decoder_tagger(context *ctx, const char *mcfString) { + config *c; + feat_vec *fv = feat_vec_new(feature_types_nb); + //FILE *f = (ctx->input_filename)? 
myfopen(ctx->input_filename, "r") : stdin; + FILE *f = fmemopen ((void *)mcfString, strlen(mcfString), "r"); + feature_table *ft = feature_table_load(ctx->perc_model_filename, ctx->verbose); + int postag; + float max; + word *b0; + dico *dico_pos = dico_vec_get_dico(ctx->vocabs, (char *)"POS"); + if(f == NULL){ feat_vec_free(fv); feature_table_free(ft); return; } /* fmemopen can fail (out of memory, or an empty input with older C libraries): nothing to read, the result stays NULL */ + c = config_new(f, ctx->mcd_struct, 5); + + size_t size; + if (resultstring != NULL) { + free(resultstring); + resultstring = NULL; + } + FILE *outstream = open_memstream (&resultstring, &size); + if(outstream == NULL){ resultstring = NULL; config_free(c); feat_vec_free(fv); feature_table_free(ft); fclose(f); return; } /* no output stream: release everything, tagmcf then returns NULL */ + while(!config_is_terminal(c)){ + if(ctx->f2p) + /* add_signature_to_words_in_word_buffer(c->bf, ctx->f2p, dico_pos); */ + add_signature_to_words_in_word_buffer(c->bf, ctx->f2p); + + b0 = word_buffer_b0(c->bf); + postag = word_get_pos(b0); + + if(ctx->debug_mode){ + fprintf(stderr, "***********************************\n"); + config_print(stderr, c); + } + + /* if postag is not specified in input it is predicted */ + if(postag == -1){ + /* config_print(stdout, c); */ + config2feat_vec_cff(ctx->features_model, c, ctx->d_perceptron_features, fv, LOOKUP_MODE); + + /* feat_vec_print(stdout, fv); */ + postag = feature_table_argmax(fv, ft, &max); + /* printf("postag = %d\n", postag); */ + + if(ctx->debug_mode){ + vcode *vcode_array = feature_table_get_vcode_array(fv, ft); + for(int i=0; i < 3; i++){ + fprintf(stderr, "%d\t", i); + fprintf(stderr, "%s\t%.4f\n", dico_int2string(dico_pos, vcode_array[i].class_code), vcode_array[i].score); + } + free(vcode_array); + } + } + + print_word(b0, ctx->mcd_struct, dico_pos, postag, outstream); + + movement_tagger(c, postag); + + } + fclose(outstream); + /* config_print(stdout, c); */ + feat_vec_free(fv); + feature_table_free(ft); + config_free(c); + fclose(f); +} + +/** taken from simple_decoder_tagger.c and modified (parameter FILE *outstream) + */ +void MacaonTransTagger::print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag, FILE *outstream) { + char *buffer = NULL; + char *token = NULL; + int col_nb = 0; + 
if(mcd_get_pos_col(mcd_struct) == -1){ /* pos column getter returned -1: the tag is appended after the raw input */ + fprintf(outstream, "%s\t%s\n", w->input, dico_int2string(dico_pos, postag)); + } + else{ + buffer = strdup(w->input); /* strtok writes into its argument, so a private copy of the input is tokenized */ + token = strtok(buffer, "\t"); + col_nb = 0; + while(token){ + if(col_nb != 0) fprintf(outstream, "\t"); /* the separator must go to outstream like the columns; printf sent it to stdout and the result lost its tabs */ + if(col_nb == mcd_get_pos_col(mcd_struct)) + fprintf(outstream, "%s", dico_int2string(dico_pos, postag)); + else + word_print_col_n(outstream, w, col_nb); + col_nb++; + token = strtok(NULL, "\t"); + } + if(col_nb <= mcd_get_pos_col(mcd_struct)) + fprintf(outstream, "\t%s", dico_int2string(dico_pos, postag)); + fprintf(outstream, "\n"); + free(buffer); + } +} diff --git a/maca_export/src/maca_trans_tagger_export.h b/maca_export/src/maca_trans_tagger_export.h new file mode 100644 index 0000000000000000000000000000000000000000..1d3aefe500aa178d27b5e7c6c543de92dde1714e --- /dev/null +++ b/maca_export/src/maca_trans_tagger_export.h @@ -0,0 +1,37 @@ +#include "context.h" + +class MacaonTransTagger { + public: + /** + creates instance, assumes that the environment variable MACAON_DIR + is defined. One instance for each thread has to be created (this means loading resources for each thread). + @param lg language to be used (in the sense of sub-dir in MACAON_DIR) + @param mcd the filename of the mcd definitions + */ + MacaonTransTagger(char *lg, char *mcd); + + ~MacaonTransTagger(); + + /** call tagger + @param mcfString a string containing the sentence to be analysed in mcf format. 
+ At least the column form must be present + @return the parser output + */ + const char *tagmcf(const char *mcfString); + /** public status flag. NOTE(review): presumably non-zero when the instance was initialised successfully; confirm where it is assigned */ + int initOK; + + private: + /// keeps tagger context + context *ctx; + /// keeps last result (or NULL) + char *resultstring; + + // import functions which are not available in libtransparse.a or are modified + void decode_tagger_set_linguistic_resources_filenames(context *ctx); + void simple_decoder_tagger(context *ctx, const char *mcf); + void print_word(word *w, mcd *mcd_struct, dico *dico_pos, int postag, FILE *stream); +}; + + + diff --git a/maca_trans_parser/src/simple_decoder_tagger.c b/maca_trans_parser/src/simple_decoder_tagger.c index 80f1fc37be2679e73302eb958e07818a138ddbec..f54b3c9046d47633299418095ebe18b33a847298 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.c +++ b/maca_trans_parser/src/simple_decoder_tagger.c @@ -137,7 +137,7 @@ void simple_decoder_tagger(context *ctx) feat_vec_free(fv); feature_table_free(ft); config_free(c); - fclose(f); + if (ctx->input_filename) fclose(f); } #endif diff --git a/maca_trans_parser/src/simple_decoder_tagger.h b/maca_trans_parser/src/simple_decoder_tagger.h index 28f4f1091d01c97638b0a891c76176c68042992a..b5d5b2aa019629a560011a9b7f575228a7cb3353 100644 --- a/maca_trans_parser/src/simple_decoder_tagger.h +++ b/maca_trans_parser/src/simple_decoder_tagger.h @@ -1,6 +1,6 @@ #ifndef __SIMPLE_DECODER_TAGGER__ #define __SIMPLE_DECODER_TAGGER__ - +void add_signature_to_words_in_word_buffer(word_buffer *bf, form2pos *f2p); void simple_decoder_tagger(context *ctx); #endif