# Build rules for the maca_graph_parser command-line tools.
#
# C sources (*.c) are compiled with gcc through the explicit %.o rule below;
# C++ sources (*.cc) go through make's built-in rule, which picks up CXXFLAGS.

CFLAGS := -g -Wall -Wno-unused-variable -Wno-unused-but-set-variable -Werror
CXXFLAGS := $(CFLAGS) -std=c++0x
# Targets without a recipe (test_simple_parser) are linked by make's built-in
# rule, which invokes $(CC); the objects include C++ code, hence g++.
CC := g++

SOURCES := array.c hash.c maca_alphabet.cc maca_alphabet_wrapper.cc maca_common.c maca_graph_parser_alphabet.c maca_graph_parser.c maca_graph_parser_conll2007_format.c maca_graph_parser_corpora.c maca_graph_parser_decoder1.c maca_graph_parser_decoder2.c maca_graph_parser_decoder.c maca_graph_parser_dep_count_table.c maca_graph_parser_feature_counter_array.c maca_graph_parser_feature_counter.c maca_graph_parser_features.c maca_graph_parser_feature_table.c maca_graph_parser_feature_vector.c maca_graph_parser_hash.c maca_graph_parser_heapq.c maca_graph_parser_hyperdecoder.c maca_graph_parser_hypergraph.c maca_graph_parser_metrics.c maca_graph_parser_model.c maca_graph_parser_sentence.c maca_mcf.cc maca_mcf_wrapper.cc maca_msg.c simple_parser.cc maca_graph_parser_train.c
OBJ:= $(patsubst %.cc, %.o, $(patsubst %.c, %.o, $(SOURCES)))
EXE := test_simple_parser maca_graph_parser_decode maca_graph_parser_eval maca_graph_parser maca_graph_parser_print_model maca_graph_parser_resize_model maca_graph_parser_train

# "all" and "clean" name actions, not files: without .PHONY a stray file
# called "all" or "clean" in this directory would silently disable them.
.PHONY: all clean

all: $(EXE)

test_simple_parser: test_simple_parser.o $(OBJ)
maca_graph_parser_decode: maca_graph_parser_decode_main.o $(OBJ)
maca_graph_parser_eval: maca_graph_parser_eval_main.o $(OBJ)
maca_graph_parser: maca_graph_parser_main.o $(OBJ)
maca_graph_parser_print_model: maca_graph_parser_print_model_main.o $(OBJ)
maca_graph_parser_resize_model: maca_graph_parser_resize_model_main.o $(OBJ)
maca_graph_parser_train: maca_graph_parser_train_main.o $(OBJ)

# C translation units must be compiled as C, so bypass $(CC) (which is g++).
%.o: %.c
	gcc $(CFLAGS) -c $<

# Every tool X is linked from X_main.o plus the shared objects.
%: %_main.o $(OBJ)
	g++ $(CFLAGS) -o $@ $^

clean:
	rm -f $(EXE) *.o
a/maca_graph_parser/array.c b/maca_graph_parser/array.c new file mode 100644 index 0000000000000000000000000000000000000000..d7c27c11c14b9a3b552692b00fcb4055c9db3d21 --- /dev/null +++ b/maca_graph_parser/array.c @@ -0,0 +1,25 @@ +#include <stdlib.h> +#include "array.h" + +array_t* array_new() { + array_t* array = malloc(sizeof(array_t)); + array->num_elements = 0; + array->data = NULL; + return array; +} + +void array_free(array_t* array) { + if(array->data) free(array->data); + free(array); +} + +ARRAY_TYPE array_get(array_t* array, int element) { + return array->data[element]; +} + +void array_push(array_t* array, ARRAY_TYPE value) { + array->data = realloc(array->data, sizeof(ARRAY_TYPE) * (array->num_elements + 1)); + array->data[array->num_elements] = value; + array->num_elements++; +} + diff --git a/maca_graph_parser/array.h b/maca_graph_parser/array.h new file mode 100644 index 0000000000000000000000000000000000000000..8dcfb344b60deb5d939e6891322a61626fc9630c --- /dev/null +++ b/maca_graph_parser/array.h @@ -0,0 +1,16 @@ +#ifndef __ARRAY__ +#define __ARRAY__ + +#define ARRAY_TYPE char* + +typedef struct { + int num_elements; + ARRAY_TYPE* data; +} array_t; + +array_t* array_new(); +void array_free(array_t* array); +ARRAY_TYPE array_get(array_t* array, int element); +void array_push(array_t* array, ARRAY_TYPE value); + +#endif diff --git a/maca_graph_parser/hash.c b/maca_graph_parser/hash.c new file mode 100644 index 0000000000000000000000000000000000000000..d4a821768b6537afa464ba1a5b8d36ab26f6b050 --- /dev/null +++ b/maca_graph_parser/hash.c @@ -0,0 +1,111 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> + +#include"hash.h" + +cell *cell_new(char *key, int val, cell *next) +{ + cell *c = (cell *)malloc(sizeof(cell)); + c->val = val; + c->key = key; + c->next = next; + return c; +} + +void cell_free(cell *c) +{ + if(c == NULL) return; + cell_free(c->next); + free(c->key); + free(c); +} + + +hash_t *hash_new(int size) +{ + int i; + hash_t *h = 
/*
 * Maps a NUL-terminated key to a bucket index in [0, size).
 *
 * The value is key[0] + sum over i >= 1 of i*i*|key[i]|, reduced modulo size.
 * All arithmetic is done on unsigned integers:
 *   - key[0] is read as an unsigned char, so a byte >= 0x80 (negative where
 *     plain char is signed) can no longer produce a negative result, which
 *     callers would use as a negative array index;
 *   - long keys wrap around instead of overflowing a signed int.
 * Keys made only of 7-bit characters whose sum fits in an int hash exactly as
 * before.
 *
 * key  : NUL-terminated string to hash (must not be NULL).
 * size : number of buckets.
 *
 * Returns the bucket index, or 0 when size <= 0 (instead of dividing by zero).
 */
int hash_func(char *key, int size)
{
  size_t i;
  size_t len;
  unsigned int val;

  if(size <= 0) return 0;

  len = strlen(key);
  val = (unsigned char)key[0];
  for(i=1; i < len; i++)
    val += (unsigned int)(i * i) * (unsigned int)abs(key[i]);
  return (int)(val % (unsigned int)size);
}
**array; +} hash_t; + + +cell *cell_new(char *key, int val, cell *next); +void cell_free(cell *c); + +hash_t *hash_new(int size); +void hash_free(hash_t *h); +cell *hash_lookup(hash_t *h, char *key); +int hash_get_val(hash_t *h, char *key); +void hash_add(hash_t *h, char *key, int val); +void hash_stats(hash_t *h); + + +#endif diff --git a/maca_graph_parser/maca_alphabet.cc b/maca_graph_parser/maca_alphabet.cc new file mode 100644 index 0000000000000000000000000000000000000000..228252ac6d1a92d7195af7ab995e225776fbaec4 --- /dev/null +++ b/maca_graph_parser/maca_alphabet.cc @@ -0,0 +1,259 @@ +#include "maca_alphabet.hh" +#include <sstream> + +// int main(int argc, char **argv) { + +// if (argc - 1 != 1) { +// std::cerr << "Need 1 arg\n"; +// exit(1); +// } + +// std::cout << "This is a test executable!\n"; + +// macaon::Alphabet alphabet("TEST"); + +// alphabet.addSymbol("Orange"); +// int code2 = alphabet.addSymbol("Poire"); + +// int code = alphabet.getCode("Orange"); +// std::cout << "Code of Orange: " << code << std::endl; +// std::cout << "Symbol of " << code2 << " is " << alphabet.getSymbol(code2) << std::endl; +// std::cout << "We can also access it like that: " << alphabet[1] << std::endl; + +// std::string file(argv[1]); +// std::cout << "Loading alphabets from " << file << std::endl; +// macaon::AlphabetArray alphaArray(file); + +// std::cout << "We've loaded " << alphaArray.size() << " alphabets\n"; + +// alphaArray.addAlphabet(&alphabet); + +// if (alphabet == *(alphaArray.getAlphabet("TEST"))) { +// std::cout << "They are equal!\n"; +// } + +// alphaArray.removeAlphabet("WORDS", true); + +// alphaArray.dump(); + +// return 0; +// } + +namespace macaon { + Alphabet::Alphabet(const std::string name, bool loaded /*=false*/): name(name), loaded(loaded) { + nb = 0; + locked = false; + } + + std::string Alphabet::getName() { + return name; + } + + int Alphabet::addSymbol(const std::string symbol) { + int code; + std::map<std::string,int>::iterator it = 
wordCodeMap.find(symbol); + if (it == wordCodeMap.end()) { + if (locked) { + throw std::runtime_error("Tried to add new symbol but alphabet is locked!"); + } + code = nb; + wordCodeMap.insert(std::map<std::string,int>::value_type(symbol,code)); + codeWordArray.push_back(symbol); + nb++; + } else { + code = it->second; + } + + return code; + } + + int Alphabet::getCode(const std::string &symbol) { + std::map<std::string,int>::iterator it = wordCodeMap.find(symbol); + if (it == wordCodeMap.end()) { + throw std::runtime_error("Symbol does not exist in this alphabet!"); + } + return it->second; + } + + std::string &Alphabet::getSymbol(const int &code) { + return codeWordArray.at(code); + } + + int Alphabet::size() { + return nb; + } + + void Alphabet::lock() { + locked = true; + } + + void Alphabet::unlock() { + locked = false; + } + + bool Alphabet::isLoaded() { + return loaded; + } + + void Alphabet::dump(std::ostream &out) { + out << "##" << name << "\n"; + for (std::vector<std::string>::size_type i = 0; i < codeWordArray.size(); i++) { + out << codeWordArray[i] << "\n"; + } + } + + bool Alphabet::operator ==(const Alphabet &a) { + if (name != a.name) + return false; + if (nb != a.nb) + return false; + if (locked != a.locked) + return false; + if (loaded != a.loaded) + return false; + if (codeWordArray != a.codeWordArray) + return false; + return true; + } + + bool Alphabet::operator !=(const Alphabet &a) { + if (name != a.name) + return true; + if (nb != a.nb) + return true; + if (locked != a.locked) + return true; + if (loaded != a.loaded) + return true; + if (codeWordArray != a.codeWordArray) + return true; + return false; + } + + int Alphabet::operator [](const std::string &s) { + return getCode(s); + } + + std::string &Alphabet::operator [](const int &i) { + return getSymbol(i); + } + + AlphabetArray::AlphabetArray() { + nb = 0; + } + + AlphabetArray::AlphabetArray(std::string filename) { + nb = 0; + load(filename); + } + + // AlphabetArray::~AlphabetArray() { 
+ // for (std::map<std::string, Alphabet*>::iterator it_map = alphabets.begin(); + // it_map != alphabets.end(); + // it_map++) { + // // if (createdAlphabets.find(it_map->first) == createdAlphabets.end()) + // // continue; + + // delete it_map->second; + // } + // } + + bool AlphabetArray::addAlphabet(std::shared_ptr<Alphabet> alphabet) { + auto res = alphabets.insert(std::map<std::string,std::shared_ptr<Alphabet>> + ::value_type(alphabet->getName(), alphabet)); + if (res.second) + nb++; + return res.second; + } + + std::shared_ptr<Alphabet> AlphabetArray::removeAlphabet(const std::string name) { + std::map<std::string,std::shared_ptr<Alphabet>>::iterator it = alphabets.find(name); + if (it == alphabets.end()) { + return nullptr; + } + std::shared_ptr<Alphabet> a = it->second; + + alphabets.erase(name); + //createdAlphabets.erase(name); + nb--; + + return a; + } + + std::shared_ptr<Alphabet> AlphabetArray::getAlphabet(const std::string name) { + std::map<std::string, std::shared_ptr<Alphabet>>::iterator it = alphabets.find(name); + if (it == alphabets.end()) { + return nullptr; + } + return it->second; + } + + bool AlphabetArray::has(const std::string name) { + std::map<std::string, std::shared_ptr<Alphabet>>::iterator it = alphabets.find(name); + if (it == alphabets.end()) + return false; + return true; + } + + int AlphabetArray::size() { + return nb; + } + + void AlphabetArray::load(std::string filename) { + std::string symbol; + std::shared_ptr<Alphabet> alphabet(nullptr); + std::ifstream f(filename); + if (!f.is_open()) { + std::stringstream ss; + ss << "Can't open file '" << filename << "' !"; + throw std::runtime_error(ss.str()); + } + + while (f >> symbol) { + std::string alphaBegin = "##"; + if (symbol.compare(0, alphaBegin.length(), alphaBegin) == 0) { + symbol.erase(0,2); + alphabet = std::shared_ptr<Alphabet>(new Alphabet(symbol, true)); + addAlphabet(alphabet); + } else { + if (alphabet != NULL) { + alphabet->addSymbol(symbol); + } + } + } + f.close(); 
+ } + + void AlphabetArray::dump(std::ostream &out) { + std::map<std::string, std::shared_ptr<Alphabet>>::iterator it; + for (it = alphabets.begin(); it != alphabets.end(); it++) { + it->second->dump(out); + } + } + + void AlphabetArray::dump(std::string filename/*=""*/) { + std::ofstream of; + + if (filename != "") { + of.open(filename); + } + + std::ostream &out = (filename != "") ? of : std::cout; + + if (filename != "" && !of.is_open()) { + std::stringstream ss; + ss << "Can't open file '" << filename << "' !"; + throw std::runtime_error(ss.str()); + } + + dump(out); + + if (filename != "") { + of.close(); + } + } + + std::shared_ptr<Alphabet> AlphabetArray::operator [](const std::string &name) { + return getAlphabet(name); + } +} diff --git a/maca_graph_parser/maca_alphabet.hh b/maca_graph_parser/maca_alphabet.hh new file mode 100644 index 0000000000000000000000000000000000000000..2af0d5d4493781e3c219d179ae3045d93a643c0c --- /dev/null +++ b/maca_graph_parser/maca_alphabet.hh @@ -0,0 +1,284 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Jeremy Auguste <jeremy.auguste@etu.univ-amu.fr> + This file is part of maca_common. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_common. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __MACA_ALPHABET_H__ +#define __MACA_ALPHABET_H__ + +#include <iostream> +#include <fstream> +#include <string> +#include <map> +#include <set> +#include <vector> +#include <stdexcept> +#include <memory> + +#define MACA_ALPHABET_INVALID_CODE -1 + +namespace macaon { + class Alphabet { + private: + std::string name; /**< name of the alphabet */ + int nb; /**< number of symbols in the alphabet */ + std::map<std::string, int> wordCodeMap; /**< map where keys are symbols and values their code */ + std::vector<std::string> codeWordArray; /**< vector where indexes are the symbols codes and values are the symbols */ + bool locked; /**< informs if the alphabet is locked or not */ + bool loaded; /**< informs if the alphabet has been loaded from a file */ + + public: + /** + * Constructor for Alphabet. + * + * @param name the name of the alphabet. + * @param loaded true if alphabet is loaded from a file, false otherwise. + */ + Alphabet(const std::string name, bool loaded=false); + + /** + * Gets the name of the alphabet. + * + * + * @return the name of the alphabet. + */ + std::string getName(); + + /** + * Adds a symbol to the alphabet. + * You can only add symbols if the alphabet is unlocked. + * + * @param symbol the symbol to add to the alphabet. + * + * @return a integer representation of the symbol. + * + * @throws runtime_error if the alphabet is locked. + */ + int addSymbol(const std::string symbol); + + /** + * Gets the integer representation of a symbol. + * + * @param symbol the symbol we are looking for. + * + * @return the integer representation of the symbol. + * + * @throws runtime_error if the symbol does not exist. + */ + int getCode(const std::string &symbol); + + /** + * Gets a symbol from its integer representation. + * + * @param code the integer representation of a symbol. + * + * @return the symbol. + * + * @throws out_of_range if the code is invalid. 
+ */ + std::string &getSymbol(const int &code); + + /** + * Gets the number of symbols of the alphabet. + * + * + * @return the number of symbols. + */ + int size(); + + /** + * Locks the alphabet. + * This means that you can't add any additional symbols to the alphabet. + */ + void lock(); + + /** + * Unlocks the alphabet. + * + */ + void unlock(); + + /** + * Returns if the alphabet has been loaded from a file. + * + * + * @return true if loaded from a file, false otherwise. + */ + bool isLoaded(); + + /** + * Dumps the alphabet into an output stream. + * + * @param out the output stream. + */ + void dump(std::ostream &out); + + // Overloading == + bool operator ==(const Alphabet &a); + + // Overloading != + bool operator !=(const Alphabet &a); + + /** + * Same as getCode(). + * + * @return an integer representation. + */ + int operator [](const std::string &s); + + /** + * Same as getSymbol() + * + * @return a symbol. + */ + std::string &operator [](const int &i); + }; + + class AlphabetArray { + private: + std::map<std::string, std::shared_ptr<Alphabet>> alphabets; /**< the stored alphabets */ + int nb; /**< the number of alphabets */ + + public: + /** + * Constructor for AlphabetArray. + * + */ + AlphabetArray(); + + /** + * Constructor for AlphabetArray. + * Also loads alphabets from the specified file. + * + * @param filename the name of the file which contains alphabets. + * + * @throws runtime_error if we couldn't open the file. + */ + AlphabetArray(std::string filename); + + /** + * Adds an alphabet to the array. + * + * @param alphabet the alphabet to add. + * + * @return true if successful, false if an alphabet with the same name is already in the array. + */ + bool addAlphabet(std::shared_ptr<Alphabet> alphabet); + + /** + * Removes an alphabet. + * + * @param name the name of the alphabet to remove. + * + * @return the removed alphabet, or nullptr if it doesn't exist. 
+ */ + std::shared_ptr<Alphabet> removeAlphabet(const std::string name); + + /** + * Gets an alphabet. + * + * @param name the name of the alphabet to get. + * + * @return the wanted alphabet, or nullptr if it doesn't exist. + */ + std::shared_ptr<Alphabet> getAlphabet(const std::string name); + + /** + * Checks if an alphabet has the given name. + * + * @param name the name of the potential alphabet. + * + * @return true if an alphabet was found, false otherwise. + */ + bool has(const std::string name); + + /** + * Get the number of alphabets. + * + * + * @return the number of alphabets. + */ + int size(); + + /** + * Loads alphabets from a file and stores them in the array. + * + * @param filename the name of the file which contains the alphabets. + * + * @throws runtime_error if we couldn't open the file. + */ + void load(std::string filename); + + /** + * Dumps the alphabets in the output stream. + * + * @param out the output stream. + */ + void dump(std::ostream &out); + + /** + * Dumps the alphabets in the given filename. + * + * @param filename the output filename. + * + * @throws runtime_error if we couldn't open the file. + */ + void dump(std::string filename=""); + + + /** + * Same as getAlphabet() + * + * @return an alphabet. 
/**
 * Registry that keeps shared_ptr-managed objects alive while only a raw
 * pointer to them is held elsewhere (the C wrapper hands out raw pointers).
 *
 * There is one pool per type T, obtained through getInstance(). Each raw
 * pointer maps to at most one stored shared_ptr: accepting the same object
 * twice overwrites the entry, and a single release() drops it.
 */
template<class T>
class PtrPool {
public:
  /**
   * Gets the pool for type T (a function-local static instance).
   *
   * @return a reference to the pool.
   */
  static PtrPool<T>& getInstance() {
    static PtrPool<T> instance;
    return instance;
  }

  /**
   * Stores a reference to the object so that it stays alive.
   *
   * @param ptr the shared pointer to keep; may be empty.
   *
   * @return the raw pointer held by `ptr` (nullptr if `ptr` is empty).
   *         An empty pointer is not recorded: there is nothing to keep
   *         alive, and get(nullptr) answers nullptr either way.
   */
  T *accept(std::shared_ptr<T> ptr) {
    T *raw = ptr.get();
    if (raw != nullptr)
      objects[raw] = ptr;
    return raw;
  }

  /**
   * Drops the pool's reference to the object, if it holds one.
   *
   * @param a the raw pointer previously returned by accept().
   */
  void release(T *a) {
    objects.erase(a);
  }

  /**
   * Gets the shared pointer registered for a raw pointer.
   *
   * @param a the raw pointer to look up.
   *
   * @return the stored shared pointer, or nullptr if `a` is not in the pool.
   */
  std::shared_ptr<T> get(T *a) {
    auto it = objects.find(a);
    if (it == objects.end())
      return nullptr;
    return it->second;
  }

private:
  std::map<T*,std::shared_ptr<T>> objects; /**< raw pointer -> owning reference */

  // Constructors are named with the injected class name: a template-id here
  // (PtrPool<T>()) is rejected from C++20 on.
  PtrPool() {}
  PtrPool(PtrPool const&) = delete;
  void operator=(PtrPool const&) = delete;
};
maca_alphabet_get_symbol(maca_alphabet *a, int code, char* name, int size) { + try { + std::string sName = a->getSymbol(code); + strncpy(name, sName.c_str(), size); + return 1; + } catch (std::exception &e) { + return 0; + } + } + + int maca_alphabet_size(maca_alphabet *a) { + return a->size(); + } + + void maca_alphabet_lock(maca_alphabet *a) { + a->lock(); + } + + void maca_alphabet_unlock(maca_alphabet *a) { + a->unlock(); + } + + char maca_alphabet_is_loaded(maca_alphabet *a) { + return a->isLoaded(); + } + + + maca_alphabet_array *maca_alphabet_array_new() { + return new macaon::AlphabetArray(); + } + + maca_alphabet_array *maca_alphabet_array_new_from_file(const char* filename) { + try { + return new macaon::AlphabetArray(filename); + } catch (std::exception &e) { + return NULL; + } + } + + void maca_alphabet_array_delete(maca_alphabet_array *array) { + delete array; + } + + char maca_alphabet_array_add_alphabet(maca_alphabet_array *array, maca_alphabet *a) { + std::shared_ptr<macaon::Alphabet> ptr = PtrPool<macaon::Alphabet>::getInstance().get(a); + return array->addAlphabet(ptr); + } + + maca_alphabet *maca_alphabet_array_remove_alphabet(maca_alphabet_array *array, const char* name, + char free_alphabet) { + std::shared_ptr<macaon::Alphabet> a = array->removeAlphabet(name); + if (free_alphabet) { + return NULL; + } + + return PtrPool<macaon::Alphabet>::getInstance().accept(a); + } + + maca_alphabet *maca_alphabet_array_get_alphabet(maca_alphabet_array *array, const char* name) { + std::shared_ptr<macaon::Alphabet> a = array->getAlphabet(name); + return PtrPool<macaon::Alphabet>::getInstance().accept(a); + } + + char maca_alphabet_array_has_alphabet(maca_alphabet_array *array, const char* name) { + return array->has(name); + } + + int maca_alphabet_array_size(maca_alphabet_array *array) { + return array->size(); + } + + char maca_alphabet_array_load(maca_alphabet_array *array, const char* filename) { + try { + array->load(filename); + return 1; + } catch 
(std::exception &e) { + return 0; + } + } + + char maca_alphabet_array_dump(maca_alphabet_array *array, const char* filename) { + try { + array->dump(filename); + return 1; + } catch (std::exception &e) { + return 0; + } + } + +} diff --git a/maca_graph_parser/maca_alphabet_wrapper.h b/maca_graph_parser/maca_alphabet_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..8d61d3c2538cbd70ed18d3b2e56e2640cdd5febd --- /dev/null +++ b/maca_graph_parser/maca_alphabet_wrapper.h @@ -0,0 +1,249 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + Jeremy Auguste <jeremy.auguste@etu.univ-amu.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#ifndef __MACA_ALPHABET_WRAPPER_H__ +#define __MACA_ALPHABET_WRAPPER_H__ + +#ifdef __cplusplus +extern "C" { + namespace macaon {} + using namespace macaon; +#else + #define MACA_ALPHABET_INVALID_CODE -1 +#endif + + typedef struct Alphabet maca_alphabet; + + /** + * Creates a new maca_alphabet. + * They allow you to store multiple symbols and get an integer representation + * for each of them. + * + * @param name the name of the alphabet. 
+ * @param loaded true if alphabet is loaded from a file, false otherwise. + * + * @return a pointer to a new maca_alphabet. + * + * @see maca_alphabet_delete() + */ + maca_alphabet* maca_alphabet_new(const char *name, char loaded); + + /** + * Frees a maca_alphabet. + * + * @param a the alphabet we are freeing. + * + * @see maca_alphabet_new() + */ + void maca_alphabet_delete(maca_alphabet *a); + + /** + * Releases a maca_alphabet. + * + * @param a the alphabet we are releasing. + */ + void maca_alphabet_release(maca_alphabet *a); + + /** + * Gets the name of an alphabet. + * + * @param a the alphabet to get the name from. + * @param name the buffer to store the name into. + * @param size the size of the buffer. + */ + void maca_alphabet_get_name(maca_alphabet *a, char *name, int size); + + /** + * Adds a symbol to the alphabet. + * You can only add symbols if the alphabet is unlocked. + * + * @param a the alphabet in which we're adding a symbol. + * @param symbol the symbol to add. + * + * @return the integer representation of the symbol, -1 if error. + * + * @see maca_alphabet_lock() + * @see maca_alphabet_unlock() + */ + int maca_alphabet_add_symbol(maca_alphabet *a, const char *symbol); + + /** + * Gets the integer representation of a symbol. + * + * @param a the alphabet in which we are looking for a symbol. + * @param symbol the symbol we are looking for. + * + * @return the integer representation of the symbol, -1 if not found. + */ + int maca_alphabet_get_code(maca_alphabet *a, const char *symbol); + + /** + * Get a symbol from its integer representation. + * + * @param a the alphabet in which the symbol is stored. + * @param code the integer representation of the symbol. + * @param name a buffer to store the symbol into. + * @param size the size of the buffer. + * + * @return 1 if everything went fine, 0 otherwise (invalid code, etc.) 
+ */ + char maca_alphabet_get_symbol(maca_alphabet *a, int code, char *name, int size); + + /** + * Gets the size of the alphabet. + * + * @param a the alphabet we are getting the size from. + * + * @return the size (number of symbols) of the alphabet. + */ + int maca_alphabet_size(maca_alphabet *a); + + /** + * Locks the alphabet. + * This means that you won't be able to add any new symbols into the alphabet. + * + * @param a the alphabet to lock. + */ + void maca_alphabet_lock(maca_alphabet *a); + + /** + * Unlocks the alphabet. + * + * @param a the alphabet to unlock. + */ + void maca_alphabet_unlock(maca_alphabet *a); + + /** + * Returns if the alphabet has been loaded from a file or not. + * + * @param a the alphabet we are checking. + * + * @return 1 if the alphabet has been loaded from a file, 0 otherwise. + */ + char maca_alphabet_is_loaded(maca_alphabet *a); + + + typedef struct AlphabetArray maca_alphabet_array; + + + /** + * Creates a new maca_alphabet_array. + * They allow you to store multiple alphabets. + * It can load alphabets from files and also dump its content in a file. + * + * @return a pointer to a new maca_alphabet_array. + * + * @see maca_alphabet_array_new_from_file() + * @see maca_alphabet_array_delete() + */ + maca_alphabet_array *maca_alphabet_array_new(); + + /** + * Creates a new maca_alphabet_array, and loads alphabets from the specified + * file. + * + * @param filename the name of the file which contains alphabets. + * + * @return a pointer to a new maca_alphabet_array, or NULL if we couldn't open the file. + * + * @see maca_alphabet_array_new() + * @see maca_alphabet_array_delete() + */ + maca_alphabet_array *maca_alphabet_array_new_from_file(const char *filename); + + /** + * Frees the alphabet array. + * + * @param array the alphabet array we are freeing. + */ + void maca_alphabet_array_delete(maca_alphabet_array *array); + + /** + * Adds an alphabet to the array. 
+ * + * @param array the alphabet array in which we're adding an alphabet. + * @param a the alphabet we're adding. + * + * @return 1 if successful, 0 otherwise (name already in array). + */ + char maca_alphabet_array_add_alphabet(maca_alphabet_array *array, maca_alphabet *a); + + /** + * Removes an alphabet from the array. + * + * @param array the array we are removing an alphabet from. + * @param name the name of the alphabet we are removing. + * @param free_alphabet 1 if we also want to free the alphabet, 0 otherwise. + * + * @return a pointer to the removed alphabet or NULL if it was freed. + * + * @note if you kept a pointer to the alphabet somewhere else, even if you free it here, + * you'll need to release your other pointer too. + */ + maca_alphabet *maca_alphabet_array_remove_alphabet(maca_alphabet_array *array, const char *name, + char free_alphabet); + + /** + * Gets an alphabet from the array. + * + * @param array the array we are getting an alphabet from. + * @param name the name of the alphabet to look for. + * + * @return a pointer to an alphabet if it exists, NULL otherwise. + * + * @note you must release the pointer once you are finished with it! + * + * @see maca_alphabet_release() + */ + maca_alphabet *maca_alphabet_array_get_alphabet(maca_alphabet_array *array, const char *name); + + /** + * Gets the size of the array. + * + * @param array the array we are getting the size from. + * + * @return the size of the array. + */ + int maca_alphabet_array_size(maca_alphabet_array *array); + + /** + * Loads from a file alphabets and stores them in the array. + * + * @param array the array to store the alphabets in. + * @param filename the file to load the alphabets from. + * + * @return 1 if successful, 0 if we couldn't open the file. + */ + char maca_alphabet_array_load(maca_alphabet_array *array, const char *filename); + + /** + * Dumps into a file all the alphabets in the array. + * + * @param array the array to dump from. 
+ * @param filename the file to dump into. + * + * @return 1 if successful, 0 if we couldn't open the file. + */ + char maca_alphabet_array_dump(maca_alphabet_array *array, const char *filename); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/maca_graph_parser/maca_common.c b/maca_graph_parser/maca_common.c new file mode 100644 index 0000000000000000000000000000000000000000..1b2c8007202a4176b170a85c612cd7e3b0531e5c --- /dev/null +++ b/maca_graph_parser/maca_common.c @@ -0,0 +1,17 @@ +#include <stdlib.h> + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_msg.h" + +char * maca_common_get_macaon_path() +{ + char * path; + + path = getenv(MACAON_PATH); + if(path == NULL) path = MACAON_DIRECTORY; + maca_print_verbose("maca_common",5,MACA_MESSAGE,"maca_common_get_macaon_path","get macaon data path : %s",path); + return path; +} + +int maca_verbose; diff --git a/maca_graph_parser/maca_common.h b/maca_graph_parser/maca_common.h new file mode 100644 index 0000000000000000000000000000000000000000..2ba4add06907f112a8e854eba27692722d08d457 --- /dev/null +++ b/maca_graph_parser/maca_common.h @@ -0,0 +1,16 @@ +#ifndef __MACA_COMMON__ +#define __MACA_COMMON__ + + +/** maca_common control data structure */ +typedef struct { + char * cfg; /*!< Config name */ + +} maca_common_ctx; + +maca_common_ctx* maca_init(int argc, char** argv); +char * maca_common_get_macaon_path(); + +extern int maca_verbose; + +#endif diff --git a/maca_graph_parser/maca_constants.h b/maca_graph_parser/maca_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..6e15bb582292c75d0e80e87d3487ffd63e50515f --- /dev/null +++ b/maca_graph_parser/maca_constants.h @@ -0,0 +1,169 @@ +/*********************************************************************************** + Copyright (C) 2009-2012 by Jean-Francois Rey <jean-francois.rey@lif.univ-mrs.fr> + This file is part of macaon. 
+ + Macaon is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Macaon is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with macaon. If not, see <http://www.gnu.org/licenses/>. +************************************************************************************/ + +/** + * \file maca_constants.h + * \brief Globals Variables declarations + * \author Jean-François REY + * \version 2.0 + * \date 02 Aug 2012 + * + */ + +#ifndef __MACA_CONSTANTS__ +#define __MACA_CONSTANTS__ + +extern int maca_verbose; + +#define MACAON_PATH "MACAON_DIR" +#ifndef MACAON_DIRECTORY +#define MACAON_DIRECTORY "$HOME/macaon" +#endif + +#define MACA_TAGS_FILE_NAME "maca_tags" +#define MACA_LEXIQUE_FILE_NAME "maca_lexique" + +#define MACA_MORPHO_FILE_NAME "maca_morpho" +#define MACA_MORPHO_TRAITS_FILE_NAME "maca_morpho_traits" + +#define MACA_TAGGER_EMISSION_PROB_FILE_NAME "maca_tagger_emission" +#define MACA_TAGGER_POS_NGRAM_FILE_NAME "maca_tagger_pos_ngram" + +#define MACA_GUESSER_PROB_FILE_NAME "maca_tagger_guesser_prob" + +#define MACA_CHUNKER_GRAMMAR_FILE_NAME "maca_chunker_grammar" + +#define MACA_CRF_TAGGER_MODEL_FILE_NAME "crf_tagger_model" +#define MACA_CRF_TAGGER_LEXICON_FILE_NAME "crf_tagger_wordtag_lexicon" + +#define MACA_GRAPH_PARSER_MODEL_FILE_NAME "maca_graph_parser_model1" +#define MACA_GRAPH_PARSER_DEP_COUNT_FILE_NAME "maca_graph_parser_model1" +#define MACA_GRAPH_PARSER_ALPHA_FILE_NAME "maca_graph_parser_model1" + +#define MACA_GRAPH_PARSER_FIRST_MODEL_FILE_NAME "maca_graph_parser_model1" +#define 
MACA_GRAPH_PARSER_FIRST_DEP_COUNT_FILE_NAME "maca_graph_parser_model1" +#define MACA_GRAPH_PARSER_FIRST_ALPHA_FILE_NAME "maca_graph_parser_model1" + +#define MACA_GRAPH_PARSER_SECOND_MODEL_FILE_NAME "maca_graph_parser_model2" +#define MACA_GRAPH_PARSER_SECOND_DEP_COUNT_FILE_NAME "maca_graph_parser_model2" +#define MACA_GRAPH_PARSER_SECOND_ALPHA_FILE_NAME "maca_graph_parser_model2" + +#define MACA_DEFAULT_CFG "fr" + +#define MACA_ENCODING "UTF-8" /**< Default file and characters encoding */ + +#define MACA_ERROR 1 +#define MACA_WARNING 2 +#define MACA_MESSAGE 3 + +#define DEFAULT_BUFFER_SIZE 4096 + +#define MACA_MAX_LENGTH_LINE 300 +#define MACA_MAX_LENGTH_SENTENCE 400 +#define MACA_MAX_LENGTH_WORD 100 +#define MACA_MAX_LENGTH_POSTAG 50 + +/*#define MACA_START_SENTENCE <s>*/ +#define MACA_END_SENTENCE "<EOS>" +#define MACA_TAILLE_TAB_DICO 50000 +#define MACA_TYPE_PROBA float +#define MACA_LOGPROB_UNK -99 +#define MACA_MINUS_INF (double) -100000 +/*#define MACA_MINUS_INF - DBL_MAX*/ +#define MACA_NULL_BACKP -2 + +#define MACA_INVALID_LEX_ID -1 +#define MACA_INVALID_POS_TAG -1 +#define MACA_INVALID_CHUNK_TAG -1 +#define MACA_INVALID_LOGPROB 50 +#define MACA_INVALID_CHUNK_TAG -1 + +#define MAX_LONG_LIGNE 100 +#define MAX_LONG_PHRASE 2000 +#define MAX_LONG_MOT 100 +#define MAX_LONG_CAT 50 +/*#define DEBUT_PHRASE <s>*/ +#define FIN_PHRASE "<EOS>" +#define TAILLE_TAB_DICO 50000 +#define TYPE_PROBA float +#define INVALID_CAT -1 +#define MAX_LINE 1000 +#define INVALID_LOGPROB 50 +#define LOGPROB_UNK -99 +#define MINUS_INF (double) -100000 +/*#define MINUS_INF - DBL_MAX*/ +#define NULL_BACKP -2 +#define INVALID_LEFFF_CODE -1 + +#define MACA_NAMESPACE "maca" +#define MACA_HREF_NAMESPACE "http://macaon.lif.univ-mrs.fr/macaon/namespace/v1.0" +#define MACA_DTD_HREF "http://macaon.lif.univ-mrs.fr/uploads/macaon/documents/macaon_gen.dtd" +#define MACA_DTD_NAMESPACE_HREF "http://macaon.lif.univ-mrs.fr/uploads/macaon/documents/macaon_gen_namespace.dtd" + +#define MACA_TOKENS_SECTION 
"prelex" /**< Prelex section type */ +#define MACA_WORDS_SECTION "lex" /**< Lex section type */ +#define MACA_POSS_SECTION "morpho" /**< Morpho section type */ +#define MACA_CHUNKS_SECTION "synt" /**< Synt section type */ + +#define MACA_PRELEX_SECTION "prelex" /**< Prelex section type */ +#define MACA_LEX_SECTION "lex" /**< Lex section type */ +#define MACA_MORPHO_SECTION "morpho" /**< Morpho section type */ +#define MACA_SYNT_SECTION "synt" /**< Synt section type */ + +#define MACA_TOKENS_TYPE "atome" /**< Atome segment type */ +#define MACA_WORDS_TYPE "ulex" /**< Ulex segment type */ +#define MACA_POSS_TYPE "cat" /**< Cat segment type */ +#define MACA_CHUNKS_TYPE "chunk" /**< Chunk segment type */ + +#define MACA_ATOME_TYPE "atome" /**< Atome segment type */ +#define MACA_ULEX_TYPE "ulex" /**< Ulex segment type */ +#define MACA_CAT_TYPE "cat" /**< Cat segment type */ +#define MACA_CHUNK_TYPE "chunk" /**< Chunk segment type */ + +#define MACAON_TAG "macaon" /**< Macaon tag */ +#define MACA_SENTENCE_TAG "sentence" /**< Sentence tag */ +#define MACA_SECTION_TAG "section" /**< Section tag */ +#define MACA_SEGS_TAG "segs" /**< Segments tag */ +#define MACA_SEG_TAG "seg" /**< Segment tag */ +#define MACA_SEQ_TAG "seq" /**< Sequence tag */ +#define MACA_ELT_TAG "elt" /**< Element tag */ +#define MACA_FSM_TAG "fsm" /**< Fsm tag */ + + +/* conventional names of the mcf colums*/ + +#define MACA_MCF_ID "ID" +#define MACA_MCF_FORM "FORM" +#define MACA_MCF_LEMMA "LEMMA" +#define MACA_MCF_CPOSTAG "CPOSTAG" +#define MACA_MCF_POSTAG "POSTAG" +#define MACA_MCF_FEATS "FEATS" +#define MACA_MCF_HEAD "HEAD" +#define MACA_MCF_DEPREL "DEPREL" +#define MACA_MCF_PHEAD "PHEAD" +#define MACA_MCF_PDEPREL "PDEPREL" +#define MACA_MCF_SUBCAT "SUBCAT" + +#define MACA_ALPHABET_WORDS "WORDS" +#define MACA_ALPHABET_POS "POS" +#define MACA_ALPHABET_LABELS "LABELS" +#define MACA_ALPHABET_MORPHO "MORPHO" +#define MACA_ALPHABET_SYNT_FEATS "SYNT_FEATS" + +#endif + diff --git 
a/maca_graph_parser/maca_graph_parser.c b/maca_graph_parser/maca_graph_parser.c new file mode 100644 index 0000000000000000000000000000000000000000..48699723a61b5a827537ff1dd2433c4973a90b4e --- /dev/null +++ b/maca_graph_parser/maca_graph_parser.c @@ -0,0 +1,515 @@ +/*************************************************************************** + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
+**************************************************************************/
+
+
+#include <string.h>
+#include <getopt.h>
+
+#include "maca_common.h"
+#include "maca_graph_parser.h"
+#include "maca_graph_parser_model.h"
+#include "maca_graph_parser_sentence.h"
+#include "maca_graph_parser_dep_count_table.h"
+#include "maca_graph_parser_decoder.h"
+#include "maca_graph_parser_features.h"
+#include "maca_graph_parser_feature_table.h"
+
+/*
+char * maca_graph_parser_get_model_filename(char * cfg, int order);
+char * maca_graph_parser_get_alphabet_filename(char * cfg, int order);
+char * maca_graph_parser_get_dep_count_filename(char * cfg, int order);
+*/
+
+/* Returns s, or the text "(null)" when s is NULL, so that the result can
+   always be handed to a "%s" conversion (passing NULL to "%s" is undefined
+   behavior in standard C). */
+static const char * maca_graph_parser_printable(const char * s)
+{
+  return (s != NULL) ? s : "(null)";
+}
+
+/**
+ * Prints the content of an execution context on stderr, one
+ * "name = value" line per field.
+ *
+ * String fields that are still NULL (several file names are NULL until the
+ * matching command line option is given) are printed as "(null)".
+ * The "mode" line is only printed for DECODE_MODE, TRAIN_MODE and EVAL_MODE.
+ *
+ * @param ctx the context to print, must not be NULL.
+ */
+void maca_graph_parser_print_ctx(maca_graph_parser_ctx *ctx)
+{
+  fprintf(stderr, "module = %s\n", maca_graph_parser_printable(ctx->module));
+  fprintf(stderr, "cfg = %s\n", maca_graph_parser_printable(ctx->cfg));
+  fprintf(stderr, "verbose_flag = %d\n", ctx->verbose_flag);
+  fprintf(stderr, "mcf file name = %s\n", maca_graph_parser_printable(ctx->mcf_file_name));
+  fprintf(stderr, "dep count file name = %s\n", maca_graph_parser_printable(ctx->dep_count_table_file_name));
+  fprintf(stderr, "model file name = %s\n", maca_graph_parser_printable(ctx->model_file_name));
+  fprintf(stderr, "alphabet file name = %s\n", maca_graph_parser_printable(ctx->alphabet_file_name));
+  fprintf(stderr, "sent nb = %d\n", ctx->sent_nb);
+  fprintf(stderr, "I = %d\n", ctx->I);
+  fprintf(stderr, "H = %d\n", ctx->H);
+  fprintf(stderr, "order = %d\n", ctx->order);
+  fprintf(stderr, "synt labels nb = %d\n", ctx->labels_nb);
+  fprintf(stderr, "pos nb = %d\n", ctx->pos_nb);
+  fprintf(stderr, "min dep count = %d\n", ctx->min_dep_count);
+  fprintf(stderr, "maximum sentence length = %d\n", ctx->max_sent_length);
+  fprintf(stderr, "algorithm = %d\n", ctx->algorithm);
+  fprintf(stderr, "use lemmas = %d\n", ctx->use_lemmas);
+  fprintf(stderr, "use full forms = %d\n", ctx->use_full_forms);
+  fprintf(stderr, "basic features = %d\n", ctx->basic_features);
+  fprintf(stderr, "first order features = %d\n", ctx->first_features);
+  fprintf(stderr, "grandchildren_features = %d\n", ctx->grandchildren_features);
+  fprintf(stderr, "sibling features = %d\n", ctx->sibling_features);
+  fprintf(stderr, "subcat features = %d\n", ctx->subcat_features);
+  fprintf(stderr, "produce hash model = %d\n", ctx->produce_hash_model);
+  fprintf(stderr, "hash fill rate = %f\n", ctx->hash_fill_rate);
+  fprintf(stderr, "k-best = %d\n", ctx->k);
+  if(ctx->mode == DECODE_MODE)
+    fprintf(stderr, "mode = decode\n");
+  else if(ctx->mode == TRAIN_MODE)
+    fprintf(stderr, "mode = train\n");
+  else if(ctx->mode == EVAL_MODE)
+    fprintf(stderr, "mode = eval\n");
+  fprintf(stderr, "out file name = %s\n", maca_graph_parser_printable(ctx->file_name_out));
+}
+
+maca_graph_parser_ctx * maca_graph_parser_InitCTX()
+{
+  maca_graph_parser_ctx * ctx = calloc(sizeof(maca_graph_parser_ctx), 1);
+
+  ctx->cfg=MACA_DEFAULT_CFG;
+  ctx->verbose_flag = maca_verbose;
+  ctx->module = MACA_GRAPH_PARSER_NAME;
+  ctx->model_file_name = NULL;
+  ctx->model2_file_name = NULL;
+  ctx->mcf_file_name = NULL;
+  ctx->e = NULL;
+  ctx->sent_nb = 1000000;
+  ctx->alphabet_file_name = NULL;
+  ctx->I = 10;
+  ctx->H = 50000000;
+  ctx->order = 1;
+  ctx->dep_count_table = NULL;
+  ctx->dep_count_table_file_name = NULL;
+  ctx->feature_table = NULL;
+  ctx->min_dep_count = 0;
+  ctx->max_sent_length = MACA_MAX_LENGTH_SENTENCE;
+  ctx->algorithm = PERCEPTRON_TRAINING;
+  ctx->use_lemmas = 0;
+  ctx->use_full_forms = 0;
+  ctx->basic_features = 1;
+  ctx->first_features = 1;
+  ctx->grandchildren_features = 0;
+  ctx->sibling_features = 0;
+  ctx->subcat_features = 0;
+/* ctx->hash_model = 1; */
+  ctx->produce_hash_model = 1;
+  ctx->print_ctx = 0;
+  ctx->hash_fill_rate = 0.25;
+  ctx->k = 0;
+  ctx->file_name_out = NULL;
+  ctx->file_out = stdout;
+  ctx->mode = DECODE_MODE;
+  ctx->s = NULL;
+  ctx->store_in_feature_table = 1 ;
+
+  ctx->words_alphabet = NULL;
+  ctx->pos_alphabet = NULL;
+  ctx->labels_alphabet = NULL;
+  ctx->morpho_alphabet = NULL;
+  ctx->synt_feats_alphabet = NULL;
+
+  ctx->words_nb = 0;
+  ctx->pos_nb = 0;
+  ctx->labels_nb = 0;
+  ctx->morpho_nb = 0;
+  ctx->synt_feats_nb = 0;
+
+
ctx->mcf_index_id = -1;
+  ctx->mcf_form_id = -1;
+  ctx->mcf_lemma_id = -1;
+  ctx->mcf_cpostag_id = -1;
+  ctx->mcf_postag_id = -1;
+  ctx->mcf_feats_id = -1;
+  ctx->mcf_deprel_id = -1;
+  ctx->mcf_head_id = -1;
+  ctx->mcf_pdeprel_id = -1;
+  ctx->mcf_phead_id = -1;
+  ctx->mcf_subcat_id = -1;
+
+  return ctx;
+}
+
+/**
+ * Builds an execution context from the command line.
+ *
+ * The context is first filled with the default values of
+ * maca_graph_parser_InitCTX(), its cfg field is reset to NULL, then the
+ * options found in argv are applied. -h and -V print their message and
+ * terminate the process with exit(0). Options that are not recognised are
+ * silently ignored (opterr is set to 0 and the switch has no default case).
+ *
+ * When the alphabet, dep count table or model file name was not given
+ * explicitly (-a, -d, -m), a default name is derived from the configuration
+ * given with -C. When neither is available, an error message is printed and
+ * the process terminates with exit(1). The same happens in train mode when
+ * no corpus was given with -c, when -p names an unknown algorithm, when the
+ * -z output file cannot be opened, or when -l exceeds
+ * MACA_MAX_LENGTH_SENTENCE.
+ *
+ * @param argc number of arguments.
+ * @param argv arguments (options).
+ *
+ * @return a newly allocated context; this function never returns NULL.
+ */
+maca_graph_parser_ctx * maca_graph_parser_LoadCTX(int argc, char ** argv) {
+  /**
+   * Load the execution context.
+   */
+
+  /* init with default values */
+  maca_graph_parser_ctx * ctx = maca_graph_parser_InitCTX();
+  /* get cfg from maca_common_ctx */
+  ctx->cfg = NULL;
+
+  /* parse command line options */
+  static struct option long_options[] =
+    {
+      /* These options set a flag. */
+      /*{"verbose", no_argument, &verbose_flag, 1},
+	{"brief", no_argument, &verbose_flag, 0},*/
+      /* These options don't set a flag.
+	 We distinguish them by their indices. */
+      {"verbose", required_argument, 0, 'v'},
+      {"model", required_argument, 0, 'm'},
+      {"conll", required_argument, 0, 'c'},
+      {"help", no_argument, 0, 'h'},
+      {"version", no_argument, 0, 'V'},
+      /* 'n' reads optarg below, so the long form must take an argument too;
+	 with no_argument optarg is NULL and atoi(optarg) crashes */
+      {"sentence-number", required_argument, 0, 'n'},
+      {0, 0, 0, 0}
+    };
+  int option_index = 0;
+
+  /* restart the option scan from the beginning and keep getopt quiet */
+  optind = 0;
+  opterr = 0;
+
+  int c;
+  while ((c = getopt_long (argc, (char**)argv, "hVv:c:m:n:a:I:H:r:C:d:M:p:D:l:LWBFGSUtxk:z:q:",long_options, &option_index)) != -1){
+    switch (c)
+      {
+      case 'h':
+	maca_graph_parser_PrintHelpMessage(argv[0]);
+	exit(0);
+      case 'V':
+	fprintf(stderr, "%s version : %s\n", argv[0], MACA_GRAPH_PARSER_VERSION);
+	exit(0);
+      case 'B':
+	ctx->basic_features = 1;
+	break;
+      case 'U':
+	ctx->subcat_features = 1;
+	break;
+      case 'F':
+	ctx->first_features = 1;
+	break;
+      case 'G':
+	ctx->grandchildren_features = 1;
+	break;
+      case 'S':
+	/* sibling features imply a second order model */
+	ctx->sibling_features = 1;
+	ctx->order = 2;
+	break;
+      case 'L':
+	ctx->use_lemmas = 1;
+	break;
+      case 'W':
+	ctx->use_full_forms = 1;
+	break;
+      case 'p':
+	if(!strcmp(optarg, "perceptron")) ctx->algorithm = PERCEPTRON_TRAINING;
+	else if(!strcmp(optarg, "mira")) ctx->algorithm = MIRA_TRAINING;
+	else if(!strcmp(optarg, "adagrad")) ctx->algorithm = ADAGRAD_TRAINING;
+	else {
+	  fprintf(stderr, "ERROR: Unsupported training algorithm \"%s\".\n", optarg);
+	  exit(1);
+	}
+	break;
+      case 't':
+/*	ctx->hash_model = 0; */
+	ctx->produce_hash_model = 0;
+	break;
+      case 'v':
+	ctx->verbose_flag = atoi(optarg);
+	break;
+      case 'a':
+	ctx->alphabet_file_name = strdup(optarg);
+	break;
+      case 'm':
+	ctx->model_file_name = strdup(optarg);
+	break;
+      case 'q':
+	ctx->model2_file_name = strdup(optarg);
+	break;
+      case 'C':
+	ctx->cfg = strdup(optarg);
+	break;
+      case 'M':
+	/* only the first letter is looked at; anything else keeps the current mode */
+	if(optarg[0] == 't') ctx->mode = TRAIN_MODE;
+	else if(optarg[0] == 'd') ctx->mode = DECODE_MODE;
+	break;
+      case 'c':
+	ctx->mcf_file_name = strdup(optarg);
+	break;
+      case 'z':
+	ctx->file_name_out = strdup(optarg);
+	ctx->file_out = fopen(ctx->file_name_out, "w");
+	if(ctx->file_out == NULL){
+	  maca_msg(ctx->module, MACA_ERROR);
+	  fprintf(stderr, "cannot open out file %s aborting\n", ctx->file_name_out);
+	  exit(1);
+	}
+	break;
+      case 'd':
+	ctx->dep_count_table_file_name = strdup(optarg);
+	break;
+      case 'l':
+	ctx->max_sent_length = atoi(optarg) + 1; /* add one for the fake root */
+	if(ctx->max_sent_length > MACA_MAX_LENGTH_SENTENCE){
+	  fprintf(stderr, "cannot set max sentence length to %d", ctx->max_sent_length);
+	  fprintf(stderr, ", it is higher than the maximal hard-coded value of %d", MACA_MAX_LENGTH_SENTENCE);
+	  fprintf(stderr, ", aborting\n");
+	  exit(1);
+	}
+	break;
+      case 'D':
+	ctx->min_dep_count = atoi(optarg);
+	break;
+      case 'n':
+	ctx->sent_nb = atoi(optarg);
+	break;
+      case 'r':
+	ctx->hash_fill_rate = atof(optarg);
+	break;
+      case 'I':
+	ctx->I = atoi(optarg);
+	break;
+      case 'H':
+	ctx->H = atoi(optarg);
+	break;
+      case 'x':
+	ctx->print_ctx = 1;
+	break;
+      case 'k':
+	ctx->k = atoi(optarg);
+	break;
+/*      default : optind++; break; */
+	/*      default : break; */
+      }
+  }
+
+  /* check for presence of mandatory options */
+
+  /* alphabet */
+  /* file name from command line or cfg; a default name can only be derived
+     when a configuration was given with -C, otherwise fall through to the
+     error message below instead of handing a NULL cfg to the name builder */
+  if(ctx->alphabet_file_name == NULL && ctx->cfg != NULL){
+    ctx->alphabet_file_name = maca_graph_parser_get_alphabet_filename(ctx->cfg, ctx->order);
+  }
+  if(ctx->alphabet_file_name == NULL){
+    maca_msg(ctx->module, MACA_ERROR);
+    fprintf(stderr, "you must specify an alphabet file with option -a\n");
+    exit(1);
+  }
+
+  /* dep_count_table */
+  /* file name from command line or cfg */
+  if(ctx->dep_count_table_file_name == NULL && ctx->cfg != NULL){
+    ctx->dep_count_table_file_name = maca_graph_parser_get_dep_count_filename(ctx->cfg, ctx->order);
+  }
+  if(ctx->dep_count_table_file_name == NULL){
+    maca_msg(ctx->module, MACA_ERROR);
+    fprintf(stderr, "you must specify an dep count table file with option -d\n");
+    exit(1);
+  }
+
+  /* model */
+  /* file name from command line or cfg */
+  if(ctx->model_file_name == NULL && ctx->cfg != NULL){
+    ctx->model_file_name = maca_graph_parser_get_model_filename(ctx->cfg, ctx->order);
+  }
+  if(ctx->model_file_name == NULL){
+    maca_msg(ctx->module, MACA_ERROR);
+    fprintf(stderr, "you must specify a name for the model file with option -m\n");
+    exit(1);
+  }
+
+
+  /* training corpus required */
+  if((ctx->mode == TRAIN_MODE) && (ctx->mcf_file_name == NULL)){
+    maca_msg(ctx->module, MACA_ERROR);
+    fprintf(stderr, "you must specify a name for the training corpus with option -c\n");
+    exit(1);
+  }
+
+
+
+  return ctx;
+}
+
+
+void maca_graph_parser_init(maca_graph_parser_ctx * ctx)
+{
+  int i;
+  /* lexicon */
+  /* used only in maca_graph_parser_sentence, at the moment */
+  /* filename in cfg */
+
+  /* tagset */
+  /* filename in cfg */
+
+  /* if(!maca_tags_loaded()){
+    maca_tags_load_bin_cfg(ctx->cfg);
+    }*/
+
+
+  /* ctx->labels_nb = maca_tags_count(ctx->cfg, "morpho", "fct"); */
+  /* if(ctx->labels_nb > NB_LABELS){
+    fprintf(stderr, "the given tagset contains %d syntactic labels, which exceeds the hard-coded limitation of %d, aborting\n", ctx->labels_nb, NB_LABELS);
+  } */
+  /* ctx->pos_nb = maca_tags_count(ctx->cfg, "morpho", "stype"); */
+
+
+  /* initialize or load
alphabets */
+
+
+  if(ctx->pos_start == -1){
+    fprintf(stderr, "error: POS-tag __START__ not found in the alphabet file.\n");
+    exit(1);
+  }
+  if(ctx->pos_end == -1){
+    fprintf(stderr, "error: POS-tag __END__ not found in the alphabet file.\n");
+    exit(1);
+  }
+  if(ctx->fct_joker == -1){
+    fprintf(stderr, "error: function __JOKER__ not found in the alphabet file.\n");
+    exit(1);
+  }
+
+
+}
+
+
+/**
+ * Releases the resources referenced by a context: dep count table,
+ * alphabets, models, feature table, current sentence, template library,
+ * the file name strings and the output file.
+ *
+ * The context structure itself and its cfg string are not freed.
+ * The output stream is only closed when it is a file of its own; when it is
+ * stdout (the default set by maca_graph_parser_InitCTX()) or stderr it is
+ * flushed and left open, so that the rest of the process can keep writing
+ * to it.
+ *
+ * @param ctx the context whose resources are released, must not be NULL.
+ */
+void maca_graph_parser_free_all(maca_graph_parser_ctx *ctx)
+{
+  maca_graph_parser_dep_count_table_free(ctx->pos_nb, ctx->labels_nb, ctx->dep_count_table);
+  if(ctx->words_alphabet) maca_alphabet_release(ctx->words_alphabet);
+  if(ctx->labels_alphabet) maca_alphabet_release(ctx->labels_alphabet);
+  if(ctx->pos_alphabet) maca_alphabet_release(ctx->pos_alphabet);
+  if(ctx->morpho_alphabet) maca_alphabet_release(ctx->morpho_alphabet);
+  /* NOTE(review): synt_feats_alphabet is not released here, unlike the four
+     alphabets above - confirm whether it is released elsewhere */
+
+
+  maca_graph_parser_model_free(ctx->model);
+  maca_graph_parser_model_free(ctx->model2);
+  if(ctx->feature_table) maca_graph_parser_feature_table_free(ctx);
+  if(ctx->model_file_name) free(ctx->model_file_name);
+  if(ctx->model2_file_name) free(ctx->model2_file_name);
+  if(ctx->alphabet_file_name) free(ctx->alphabet_file_name);
+  if(ctx->dep_count_table_file_name) free(ctx->dep_count_table_file_name);
+  if(ctx->s) maca_graph_parser_free_sentence(ctx->s);
+  if(ctx->mcf_file_name) free(ctx->mcf_file_name);
+  if(ctx->file_name_out) free(ctx->file_name_out);
+  if(ctx->file_out){
+    if(ctx->file_out == stdout || ctx->file_out == stderr){
+      /* standard streams belong to the process, not to the context */
+      fflush(ctx->file_out);
+    }
+    else{
+      fclose(ctx->file_out);
+      ctx->file_out = NULL; /* the stream must not be used after fclose */
+    }
+  }
+  /* cfg is left alone: it is either a string constant (InitCTX) or a
+     strdup'ed copy (LoadCTX, option -C), and the two cannot be told apart */
+  //if(ctx->cfg) free(ctx->cfg);
+  if(ctx->e) maca_graph_parser_templ_library_free(ctx->e);
+  // maca_lex_free();
+  /* maca_sentence_clean_section(ctx->ms,MACA_GRAPH_PARSER_SECTION); */
+}
+
+
+
+void maca_graph_parser_PrintHelpMessage()
+{
+  fprintf(stderr, "%s usage: %s [options]\n", MACA_GRAPH_PARSER_NAME, MACA_GRAPH_PARSER_NAME);
+  fprintf(stderr, "OPTIONS :\n");
+
+  fprintf(stderr, "general options\n");
+  fprintf(stderr, " -h : print this message\n");
+  fprintf(stderr, " -V : show version\n");
+  fprintf(stderr, " -M t|d : t for train mode\n");
+
fprintf(stderr, " : d for decode mode\n");
+  fprintf(stderr, " -v 1|2|3 : verbosity level\n");
+  fprintf(stderr, " -c file name : conll file (mandatory in training mode)\n");
+  fprintf(stderr, " -m file name : model file\n");
+  fprintf(stderr, " -a file name : alphabet file\n");
+  fprintf(stderr, " -d file name : dep count table file name\n");
+  fprintf(stderr, " -n int : number of sentences to process\n");
+  fprintf(stderr, " -C language : language\n");
+  fprintf(stderr, " -l int : maximum sentence length\n");
+  fprintf(stderr, " -x : print context\n");
+
+  fprintf(stderr, "\ndecoding options\n");
+  fprintf(stderr, " -k int : num of k-best (works for first order only)\n");
+  fprintf(stderr, " -z file name : output file\n");
+  fprintf(stderr, " -r float : hash fill rate\n");
+  fprintf(stderr, " -S : use second order model\n");
+
+  fprintf(stderr, "\ntraining options\n");
+  fprintf(stderr, " -p algorithm : training algorithm (perceptron, mira, adagrad)\n");
+  fprintf(stderr, " -L : use lemmas\n");
+  fprintf(stderr, " -W : use full forms\n");
+  fprintf(stderr, " -B : use basic features\n");
+  fprintf(stderr, " -F : use first order features\n");
+  fprintf(stderr, " -G : use grandchildren features\n");
+  fprintf(stderr, " -S : use sibling features\n");
+  fprintf(stderr, " -U : use subcat features\n");
+  fprintf(stderr, " -I int : number of iterations for training\n");
+  fprintf(stderr, " -D int : minimum dependency count\n");
+  fprintf(stderr, " -H int : size of hash table\n");
+  fprintf(stderr, " -t : store model as an array (more compact but slower to load)\n");
+}
+
+/** Returns the version string of the module (a string constant, not to be freed). */
+char * maca_graph_parser_GetVersion()
+{
+  return MACA_GRAPH_PARSER_VERSION;
+}
+
+/* Builds "<macaon path>/<cfg>/bin/<base><ext>" in a freshly malloc'ed buffer
+   that the caller must free. Returns NULL when cfg is NULL or when the
+   allocation fails. */
+static char * maca_graph_parser_build_bin_filename(const char * cfg, const char * base, const char * ext)
+{
+  char * path;
+  char * filename;
+  size_t size;
+
+  if(cfg == NULL) return NULL;
+  /* the macaon path does not depend on cfg: the function takes no argument */
+  path = maca_common_get_macaon_path();
+  /* path + '/' + cfg + "/bin/" + base + ext + terminating '\0' */
+  size = strlen(path) + 1 + strlen(cfg) + 5 + strlen(base) + strlen(ext) + 1;
+  filename = (char*)malloc(size);
+  if(filename == NULL) return NULL;
+  snprintf(filename, size, "%s/%s/bin/%s%s", path, cfg, base, ext);
+  return filename;
+}
+
+/**
+ * Builds the default name of the model file of a configuration:
+ * "<macaon path>/<cfg>/bin/<model name>.bin".
+ *
+ * @param cfg   the configuration (language) name.
+ * @param order 2 selects the second order model name, any other value the
+ *              first order one.
+ *
+ * @return a malloc'ed string owned by the caller, or NULL when cfg is NULL
+ *         or memory could not be allocated.
+ */
+char * maca_graph_parser_get_model_filename(char * cfg, int order)
+{
+  const char * base = (order == 2) ? MACA_GRAPH_PARSER_SECOND_MODEL_FILE_NAME
+                                   : MACA_GRAPH_PARSER_FIRST_MODEL_FILE_NAME;
+  return maca_graph_parser_build_bin_filename(cfg, base, ".bin");
+}
+
+/**
+ * Builds the default name of the alphabet file of a configuration:
+ * "<macaon path>/<cfg>/bin/<alphabet name>.alpha".
+ *
+ * @param cfg   the configuration (language) name.
+ * @param order 2 selects the second order name, any other value the first
+ *              order one.
+ *
+ * @return a malloc'ed string owned by the caller, or NULL when cfg is NULL
+ *         or memory could not be allocated.
+ */
+char * maca_graph_parser_get_alphabet_filename(char * cfg, int order)
+{
+  const char * base = (order == 2) ? MACA_GRAPH_PARSER_SECOND_ALPHA_FILE_NAME
+                                   : MACA_GRAPH_PARSER_FIRST_ALPHA_FILE_NAME;
+  return maca_graph_parser_build_bin_filename(cfg, base, ".alpha");
+}
+
+/**
+ * Builds the default name of the dep count table file of a configuration:
+ * "<macaon path>/<cfg>/bin/<dep count name>.dep_count".
+ *
+ * @param cfg   the configuration (language) name.
+ * @param order 2 selects the second order name, any other value the first
+ *              order one.
+ *
+ * @return a malloc'ed string owned by the caller, or NULL when cfg is NULL
+ *         or memory could not be allocated.
+ */
+char * maca_graph_parser_get_dep_count_filename(char * cfg, int order)
+{
+  /* the DEP_COUNT names are used throughout, so the buffer size and the
+     formatted name can no longer disagree */
+  const char * base = (order == 2) ? MACA_GRAPH_PARSER_SECOND_DEP_COUNT_FILE_NAME
+                                   : MACA_GRAPH_PARSER_FIRST_DEP_COUNT_FILE_NAME;
+  return maca_graph_parser_build_bin_filename(cfg, base, ".dep_count");
+}
+
+/**
+ * Prints a printf-style message through maca_print_vverbose() when level
+ * does not exceed the verbosity level stored in the context.
+ *
+ * @param mtctx   the context providing the module name and verbose_flag.
+ * @param level   verbosity level of the message.
+ * @param type    message type, forwarded unchanged (the file uses
+ *                MACA_ERROR, MACA_WARNING and MACA_MESSAGE for such values).
+ * @param message format string, followed by its arguments.
+ */
+void maca_graph_parser_print_verbose(maca_graph_parser_ctx * mtctx, int level, int type, char * message,...)
+{
+  //extern int verbose;
+  va_list args;
+
+  if(level <= mtctx->verbose_flag)
+  {
+    va_start(args,message);
+    maca_print_vverbose(mtctx->module,level,type,NULL,message,&args);
+    va_end(args);
+  }
+
+}
diff --git a/maca_graph_parser/maca_graph_parser.h b/maca_graph_parser/maca_graph_parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..eae0d3bb01324fd87d9f5fe056ccfe947631691b
--- /dev/null
+++ b/maca_graph_parser/maca_graph_parser.h
@@ -0,0 +1,514 @@
+/*******************************************************************************
+ Copyright (C) 2011 by XX <XX@lif.univ-mrs.fr>
+ This file is part of maca_module.
+
+ maca_MODULE is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ maca_MODULE is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with maca_MODULE. If not, see <http://www.gnu.org/licenses/>.
+*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER__ +#define __MACA_GRAPH_PARSER__ + + +#include <stdint.h> +#include <stdio.h> + +#include "maca_alphabet_wrapper.h" +#include "maca_common.h" + +#include "maca_graph_parser_hypergraph.h" /* dirty hack: we need def of Vertex for dynamically allocated kbest */ + +#include <math.h> +#ifndef INFINITY +#define INFINITY (float) (1.0f / 0.0f) +#endif + +/*!< MODULE version */ +#ifndef MACA_GRAPH_PARSER_VERSION +#define MACA_GRAPH_PARSER_VERSION "1.0.0" +#endif + +/*!< MODULE NAME */ +#define MACA_GRAPH_PARSER_NAME "maca_graph_parser" + + +#define feature_t uint64_t +#define _feat_family_nb 200 + + + + +typedef struct +{ + int taille; + int nbelem; + feature_t *table_clef; + float *params; + float *total; +} maca_graph_parser_hash; + + + +/*typedef struct +{ + int taille; + int nbelem; + int labels_nb; + feature_t *table_clef; + float ***params; + float ***total; +} maca_graph_parser_hash; +*/ +/* forward declaration of decoder structures */ +struct open; +struct closed; +/* +typedef struct +{ + feature_t feature; + float weight; +} feature_weight; +*/ +typedef struct +{ + feature_t *features; + float *weights; + uint32_t size; /* was: size_t */ +} maca_graph_parser_feature_weight_table; + + +typedef struct { + /* header */ + int is_hash_model; /* type of model */ + int min_dep_count; + int use_lemmas; /* features */ + int use_full_forms; + int basic_features; + int first_features; + int grandchildren_features; + int sibling_features; + int subcat_features; + /* content stored in a hash table or plain table */ + maca_graph_parser_hash *feat_ht; + maca_graph_parser_feature_weight_table *feat_array; + /* mmap model */ + int model_fd; + const char *mmap_data; + uint64_t mmap_length; + /* file name */ + char *file_name; +} maca_graph_parser_model; + + +typedef struct { + int field_nb; + int start[10]; + int end[10]; + char type[10]; + int value[10]; + int 
length[10]; + int label_start; + int label_end; + int direction_start; + int direction_end; + int hash_key_start; + int hash_key_end; +} templ; + +typedef int *****maca_graph_parser_dep_count_table; + +typedef struct { + int s_rel; /*number of different syntactic relations */ + int s_pos; /*number of different parts of speech */ + int s_word; /*number of different fledged forms */ + int s_synt_feat; /*number of different syntactic features */ + int s_type; /* */ + int s_dir; /* number of different dependency direction (left and right)*/ + int s_dist; /* number of different dependency lengths */ + int s_feat; /* */ + int s_child; /* */ + int type_start; /*first bit used to encode type in the binary representation of a feature */ + int type_end; /*first bit used to encode type in the binary representation of a feature */ + + templ *tfdp; + templ *tfdw; + templ *tfdwp; + templ *tfdpp; + templ *tfdww; + templ *tfdwpp; + templ *tfdwwp; + templ *tfdppp; + templ *tfdpppp; + templ *tfdwpwp; + templ *tdppp; + templ *tfddpp; + templ *tfddppp; + templ *tfddww; + templ *tfddwp; + templ *tfdsppp; + templ *tfdspp; + templ *tfdsww; + templ *tfdswp; + templ *tflpp; + templ *tflppp; + templ *tflpwp; + + templ* type2templ[_feat_family_nb]; + +}maca_graph_parser_templ_library; + +/* kbest */ +typedef struct { + int k; + int **gov; /* gov[i][ki] */ + int **label; /* label[i][ki] */ + float *score; /* score[ki] */ +} maca_graph_parser_sentence_kbest; + +/* should be in hyperdecoder.h */ +typedef struct { + int num; /* current size */ + int capacity; /* max size */ + DerivBP **elts; /* array of elements */ +} vec_Dbp; /* vector of derivations with backpointers */ + +/* end kbest */ + +typedef struct { + int l; + /* arrays */ + int *words; + int *lemmas; + int *pos; + int **morpho; + int **synt_feats_array; + int *synt_feats_nb; + int *gov; + int *label; + /* */ + float score; + void** word_adr; + maca_graph_parser_sentence_kbest *kb; +}maca_graph_parser_sentence; + +typedef struct{ + 
int typesLen; + int len; + int order; + /* first order features*/ + float ***pl; /*[start][end][dir]*/ + float ****lab; /*[start][end][dir][label]*/ + /* second order features */ + float *****sib; /*[start][end][sib][dir][label] */ + float *****gra; /*[start][end][gra][dir][label] */ +}maca_graph_parser_feature_table; + + +/* Structure qui contient les informations a conserver, ainsi que des données de controle du module */ +typedef struct{ + char *module; /*!< module name */ + char * cfg; /*!< config/language selected */ + int verbose_flag; /*!< verbose flag */ + maca_graph_parser_sentence *s; + FILE* conll_file; + /* input data */ + char *mcf_file_name; + int sent_nb; + + /* alphabets */ + + char *alphabet_file_name; + + maca_alphabet *words_alphabet; + maca_alphabet *pos_alphabet; + maca_alphabet *labels_alphabet; + maca_alphabet *morpho_alphabet; + maca_alphabet *synt_feats_alphabet; + + /* alphabet sizes */ + int labels_nb; + int pos_nb; + int words_nb; + int morpho_nb; + int synt_feats_nb; + + /* special values in alphabets */ + int w_start; + int w_end; + int pos_start; + int pos_end; + int fct_joker; + + /* dep count */ + int min_dep_count; + char *dep_count_table_file_name; + maca_graph_parser_dep_count_table dep_count_table; + + /* model */ + char *model_file_name; + maca_graph_parser_model *model; + int produce_hash_model; + int H; + float hash_fill_rate; + + /* test: model2 */ + char *model2_file_name; + maca_graph_parser_model *model2; + + /* feature extraction */ + maca_graph_parser_templ_library *e; /*!<pointer to the description of the feature families */ + int use_lemmas; + int use_full_forms; + int basic_features; + int first_features; + int grandchildren_features; + int sibling_features; + int subcat_features; + /* mode */ + int mode; + /* decoder */ + int order; + int k; + /* decoder2 */ + struct closed *****CLOSED2; /*[MACA_MAX_LENGTH_SENTENCE][MACA_MAX_LENGTH_SENTENCE][2][MACA_MAX_LENGTH_SENTENCE]*/; + struct open *****OPEN2; /* 
[MACA_MAX_LENGTH_SENTENCE][MACA_MAX_LENGTH_SENTENCE][2][NB_LABELS] */ + /* decoder1 */ + struct closed ****CLOSED; /*[MACA_MAX_LENGTH_SENTENCE][MACA_MAX_LENGTH_SENTENCE][2]*/; + struct open *****OPEN; /* [MACA_MAX_LENGTH_SENTENCE][MACA_MAX_LENGTH_SENTENCE][2][NB_LABELS] */ + + /* kbest */ + Vertex ****CLOSEDK; + Vertex *****OPENK; + vec_Dbp ****CDERIV; + vec_Dbp *****ODERIV; + + maca_graph_parser_feature_table *feature_table; + /* training */ + int I; + int algorithm; /* training algorithm */ + int max_sent_length; + /* info */ + int print_ctx; + /* output */ + char *file_name_out; + FILE *file_out; + + /* store in feature_table the scores of the first and second order factors */ + int store_in_feature_table; + + /* mcf columns id */ + int mcf_index_id; + int mcf_form_id; + int mcf_lemma_id; + int mcf_cpostag_id; + int mcf_postag_id; + int mcf_feats_id; + int mcf_deprel_id; + int mcf_head_id; + int mcf_pdeprel_id; + int mcf_phead_id; + int mcf_subcat_id; + +} maca_graph_parser_ctx; + +typedef enum {PERCEPTRON_TRAINING, MIRA_TRAINING, ADAGRAD_TRAINING} TrainingAlgorithm; + + +#ifdef __cplusplus +extern "C"{ +#endif + + +char * maca_graph_parser_get_model_filename(char * cfg, int order); +char * maca_graph_parser_get_alphabet_filename(char * cfg, int order); +char * maca_graph_parser_get_dep_count_filename(char * cfg, int order); +void maca_graph_parser_print_ctx(maca_graph_parser_ctx *ctx); + + +/** Initialize the default information structure + * \return a pointer on the structure maca_graph_parser_ctx + */ +maca_graph_parser_ctx * maca_graph_parser_InitCTX(); + +/** Initialize the information structure with the arguments + * \param argc : number of arguments + * \param argv : arguments (options) + * \param mcctx : pointer on maca_common_ctx + * \return a pointer on the structure maca_graph_parser_ctx + */ +maca_graph_parser_ctx * maca_graph_parser_LoadCTX(int argc, char ** argv); + +/** free inner datastorage + * \param ctx : a maca_graph_parser data info 
structure pointer + * + * Remove a section and data in the maca_sentence hashtable. + */ +void maca_graph_parser_free_all(maca_graph_parser_ctx * ctx); + +/** Print the graph_parser options + */ +void maca_graph_parser_PrintHelpMessage(); + +/** Get the graph_parser version + */ +char * maca_tagger_GetVersion(); + +void maca_graph_parser_print_verbose(maca_graph_parser_ctx * mtctx, int level, int type, char * message,...); + + +void maca_graph_parser_init(maca_graph_parser_ctx * ctx); + +#ifdef __cplusplus +} +#endif + + + +/* CONSTANTS */ +#define MACA_GRAPH_PARSER_ALPHABET_INVALID_CODE -1 +/* #define NB_LABELS 70 */ /* maximum number of syntactic functions */ +#define MINF -INFINITY +#define BasicFeatNb 500 // MACA_MAX_LENGTH_SENTENCE +#define FirstOrderFeatNb 500 // 33 +#define GrandchildrenFeatNb 100 //32 +#define SiblingFeatNb 100 //32 + +/* parser modes */ +#define DECODE_MODE 1 +#define TRAIN_MODE 2 +#define EVAL_MODE 3 + +/* directions for dependencies */ +#define ra 1 +#define la 0 + +/* features */ +#define _f1 1 +#define _f2 2 +#define _f4 3 +#define _f5 4 +#define _f6 5 +#define _f3 6 +#define _f13 7 +#define _f7 8 +#define _f10 9 +#define _f8 10 +#define _f9 11 +#define _f11 12 +#define _f12 13 +#define _f1l 14 +#define _f2l 15 +#define _f4l 16 +#define _f5l 17 +#define _f6l 18 +#define _f3l 19 +#define _f13l 20 +#define _f7l 21 +#define _f8l 22 +#define _f10l 23 +#define _f9l 24 +#define _f11l 25 +#define _f12l 26 +#define _f14 27 +#define _f15 28 +#define _f16 29 +#define _f17 30 +#define _f18 31 +#define _f19 32 +#define _f20 33 +#define _f39 34 + + +#define _f21 35 +#define _f22 36 +#define _f23 37 +#define _f24 38 +#define _f25 39 +#define _f26 40 +#define _f27 41 +#define _f28 42 +#define _f29 43 +#define _f24l 44 +#define _f25l 45 +#define _f26l 46 +#define _f27l 47 +#define _f28l 48 +#define _f29l 49 +#define _f42 50 +#define _f43 51 +#define _f44 52 +#define _f45 53 +#define _f46 54 +#define _f47 55 +#define _f48 56 +#define _f49 57 +#define 
_f50 58 +#define _f51 59 +#define _f52 60 +#define _f53 61 +#define _f54 62 +#define _f55 63 +#define _f56 64 +#define _f57 65 +#define _f74 66 +#define _f30 67 +#define _f31 68 +#define _f32 69 +#define _f33 70 +#define _f34 71 +#define _f35 72 +#define _f36 73 +#define _f37 74 +#define _f38 75 +#define _f33l 76 +#define _f34l 77 +#define _f35l 78 +#define _f36l 79 +#define _f37l 80 +#define _f38l 81 +#define _f58 82 +#define _f59 83 +#define _f60 84 +#define _f61 85 +#define _f62 86 +#define _f63 87 +#define _f64 88 +#define _f65 89 +#define _f66 90 +#define _f67 91 +#define _f68 92 +#define _f69 93 +#define _f70 94 +#define _f71 95 +#define _f72 96 +#define _f73 97 +#define _f75 98 +#define _f76 99 + +/* subcat features */ + +#define _f77 100 +#define _f78 101 +#define _f79 102 + + +/* (dependency) distance bins */ +#define d1 1 +#define d2 2 +#define d3 3 +#define d4 4 +#define d5 5 +#define d10 6 +#define di0 7 + +#endif diff --git a/maca_graph_parser/maca_graph_parser_alphabet.c b/maca_graph_parser/maca_graph_parser_alphabet.c new file mode 100644 index 0000000000000000000000000000000000000000..c96f164c833f855e48a25a1846ed0689b58342c1 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_alphabet.c @@ -0,0 +1,259 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#include<stdlib.h> +#include<stdio.h> +#include<time.h> +#include <string.h> +#include"maca_constants.h" +#include"maca_msg.h" +#include"maca_graph_parser.h" +#include"maca_graph_parser_alphabet.h" + +void maca_graph_parser_alphabet_free(maca_graph_parser_alphabet *a) +{ + array_free(a->array); + hash_free(a->htable); + free(a->name); + free(a); +} + +maca_graph_parser_alphabet *maca_graph_parser_alphabet_new(char *name) +{ + maca_graph_parser_alphabet *a = malloc(sizeof(maca_graph_parser_alphabet)); + if(a == NULL){ + fprintf(stderr, "memory allocation error\n"); + exit(1); + } + + a->name = strdup(name); + + a->nb = 0; + a->htable = hash_new(16); + a->array = array_new(); + return a; +} + + +int maca_graph_parser_alphabet_add_symbol(maca_graph_parser_alphabet *a, char *symbol) +{ + int code; + char *symbol_copy; + cell* found; + + /* fprintf(stderr, " maca_graph_parser_alphabet_add_symbol(%s)\n", symbol); */ + + found = hash_lookup(a->htable, symbol); + if(found == NULL){ + code = a->nb; + symbol_copy = strdup(symbol); + hash_add(a->htable, symbol_copy, a->nb); + array_push(a->array, symbol_copy); + a->nb++; + } else { + code = found->val; + } + return code; +} + +int maca_graph_parser_alphabet_get_code(maca_graph_parser_alphabet *a, char *symbol) +{ + cell *code; + + code = hash_lookup(a->htable, symbol); + if(code == NULL) + return MACA_GRAPH_PARSER_ALPHABET_INVALID_CODE; + return code->val; +} + + +char * maca_graph_parser_alphabet_get_symbol(maca_graph_parser_alphabet *a, int code) +{ + return array_get(a->array, code); +} + + +void maca_graph_parser_alphabet_print4(char *filename, maca_graph_parser_alphabet *a1, maca_graph_parser_alphabet *a2, maca_graph_parser_alphabet *a3, maca_graph_parser_alphabet *a4) +{ + FILE *f; + int i; + 
char *symbol; + + if(filename == NULL) + f = stdout; + else{ + f = fopen(filename, "w"); + if(f == NULL){ + fprintf(stderr, "cannot open file %s\n", filename); + exit(1); + } + } + + if(a1) maca_graph_parser_alphabet_print(f, a1); + if(a2) maca_graph_parser_alphabet_print(f, a2); + if(a3) maca_graph_parser_alphabet_print(f, a3); + if(a4) maca_graph_parser_alphabet_print(f, a4); + + if(filename) + fclose(f); +} +void maca_graph_parser_alphabet_print5(char *filename, maca_graph_parser_alphabet *a1, maca_graph_parser_alphabet *a2, maca_graph_parser_alphabet *a3, maca_graph_parser_alphabet *a4, maca_graph_parser_alphabet *a5) +{ + FILE *f; + int i; + char *symbol; + + if(filename == NULL) + f = stdout; + else{ + f = fopen(filename, "w"); + if(f == NULL){ + fprintf(stderr, "cannot open file %s\n", filename); + exit(1); + } + } + + if(a1) maca_graph_parser_alphabet_print(f, a1); + if(a2) maca_graph_parser_alphabet_print(f, a2); + if(a3) maca_graph_parser_alphabet_print(f, a3); + if(a4) maca_graph_parser_alphabet_print(f, a4); + if(a5) maca_graph_parser_alphabet_print(f, a5); + + if(filename) + fclose(f); +} + +void maca_graph_parser_alphabet_print(FILE *f, maca_graph_parser_alphabet *a) +{ + int i; + char *symbol; + + fprintf(f, "##%s\n", a->name); + for(i=0; i < a->nb; i++){ + symbol = array_get(a->array,i); + fprintf(f,"%s\n", symbol); + } +} + +maca_graph_parser_alphabet **maca_graph_parser_alphabet_load4(char *filename) +{ + FILE *f; + int i = 0; + char symbol[1000]; + maca_graph_parser_alphabet *a = NULL; + maca_graph_parser_alphabet **alpha_array = malloc(4 * sizeof(maca_graph_parser_alphabet*)); + + for(i=0; i < 4; i++) + alpha_array[i] = NULL; + + f = fopen(filename, "rb"); + if(f == NULL){ + fprintf(stderr, "cannot open file %s\n", filename); + exit(1); + } + i = 0; + while(fscanf(f, "%s", symbol) != EOF){ + if((symbol[0] == '#') && (symbol[1] == '#')) + alpha_array[i++] = a = maca_graph_parser_alphabet_new(symbol+2); + else{ + if(a) + 
maca_graph_parser_alphabet_add_symbol(a, symbol); + } + } + fclose(f); + return alpha_array; +} + +maca_graph_parser_alphabet **maca_graph_parser_alphabet_load5(char *filename) +{ + FILE *f; + int i = 0; + char symbol[1000]; + maca_graph_parser_alphabet *a = NULL; + maca_graph_parser_alphabet **alpha_array = malloc(5 * sizeof(maca_graph_parser_alphabet*)); + + for(i=0; i < 5; i++) + alpha_array[i] = NULL; + + f = fopen(filename, "rb"); + if(f == NULL){ + fprintf(stderr, "cannot open file %s\n", filename); + exit(1); + } + i = 0; + while(fscanf(f, "%s", symbol) != EOF){ + /* printf("%s\n", symbol);*/ + if((symbol[0] == '#') && (symbol[1] == '#')) + alpha_array[i++] = a = maca_graph_parser_alphabet_new(symbol+2); + else{ + if(a) + maca_graph_parser_alphabet_add_symbol(a, symbol); + } + } + fclose(f); + return alpha_array; +} + +/* + + else{ + maca_msg("maca_graph_parser_alphabet", MACA_ERROR); + fprintf(stderr, "error while loading alphabet file : %s\n", filename); + fprintf(stderr, "alphabet files must begin with ##ALPHABET_NAME\n"); + exit(0); + } +*/ + + +maca_graph_parser_alphabet *maca_graph_parser_alphabet_load(char *filename) +{ + FILE *f; + int i; + char symbol[1000]; + maca_graph_parser_alphabet *a = NULL; + + f = fopen(filename, "rb"); + if(f == NULL){ + fprintf(stderr, "cannot open file %s\n", filename); + exit(1); + } + + fscanf(f, "%s", symbol); + if((symbol[0] == '#') && (symbol[1] == '#')) + a = maca_graph_parser_alphabet_new(symbol+2); + else{ + maca_msg("maca_graph_parser_alphabet", MACA_ERROR); + fprintf(stderr, "error while loading alphabet file : %s\n", filename); + fprintf(stderr, "alphabet files must begin with ##ALPHABET_NAME\n"); + exit(0); + } + + while(fscanf(f, "%s", symbol) != EOF){ + /* printf("symbol = %s\n", symbol); */ + maca_graph_parser_alphabet_add_symbol(a, symbol); + } + + fclose(f); + return a; +} + +int maca_graph_parser_alphabet_size(maca_graph_parser_alphabet *a) +{ + return a->nb; +} diff --git 
a/maca_graph_parser/maca_graph_parser_alphabet.h b/maca_graph_parser/maca_graph_parser_alphabet.h new file mode 100644 index 0000000000000000000000000000000000000000..970570ed20209194348387581f574fac3bd80948 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_alphabet.h @@ -0,0 +1,55 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_ALPHABET__ +#define __MACA_GRAPH_PARSER_ALPHABET__ + +#include <stdio.h> +#include "hash.h" +#include "array.h" + +typedef struct { + char *name; + int nb; + hash_t *htable; + array_t *array; +} maca_graph_parser_alphabet; + +#ifdef __cplusplus +extern "C"{ +#endif + +void maca_graph_parser_alphabet_print(FILE *filename, maca_graph_parser_alphabet *a); +void maca_graph_parser_alphabet_free(maca_graph_parser_alphabet *a); +maca_graph_parser_alphabet *maca_graph_parser_alphabet_new(char *name); +int maca_graph_parser_alphabet_add_symbol(maca_graph_parser_alphabet *a, char *symbol); +int maca_graph_parser_alphabet_get_code(maca_graph_parser_alphabet *a, char *symbol); +char * maca_graph_parser_alphabet_get_symbol(maca_graph_parser_alphabet *a, int code); +/* void maca_graph_parser_alphabet_print(char *filename, maca_graph_parser_alphabet *a); */ +void maca_graph_parser_alphabet_print4(char *filename, maca_graph_parser_alphabet *a1, maca_graph_parser_alphabet *a2, maca_graph_parser_alphabet *a3, maca_graph_parser_alphabet *a4); +void maca_graph_parser_alphabet_print5(char *filename, maca_graph_parser_alphabet *a1, maca_graph_parser_alphabet *a2, maca_graph_parser_alphabet *a3, maca_graph_parser_alphabet *a4, maca_graph_parser_alphabet *a5); +maca_graph_parser_alphabet *maca_graph_parser_alphabet_load(char *filename); +int maca_graph_parser_alphabet_size(maca_graph_parser_alphabet *a); +maca_graph_parser_alphabet **maca_graph_parser_alphabet_load4(char *filename); +maca_graph_parser_alphabet **maca_graph_parser_alphabet_load5(char *filename); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/maca_graph_parser/maca_graph_parser_conll2007_format.c b/maca_graph_parser/maca_graph_parser_conll2007_format.c new file mode 100644 index 0000000000000000000000000000000000000000..72395483bd91ac94dd789352a4eb3679a8302401 --- /dev/null +++ 
b/maca_graph_parser/maca_graph_parser_conll2007_format.c @@ -0,0 +1,385 @@ +#include "maca_graph_parser_conll2007_format.h" + +#include <errno.h> +#include <string.h> + +maca_graph_parser_sentence *maca_graph_parser_read_conll_sentence(maca_graph_parser_ctx *ctx, FILE *f, maca_graph_parser_sentence *s){ + /** + * Read into s a sentence from f in the CONLL 2007 format. + * + * Returns the next valid sentence in f. + */ + + int id; + char id_text[MAX_STR]; + char form[MAX_STR]; + char lemma[MAX_STR]; + char cpostag[MAX_STR]; + char postag[MAX_STR]; + char feats[MAX_STR]; + unsigned gov; + char gov_text[MAX_STR]; + char deprel[MAX_STR]; + char phead_str[MAX_STR]; /* phead_str is used to store syntactic features */ + char *synt_feats; + char *synt_feat; + char pdeprel[MAX_STR]; + char buff[MAX_LINE]; + unsigned nb_fields; + int code_postag, code_lemma, code_form, code_label, code_synt_feat; + int sentence_valid = 1; + + int synt_feats_array[100]; + int synt_feats_nb = 0; + + /* early termination */ + if(s == NULL){ + fprintf(stderr, "read_conll_sentence: s == NULL\n"); + return s; + } + + /* reset sentence given as parameter */ + s->l = 1; + + if(feof(f)) return s; + + while(fgets(buff, MAX_LINE, f) != NULL){ + /* sentence separator: empty line */ + if(buff[0] == '\n'){ + if(s->l > 1){ + if(sentence_valid){ + break; + } else { /* invalid sentence: start reading next sentence instead */ + s->l = 1; + sentence_valid = 1; + continue; + } + } else { /* skip extra empty lines */ + continue; + } + } + + /* skip too long sentence */ + if(s->l >= MACA_MAX_LENGTH_SENTENCE){ + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "sentence too long, skipping it\n"); + } + /* mark as invalid and continue reading */ + sentence_valid = 0; + continue; + } + + id_text[0] = form[0] = lemma[0] = cpostag[0] = feats[0] = gov_text[0] = deprel[0] = phead_str[0] = pdeprel[0] = '\0'; + id = gov = -1; + /*1 Quatre Quatre D det - 2 det 0 0.000000 -1.000000 -1.000000 
-1.000000*/ + /* nb_fields = sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%d\t%s\t%lf%lf\t%lf", */ + nb_fields = sscanf(buff, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s", + id_text, form, lemma, cpostag, postag, feats, + gov_text, deprel, phead_str, pdeprel); + + id = strtol(id_text, NULL, 10); + if(errno == ERANGE) { + fprintf(stderr, "WARNING: id is an invalid number in \"%s\"\n", buff); + } + gov = strtol(gov_text, NULL, 10); + if(errno == ERANGE) { + fprintf(stderr, "WARNING: gov is an invalid number in \"%s\"\n", buff); + } + + if(nb_fields != 10) { + fprintf(stderr, "WARNING: incorrect number of fields in \"%s\"\n", buff); + } + + /* encode all fields as integers */ + code_form = maca_alphabet_add_symbol(ctx->words_alphabet, form); + + code_lemma = (strcmp(lemma, CONLL_UNK)) ? maca_alphabet_add_symbol(ctx->words_alphabet, lemma) : -1; + + /* code_postag = maca_tags_get_code(ctx->cfg, "morpho", "stype", postag); */ + + + /* get code of the postag. + If in train mode, new postags can be seen and must be added to the alphabet. + in decode mode, a postag not present in the alphabet must raise a warning and the sentence is skipped */ + if(ctx->mode == TRAIN_MODE){ + code_postag = maca_alphabet_add_symbol(ctx->pos_alphabet, postag); + } + else{ + code_postag = maca_alphabet_get_code(ctx->pos_alphabet, postag); + /* POS not in tagset: raise warning */ + if(code_postag == MACA_ALPHABET_INVALID_CODE){ + if(ctx->verbose_flag > 0){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr,"pos %s unknown\n", postag); + } + /* POS not in tagset: mark sentence as invalid */ + sentence_valid = 0; + } + } + + /* get code of the synt_feat. + If in train mode, new synt_feats can be seen and must be added to the alphabet. 
+ in decode mode, a synt_feat not present in the alphabet must raise a warning and the sentence is skipped */ + + synt_feats_nb = 0; + + if(strcmp(phead_str, "_")){ + synt_feats = strdup(phead_str); + /* printf("synt feats = %s\n", synt_feats); */ + for(synt_feat = strtok (synt_feats, "|"); synt_feat; synt_feat = strtok (NULL, "|")){ + /* printf("%d synt feat = %s\n", synt_feats_nb, synt_feat); */ + if(ctx->mode == TRAIN_MODE){ + code_synt_feat = maca_alphabet_add_symbol(ctx->synt_feats_alphabet, synt_feat); + synt_feats_array[synt_feats_nb++] = code_synt_feat; + + } + else{ + code_synt_feat = maca_alphabet_get_code(ctx->synt_feats_alphabet, synt_feat); + /* synt_feat not in synt_feat set: raise warning */ + if(code_synt_feat == MACA_ALPHABET_INVALID_CODE){ + if(ctx->verbose_flag > 0){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr,"synt_feat %s unknown\n", synt_feat); + } + /* synt_feat not in tagset: mark sentence as invalid */ + sentence_valid = 0; + } + else{ + synt_feats_array[synt_feats_nb++] = code_synt_feat; + } + } + } + free(synt_feats); + } + + + /* code_label = (strcmp(deprel, CONLL_UNK))? maca_tags_get_code(ctx->cfg, "morpho", "fct", deprel) : -2; */ + code_label = (strcmp(deprel, CONLL_UNK))? 
maca_alphabet_add_symbol(ctx->labels_alphabet, deprel) : -2; + + + /* label not in tagset: raise warning */ + if((code_label == -1) && (ctx->verbose_flag > 0)){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr,"function %s unknown\n", deprel); + } + /* label not in tagset or unknown: + if in training mode, mark sentence as invalid */ + if((code_label < 0) && (ctx->mode == TRAIN_MODE)){ + sentence_valid = 0; + } + + /* fprintf(stdout, "------------> id = %d\n", id); + fprintf(stdout, "------------> form = %s\t%d\n", form, code_form); + fprintf(stdout, "------------> postag = %s\t%d\n", postag, code_postag); + fprintf(stdout, "------------> lemma = %s\t%d\n", lemma, code_lemma); + fprintf(stdout, "------------> cpostag = %s\t%d\n", cpostag, code_postag); + fprintf(stdout, "------------> feats = %s\n", feats); + fprintf(stdout, "------------> gov = %d\n", gov); + fprintf(stdout, "------------> deprel = %s\t%d\n", deprel, code_label); + fprintf(stdout, "------------> pdeprel = %s\n", pdeprel); + */ + + + maca_graph_parser_sentence_add_word(ctx, s, NULL, code_form, code_lemma, code_postag, gov, code_label, synt_feats_nb, synt_feats_array); + } + + return s; +} + + +void maca_graph_parser_dump_conll_sentence_kbest(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, FILE *f){ + int i; /* word index */ + int j; /* j-th best parse */ + int k = s->kb->k; + int cw; /* return val for fprintf */ + char buffer[128]; + for(j=0; j < k; j++){ + for(i=1; i < s->l; i++){ + /* INDEX */ + cw = fprintf(f, "%d", i); + if(cw < 0){ + fprintf(stderr, "dump_conll_sentence: problem while writing to file\n"); + exit(1); + } + /* FORM */ + if (s->words[i] != -1) + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[i], buffer, sizeof(buffer)); + else + strcpy(buffer, "_"); + cw = fprintf(f, "\t%s", buffer); + /* LEMMA */ + if (s->lemmas[i] != -1) + maca_alphabet_get_symbol(ctx->words_alphabet, s->lemmas[i], buffer, sizeof(buffer)); + else + strcpy(buffer, "_"); + cw = 
fprintf(f, "\t%s", buffer); + /* CPOSTAG */ + if (s->pos[i] != -1) + maca_alphabet_get_symbol(ctx->pos_alphabet, s->pos[i], buffer, sizeof(buffer)); + else + strcpy(buffer, "_"); + cw = fprintf(f, "\t%s", buffer); + /* POSTAG */ + if (s->pos[i] != -1) + maca_alphabet_get_symbol(ctx->pos_alphabet, s->pos[i], buffer, sizeof(buffer)); + else + strcpy(buffer, "_"); + cw = fprintf(f, "\t%s", buffer); + /* TODO: w_morpho */ + cw = fprintf(f, "\t_"); + /* HEAD */ + cw = fprintf(f, "\t%d", s->kb->gov[i][j]); + /* DEPREL */ + /* if(s->kb->gov[i][j] == 0) + fprintf(f,"\troot"); + else*/ + if (s->kb->label[i][j] != -1) + maca_alphabet_get_symbol(ctx->pos_alphabet, s->kb->label[i][j], buffer, sizeof(buffer)); + else + strcpy(buffer, "_"); + cw = fprintf(f, "\t%s", buffer); + /* PHEAD */ + cw = fprintf(f, "\t_"); + /* PDEPREL */ + cw = fprintf(f, "\t_"); + cw = fprintf(f, "\n"); + } + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +void maca_graph_parser_dump_conll_sentence_kbest_for_ghasem(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, FILE *f){ +/* void maca_graph_parser_dump_conll_sentence_kbest(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, FILE *f){ */ + int i; /* word index */ + int j; /* j-th best parse */ + int k = s->kb->k; + int cw; /* return val for fprintf */ + char buffer[128]; + + fprintf(f, "#sentence\n"); + for(j=0; j < k; j++){ + fprintf(f, "#parse %d\n", j); + fprintf(f, "#score %f\n", s->kb->score[j]); + for(i=1; i < s->l; i++){ + /* INDEX */ + cw = fprintf(f, "%d", i); + if(cw < 0){ + fprintf(stderr, "dump_conll_sentence: problem while writing to file\n"); + exit(1); + } + + /* HEAD */ + cw = fprintf(f, " %d", s->kb->gov[i][j]); + /* DEPREL */ + if(s->kb->gov[i][j] == 0) + fprintf(f," root"); + else + if (s->kb->label[i][j] != -1) + maca_alphabet_get_symbol(ctx->pos_alphabet, s->kb->label[i][j], buffer, sizeof(buffer)); + else + strcpy(buffer, "_"); + cw = fprintf(f, "\t%s", buffer); + cw = fprintf(f, "\n"); + } + fprintf(f, "\n"); + 
} + fprintf(f, "\n"); +} + +void maca_graph_parser_dump_conll_sentence(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, FILE *f){ + /** + * Dump sentence s to file f, in the CONLL 2007 format. + * + */ + + int i = 0; // FIXME: was uninitialized before + int cw; /* return val for fprintf */ + + if(s == NULL) + return; + + if(s->kb) { + maca_graph_parser_dump_conll_sentence_kbest(ctx, s, f); + /*maca_graph_parser_dump_conll_sentence_kbest_for_ghasem(ctx, ctx->s, ctx->file_out);*/ + } else { + char word[128]; + char lemma[128]; + char pos[128]; + char label[128]; + + if (s->words[i] != -1) + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[i], word, sizeof(word)); + else + strcpy(word, "_"); + if (s->lemmas[i] != -1) + maca_alphabet_get_symbol(ctx->words_alphabet, s->lemmas[i], lemma, sizeof(lemma)); + else + strcpy(lemma, "_"); + if (s->pos[i] != -1) + maca_alphabet_get_symbol(ctx->pos_alphabet, s->pos[i], pos, sizeof(pos)); + else + strcpy(pos, "_"); + if (s->label[i] != -1) + maca_alphabet_get_symbol(ctx->labels_alphabet, s->label[i], label, sizeof(label)); + else + strcpy(label, "_"); + for(i=1; i < s->l; i++){ + /* + w_morpho = (memcmp(s->morpho[i], {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, sizeof(s->morpho[i]))) ? 
MORPHO_FIELD_STRING : "_"; + */ + + cw = fprintf(f, "%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\t\n", + i, + /* FORM */ + word, + /* LEMMA */ + lemma, + /* CPOSTAG */ + pos, + /* POSTAG */ + pos, + /* TODO: w_morpho */ + "_", + /* HEAD */ + s->gov[i], + /* DEPREL */ + label, + /* PHEAD */ + "_", + /* PDEPREL */ + "_" + ); + if(cw < 0){ + fprintf(stderr, "dump_conll_sentence: problem while writing to file\n"); + exit(1); + } + } + fprintf(f, "\n"); + } + +} + + +/*-------------------------------------------------------------------------------------------*/ +/* +void maca_graph_parser_load_conll_sentence(maca_graph_parser_ctx * ctx, sentence *conll_s, maca_graph_parser_sentence *maca_s) +{ + parse *p = conll_s->parses[0]; + int i; + int form, lemma, postag, label; + maca_s->l = 0; + + for(i=0; i<p->l; i++){ + postag = maca_tags_get_code(ctx->cfg, "morpho", "stype", p->words[i]->postag); + lemma = maca_alphabet_add_symbol(ctx->words_alphabet, p->words[i]->lemma); + form = maca_alphabet_add_symbol(ctx->words_alphabet, p->words[i]->form); + + + label = maca_tags_get_code(ctx->cfg, "morpho", "fct", p->words[i]->deprel); + maca_graph_parser_sentence_add_word(ctx, maca_s, NULL, form, lemma, postag, p->words[i]->head, label); + } +} +*/ + diff --git a/maca_graph_parser/maca_graph_parser_conll2007_format.h b/maca_graph_parser/maca_graph_parser_conll2007_format.h new file mode 100644 index 0000000000000000000000000000000000000000..eda38f8e2a4206fb770a68c048cd7f372de80fa4 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_conll2007_format.h @@ -0,0 +1,22 @@ +#include "maca_graph_parser_sentence.h" + + +#define CONLL_UNK "_" + +#define MAX_STR 100 +#define MAX_LINE 1000 + +#ifdef __cplusplus +extern "C"{ +#endif + + +maca_graph_parser_sentence *maca_graph_parser_read_conll_sentence(maca_graph_parser_ctx *ctx, FILE *f, maca_graph_parser_sentence *s); +void maca_graph_parser_dump_conll_sentence(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, FILE *f); + +#ifdef 
__cplusplus +} +#endif + + +/* void maca_graph_parser_load_conll_sentence(maca_graph_parser_ctx * ctx, sentence *conll_s, maca_graph_parser_sentence *maca_s); */ diff --git a/maca_graph_parser/maca_graph_parser_corpora.c b/maca_graph_parser/maca_graph_parser_corpora.c new file mode 100644 index 0000000000000000000000000000000000000000..ad90a37e3c75166018cf08351209749eec83a11c --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_corpora.c @@ -0,0 +1,398 @@ +#include "maca_graph_parser_corpora.h" +#include <stdio.h> +#include <string.h> + + +hyp_ref_vector *allocate_hyp_ref_vector(int capacity){ + /** + * Allocate a vector of hyp and ref. + */ + + int i; + + hyp_ref_vector *v = malloc(sizeof(hyp_ref_vector)); + if(v == NULL){ + fprintf(stderr, "allocate_hyp_ref_vector: memory alloc problem\n"); + exit(1); + } + + v->ref = malloc(capacity * sizeof(maca_graph_parser_sentence *)); + if(v->ref == NULL){ + fprintf(stderr, "allocate_hyp_ref_vector: memory alloc problem for ref\n"); + exit(1); + } + for(i = 0; i < capacity; i++){ + v->ref[i] = NULL; + } + + v->hyp = malloc(capacity * sizeof(maca_graph_parser_sentence *)); + if(v->hyp == NULL){ + fprintf(stderr, "allocate_hyp_ref_vector: memory alloc problem for hyp\n"); + exit(1); + } + for(i = 0; i < capacity; i++){ + v->hyp[i] = NULL; + } + + v->capacity = capacity; + v->size = 0; + + return v; +} + + +void free_hyp_ref_vector(hyp_ref_vector *v){ + /** + * Free a vector of hyp ref. 
+ */ + + int i; + + if(v == NULL) + return; + + if(v->ref != NULL){ + for(i=0; i< v->size; i++){ + free(v->ref[i]); /* free sentence */ + v->ref[i] = NULL; + } + free(v->ref); + v->ref = NULL; + } + + if(v->hyp != NULL){ + for(i=0; i< v->size; i++){ + free(v->hyp[i]); /* free sentence */ + v->hyp[i] = NULL; + } + free(v->hyp); + v->hyp = NULL; + } + + v->capacity = 0; + v->size = 0; + + free(v); +} + + +void hyp_ref_vector_append(hyp_ref_vector *v, maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp){ + /** + * Append a ref and hyp to vector. + */ + + if(v->size >= v->capacity){ + fprintf(stderr, "hyp_ref_vector full, please increase its capacity (current capacity: %d)\n", v->capacity); + exit(1); + } + v->ref[v->size] = ref; + v->hyp[v->size] = hyp; + v->size += 1; +} + + +/* hyp_ref_vector *load_conll_corpus(maca_graph_parser_ctx *ctx){ */ +/* /\** */ +/* * Load a CONLL 2007 formatted corpus. */ +/* * */ +/* * sent_nb stores the number of sentences read */ +/* *\/ */ + +/* int sent_nb; */ +/* hyp_ref_vector *v = allocate_hyp_ref_vector(ctx->sent_nb); */ +/* maca_graph_parser_sentence *ref_s = NULL; */ +/* maca_graph_parser_sentence *hyp_s = NULL; */ + +/* sent_nb = 0; */ +/* while(sent_nb < ctx->sent_nb){ */ +/* /\* read (reference) sentence *\/ */ +/* ref_s = maca_graph_parser_allocate_sentence(ctx); */ +/* maca_graph_parser_read_conll_sentence(ctx, ctx->conll_file, ref_s); */ + +/* if(ref_s->l == 1){ /\* should only happen at EOF *\/ */ +/* maca_graph_parser_free_sentence(ref_s); */ +/* if(ctx->verbose_flag > 4){ */ +/* fprintf(stderr, "maca_graph_parser_corpora: breaking on empty sentence\n"); */ +/* } */ +/* break; */ +/* } */ + +/* /\* ignore sentences whose length exceeds the given limit *\/ */ +/* if(ref_s->l > ctx->max_sent_length){ */ +/* if(ctx->verbose_flag > 4){ */ +/* fprintf(stderr, "maca_graph_parser_corpora: ignoring too long sentence\n"); */ +/* } */ +/* maca_graph_parser_free_sentence(ref_s); */ +/* continue; */ +/* } */ + +/* 
if(ctx->verbose_flag > 3){ */ +/* printf("------------------- %d -----------------------\n", sent_nb); */ +/* } */ +/* if(ctx->verbose_flag > 4){ */ +/* maca_graph_parser_sentence_print_sentence(ctx, ref_s); */ +/* } */ + +/* /\* copy data to hypothesis sentence *\/ */ +/* hyp_s = maca_graph_parser_duplicate_sentence(ctx, ref_s, NULL); */ +/* /\* store in vector *\/ */ +/* hyp_ref_vector_append(v, ref_s, hyp_s); */ + +/* /\* update dep_count_table *\/ */ +/* /\* maca_graph_parser_dep_count_table_update(ctx, ref_s); *\/ */ + +/* sent_nb++; */ +/* } */ + +/* if(ctx->verbose_flag > 1){ */ +/* maca_msg(ctx->module, MACA_MESSAGE); */ +/* fprintf(stderr, "%d sentences loaded\n", sent_nb); */ +/* } */ + +/* return v; */ +/* } */ + +hyp_ref_vector *load_mcf_corpus(maca_graph_parser_ctx *ctx){ + /** + * Load an MCF formatted corpus. + * + */ + + int sent_nb; + hyp_ref_vector *v = allocate_hyp_ref_vector(ctx->sent_nb); + maca_graph_parser_sentence *ref_s = NULL; + maca_graph_parser_sentence *hyp_s = NULL; + int col_id; + maca_mcf *format; + maca_mcf_column *column; + char buffer[128]; + maca_mcf_sentence *mcf_sent; + + format = maca_mcf_new(ctx->mcf_file_name); + + /* full form alphabet */ + if(ctx->use_full_forms){ + ctx->mcf_form_id = maca_mcf_input(format, MACA_MCF_FORM); + if(ctx->mcf_form_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column FORM not found in the train file\n"); + exit(1); + } + /* initialize word alphabet in ctx */ + column = maca_mcf_get_column_info(format, ctx->mcf_form_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_WORDS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column FORM is not using the "MACA_ALPHABET_WORDS" alphabet!\n"); + exit(1); + } + ctx->words_alphabet = maca_mcf_column_get_alphabet(column); + + /* add start and stop symbols for words */ + ctx->w_start = maca_alphabet_add_symbol(ctx->words_alphabet, "__START__"); + ctx->w_end = 
maca_alphabet_add_symbol(ctx->words_alphabet, "__END__"); + } + + /* lemmas alphabet */ + if(ctx->use_lemmas){ + ctx->mcf_lemma_id = maca_mcf_input(format, MACA_MCF_LEMMA); + if(ctx->mcf_lemma_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column LEMMA not found in the train file\n"); + exit(1); + } + /* initialize words alphabet in ctx */ + column = maca_mcf_get_column_info(format, ctx->mcf_lemma_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_WORDS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column LEMMA is not using the "MACA_ALPHABET_WORDS" alphabet!\n"); + exit(1); + } + if (!ctx->use_full_forms) { + ctx->words_alphabet = maca_mcf_column_get_alphabet(column); + /* add start and stop symbols for lemmas */ + ctx->w_start = maca_alphabet_add_symbol(ctx->words_alphabet, "__START__"); + ctx->w_end = maca_alphabet_add_symbol(ctx->words_alphabet, "__END__"); + } + } + + /* postag alphabet */ + ctx->mcf_postag_id = maca_mcf_input(format, MACA_MCF_POSTAG); + if(ctx->mcf_postag_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column POSTAG not found in the train file\n"); + exit(1); + } + /* initialize postag alphabet in ctx */ + column = maca_mcf_get_column_info(format, ctx->mcf_postag_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_POS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column POSTAG is not using the "MACA_ALPHABET_POS" alphabet!\n"); + exit(1); + } + ctx->pos_alphabet = maca_mcf_column_get_alphabet(column); + /* add start and stop symbols for part of speeches */ + ctx->pos_start = maca_alphabet_add_symbol(ctx->pos_alphabet, "__START__"); + ctx->pos_end = maca_alphabet_add_symbol(ctx->pos_alphabet, "__END__"); + + + /* deprel alphabet */ + ctx->mcf_deprel_id = maca_mcf_input(format, MACA_MCF_DEPREL); + if(ctx->mcf_deprel_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + 
fprintf(stderr, "column DEPREL not found in the train file\n"); + exit(1); + } + /* initialize postag alphabet in ctx */ + column = maca_mcf_get_column_info(format, ctx->mcf_deprel_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_LABELS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column DEPREL is not using the "MACA_ALPHABET_LABELS" alphabet!\n"); + exit(1); + } + ctx->labels_alphabet = maca_mcf_column_get_alphabet(column); + /* add joker in syntactic labels tagset */ + ctx->fct_joker = maca_alphabet_add_symbol(ctx->labels_alphabet, "__JOKER__"); + + + /* check that head (governor) column exists */ + ctx->mcf_head_id = maca_mcf_input(format, MACA_MCF_HEAD); + if(ctx->mcf_head_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column HEAD not found in the train file\n"); + exit(1); + } + maca_mcf_column_get_type(maca_mcf_get_column_info(format, ctx->mcf_head_id), buffer, sizeof(buffer)); + if(strcmp(buffer, "INT")){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column HEAD should be of type INT\n"); + exit(1); + } + + /* subcat alphabet */ + if(ctx->subcat_features){ + ctx->mcf_subcat_id = maca_mcf_input(format, MACA_MCF_SUBCAT); + if(ctx->mcf_subcat_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column SUBCAT not found in the train file\n"); + exit(1); + } + /* initialize word alphabet in ctx */ + column = maca_mcf_get_column_info(format, ctx->mcf_subcat_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_SYNT_FEATS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column SUBCAT is not using the "MACA_ALPHABET_SYNT_FEATS" alphabet!\n"); + exit(1); + } + ctx->synt_feats_alphabet = maca_mcf_column_get_alphabet(column); + } + + sent_nb = 0; + while(sent_nb < ctx->sent_nb){ + /* read (reference) sentence */ + ref_s = maca_graph_parser_allocate_sentence(ctx); + mcf_sent = 
maca_graph_parser_read_mcf_sentence(ctx, format, ref_s); + maca_mcf_sentence_release(mcf_sent); + + if(ref_s->l == 1){ /* should only happen at EOF */ + maca_graph_parser_free_sentence(ref_s); + if(ctx->verbose_flag > 4){ + fprintf(stderr, "maca_graph_parser_corpora: breaking on empty sentence\n"); + } + break; + } + + if(ctx->verbose_flag > 3){ + printf("------------------- %d -----------------------\n", sent_nb); + } + if(ctx->verbose_flag > 4){ + maca_graph_parser_sentence_print_sentence(ctx, ref_s); + } + + /* copy data to hypothesis sentence */ + hyp_s = maca_graph_parser_duplicate_sentence(ctx, ref_s, NULL); + /* store in vector */ + hyp_ref_vector_append(v, ref_s, hyp_s); + + /* update dep_count_table */ + /* maca_graph_parser_dep_count_table_update(ctx, ref_s); */ + + sent_nb++; + } + + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "%d sentences loaded\n", sent_nb); + } + if (!maca_mcf_dump_alphabets(format, ctx->alphabet_file_name)) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "couldn't open the alphabet file!\n"); + exit(1); + } + maca_mcf_delete(format); + return v; +} + + +void free_corpus(hyp_ref_vector *corpus){ + /** + * Free a corpus. + */ + + free_hyp_ref_vector(corpus); +} + + +/* void dump_conll_corpus(maca_graph_parser_ctx *ctx, hyp_ref_vector *corpus, FILE *f){ */ +/* /\** */ +/* * Dump a CONLL 2007 formatted corpus to file f. */ +/* * */ +/* * The syntactic information dumped is the hypothesis parse tree. */ +/* * */ +/* *\/ */ + +/* int sent_id = 0; */ +/* maca_graph_parser_sentence *s = NULL; */ + +/* if(corpus == NULL) */ +/* return; */ + +/* for(sent_id=0; sent_id < corpus->size; sent_id++){ */ +/* s = corpus->hyp[sent_id]; */ +/* maca_graph_parser_dump_conll_sentence(ctx, s, f); */ +/* } */ +/* } */ + + +void maca_graph_parser_sentence_relabel_rare_deps(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s){ + /** + * Preprocess s to relabel rare dependencies as "__JOKER__". 
+ * + * Updates ctx->dep_count_table accordingly. + */ + + int i; + int dir, pos_gov, pos_dep, label, length_class; + int joker_label = ctx->fct_joker; + + for(i=1; i < s->l; i++){ + dir = (i > s->gov[i])? ra : la; + pos_gov = s->pos[s->gov[i]]; + pos_dep = s->pos[i]; + label = s->label[i]; + length_class = maca_graph_parser_dep_count_table_compute_length_class(s->gov[i], i); + + if((pos_gov >= 0 && pos_dep >= 0 && label >= 0) && + (ctx->dep_count_table[pos_gov][pos_dep][label][length_class][dir] < ctx->min_dep_count)){ + /* relabel dep */ + s->label[i] = joker_label; + /* update dep_count_table */ + ctx->dep_count_table[pos_gov][pos_dep][s->label[i]][length_class][dir]++; + /* TODO: remove hapax from dep_count_table ? */ + } + } +} diff --git a/maca_graph_parser/maca_graph_parser_corpora.h b/maca_graph_parser/maca_graph_parser_corpora.h new file mode 100644 index 0000000000000000000000000000000000000000..c0e2a0b41542ddb47b2c8831749e37cd2dd3cd83 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_corpora.h @@ -0,0 +1,30 @@ +#include "maca_common.h" +#include "maca_graph_parser_sentence.h" +#include "maca_graph_parser_dep_count_table.h" + + +typedef struct { + maca_graph_parser_sentence **ref; + maca_graph_parser_sentence **hyp; + int size; + int capacity; +} hyp_ref_vector; + +#ifdef __cplusplus +extern "C"{ +#endif + + +hyp_ref_vector *allocate_hyp_ref_vector(int capacity); +void free_hyp_ref_vector(hyp_ref_vector *v); +void hyp_ref_vector_append(hyp_ref_vector *v, maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp); + + //hyp_ref_vector *load_conll_corpus(maca_graph_parser_ctx *ctx); +void free_corpus(hyp_ref_vector *corpus); + //void dump_conll_corpus(maca_graph_parser_ctx *ctx, hyp_ref_vector *corpus, FILE *f); + +void maca_graph_parser_sentence_relabel_rare_deps(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s); +hyp_ref_vector *load_mcf_corpus(maca_graph_parser_ctx *ctx); +#ifdef __cplusplus +} +#endif diff --git 
a/maca_graph_parser/maca_graph_parser_decode_main.c b/maca_graph_parser/maca_graph_parser_decode_main.c new file mode 100644 index 0000000000000000000000000000000000000000..b5b7bbe7344bd3cdfd5f4c7b14aed816d7a692b6 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_decode_main.c @@ -0,0 +1,78 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
+**************************************************************************/ + +#include<getopt.h> +#include<unistd.h> +#include<stdio.h> +#include<string.h> + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_graph_parser.h" +#include "maca_graph_parser_conll2007_format.h" +#include "maca_graph_parser_decoder.h" + + +int main(int argc, char **argv) +{ + char c; + maca_graph_parser_ctx * ctx; + maca_graph_parser_sentence *sentence; + int sent_num; + + ctx = maca_graph_parser_LoadCTX(argc,argv); + + + if(ctx->conll_file){ + sentence = maca_graph_parser_allocate_sentence(ctx); + for(sentence = maca_graph_parser_read_conll_sentence(ctx, ctx->conll_file, sentence), sent_num = 0; + sentence && (sent_num < ctx->sent_nb); + sentence = maca_graph_parser_read_conll_sentence(ctx, ctx->conll_file, sentence), sent_num++){ + /* exit at empty sentence, should happen iff EOF */ + if(sentence->l == 1) + break; + /* skip too long sentences */ + if(sentence->l >= ctx->max_sent_length){ + continue; + } + + maca_graph_parser_print_verbose(ctx, 2, MACA_MESSAGE, "parsing sentence"); + maca_graph_parser_decoder_parse(ctx, ctx->s); + /* maca_graph_parser_update_sentence(ctx, ctx->s); */ /* TODO: backport modifs to maca_sentence if IO format is maca_xml */ + + /* write parsed sentence to file_out (default to stdout) */ + maca_graph_parser_dump_conll_sentence(ctx, ctx->s, ctx->file_out); + + fprintf(stderr, "%d\n", sent_num); + } + maca_graph_parser_free_sentence(sentence); + } else { + printf("You must provide a Conll file\n"); + } + + + + /* maca_graph_parser_add_stamp(maca_common_get_xml_root_node()); */ + maca_graph_parser_free_all(ctx); + /* fermeture et libération memoire */ +/* maca_close(); */ + + return 0; +} + diff --git a/maca_graph_parser/maca_graph_parser_decoder.c b/maca_graph_parser/maca_graph_parser_decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..3095550072de85e70f4b0f1eaa2c0a003189e88b --- /dev/null +++ 
b/maca_graph_parser/maca_graph_parser_decoder.c @@ -0,0 +1,210 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ +#include "maca_graph_parser.h" +#include"maca_graph_parser_feature_table.h" +#include"maca_graph_parser_decoder.h" +#include"maca_graph_parser_hyperdecoder.h" + + + +void maca_graph_parser_decoder_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s) +{ + if(ctx->k){ + maca_graph_parser_hyperdecoder_cleanup(ctx, s); + } else { + if(ctx->order == 1){ + maca_graph_parser_decoder1_cleanup(ctx, s); + } else { + maca_graph_parser_decoder2_cleanup(ctx, s); + } + } +} + +/*-------------------------------------------------------------------------------------------*/ + +void maca_graph_parser_decoder_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s){ + /** + * Parse sentence s. 
+ */ + + /* extract features */ + /*if(ctx->store_in_feature_table){ + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_MESSAGE); + if(ctx->ms) + fprintf(stderr, "computing features for sentence %s\n", ctx->ms->id_sentence); + else + fprintf(stderr, "computing features for sentence\n"); + } + maca_graph_parser_feature_table_fill(ctx, s, ctx->feature_table); + + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "done\n"); + } + }*/ + + /* parse */ + /*if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_MESSAGE); + if(ctx->ms) + fprintf(stderr, "parsing sentence %s\n", ctx->ms->id_sentence); + else + fprintf(stderr, "parsing sentence\n"); + }*/ + + maca_graph_parser_decoder_init(ctx, s); + if(ctx->k){ + maca_graph_parser_hyperdecoder_parse(ctx, s, ctx->feature_table); + } else { + if(ctx->order == 1){ + maca_graph_parser_decoder1_parse(ctx, s, ctx->feature_table); + } else { + maca_graph_parser_decoder2_parse(ctx, s, ctx->feature_table); + } + } + + maca_graph_parser_decoder_cleanup(ctx, s); + + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "done\n"); + } + +} + +/*-------------------------------------------------------------------------------------------*/ + +void maca_graph_parser_decoder_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s) +{ + if(ctx->k){ + maca_graph_parser_hyperdecoder_init(ctx, s); + } else { + if(ctx->order == 1){ + maca_graph_parser_decoder1_init(ctx, s); + } else { + maca_graph_parser_decoder2_init(ctx, s); + } + } +} + +/*-------------------------------------------------------------------------------------------*/ + +Open *alloc_open(float score, + int start, + int end, + int label, + int dir, + Closed *left, + Closed *right) +{ + Open *o = malloc(sizeof(Open)); + if(o == NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + o->score = score; + o->start = start; + o->end = end; + o->label = label; + o->dir = dir; + o->left = 
left; + o->right = right; + return o; +} + +/*-------------------------------------------------------------------------------------------*/ + +Closed *alloc_closed( float score, + int start, + int end, + int breakpoint, + int dir, + Closed *d, + Open *u) +{ + Closed *c = malloc(sizeof(Closed)); + if(c == NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + c->score = score; + c->start = start; + c->end = end; + c->breakpoint = breakpoint; + c->dir = dir; + c->d = d; + c->u = u; + return c; +} + +/*-------------------------------------------------------------------------------------------*/ + +void create_closed(Closed *c, maca_graph_parser_sentence *s) +{ + /* + if(c->dir == la){ + printf("create_closed [%d;(%d);<%d>] from closed [%d;<%d>] and open [%d;<%d>]\n", + c->start, c->breakpoint, c->end, + (c->d)? c->d->start : -1, (c->d)? c->d->end : -1, + (c->u)? c->u->start : -1, (c->u)? c->u->end : -1); + } else { + printf("create_closed [<%d>;(%d);%d] from open [<%d>;%d] and closed [<%d>;%d]\n", + c->start, c->breakpoint, c->end, + (c->u)? c->u->start : -1, (c->u)? c->u->end : -1, + (c->d)? c->d->start : -1, (c->d)? c->d->end : -1); + } + */ + if(c->d) create_closed(c->d, s); + if(c->u) create_open(c->u, s); +} + +/*-------------------------------------------------------------------------------------------*/ + +void create_open(Open *o, maca_graph_parser_sentence *s) +{ + /* + if (o->dir == la){ + printf("create_open [%d;<%d>] from left closed [<%d>;%d] and right closed [%d;<%d>]\n", + o->start, o->end, + (o->left)? o->left->start : -1, (o->left)? o->left->end : -1, + (o->right)? o->right->start : -1, (o->right)? o->right->end : -1); + } else { + printf("create_open [<%d>;%d] from left closed [<%d;%d>] and right closed [<%d;%d>]\n", + o->start, o->end, + (o->left)? o->left->start : -1, (o->left)? o->left->end : -1, + (o->right)? o->right->start : -1, (o->right)? 
o->right->end : -1); + } + */ + + /* 1-best gov and label */ + if (o->dir == la) { + s->gov[o->start] = o->end; + s->label[o->start] = o->label; + } + else { + s->gov[o->end] = o->start; + s->label[o->end] = o->label; + } + if (o->left) create_closed(o->left, s); + if (o->right) create_closed(o->right, s); +} + +/*-------------------------------------------------------------------------------------------*/ + diff --git a/maca_graph_parser/maca_graph_parser_decoder.h b/maca_graph_parser/maca_graph_parser_decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..a7681869ac7f49b61cbde36d8e2253d9115ca43d --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_decoder.h @@ -0,0 +1,78 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
#ifndef __MACA_GRAPH_PARSER_DECODER__
#define __MACA_GRAPH_PARSER_DECODER__

#include"maca_graph_parser.h"

typedef struct open Open;
typedef struct closed Closed;

/* Open item of the decoding chart: a span [start, end] carrying one
   labelled dependency between its two end points, built from two
   Closed sub-items. */
struct open{
  float score;   /* score stored by alloc_open() */
  int start;     /* left end of the span */
  int end;       /* right end of the span */
  int label;     /* label code of the dependency */
  int dir;       /* la: governor is end, dependent is start; otherwise the reverse */
  Closed *left;  /* left closed sub-item (may be NULL) */
  Closed *right; /* right closed sub-item (may be NULL) */
};

/* Closed item of the decoding chart: a span [start, end] built from one
   Closed and one Open sub-item. */
struct closed{
  float score;    /* score stored by alloc_closed() */
  int start;      /* left end of the span */
  int end;        /* right end of the span */
  int breakpoint; /* position at which the span was split into d and u */
  int dir;        /* la or ra */
  Closed *d;      /* closed sub-item (may be NULL) */
  Open *u;        /* open sub-item (may be NULL) */
};

#define PRINT 0


#ifdef __cplusplus
extern "C"{
#endif


/* derivation walkers: write governors and labels of the derivation into s */
void create_open(Open *o, maca_graph_parser_sentence *s);
void create_closed(Closed *c, maca_graph_parser_sentence *s);
/* item constructors: exit the program on allocation failure */
Open *alloc_open(float score, int start, int end, int label, int dir, Closed *left, Closed *right);
Closed *alloc_closed( float score, int start, int end, int breakpoint, int dir, Closed *d, Open *u);

/* the un-numbered variants dispatch on ctx->k and ctx->order */
void maca_graph_parser_decoder1_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s);
void maca_graph_parser_decoder2_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s);
void maca_graph_parser_decoder_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s);

void maca_graph_parser_decoder1_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s);
void maca_graph_parser_decoder2_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s);
void maca_graph_parser_decoder_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s);

void maca_graph_parser_decoder1_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table);
void maca_graph_parser_decoder2_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table);
void maca_graph_parser_decoder_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s);
#ifdef __cplusplus
}
#endif



#endif
b/maca_graph_parser/maca_graph_parser_decoder1.c new file mode 100644 index 0000000000000000000000000000000000000000..e53505019b20640726ae4f815ef1d6c9f9e2d42a --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_decoder1.c @@ -0,0 +1,496 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
/**
 * Fill the ctx->OPEN and ctx->CLOSED charts for sentence s, processing
 * spans by increasing width.
 *
 * For every span [start, end]:
 *  - for each direction and each label that passes the min_dep_count
 *    filter (or is the __JOKER__ label), an OPEN item is created from the
 *    best pair CLOSED[start][m][ra] + CLOSED[m+1][end][la];
 *  - one CLOSED item per direction is created from the best
 *    OPEN + CLOSED combination.
 *
 * Arc scores are read from feat_table when ctx->store_in_feature_table is
 * set, and computed on the fly otherwise.  The charts must have been
 * initialised (all cells NULL) before the call; this function only fills
 * them and does not write anything into s.
 */
void maca_graph_parser_decoder1_decode(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table)
{
  int labels_nb = ctx->labels_nb;
  int n = s->l;
  int start;
  int end;
  int dir; /* left or right */
  int label; /* label */
  int m; /* breakpoint (middle) */
  int m_argmax = 0;
  int label_argmax = 0;
  int span;
  float w; /* score of the labelled arc */
  /* trial */
  int gov;
  int dep;
  /* common */
  int nb_cands;
  float score_max;
  /* fill OPEN: left and right closed */
  Closed *cand_LC;
  Closed *cand_RC;
  float score_cand_LC;
  float score_cand_RC;
  Closed *max_LC;
  Closed *max_RC;
  /* fill CLOSED: candidate open and closed */
  Closed *cand_C;
  Open *cand_O;
  float score_cand_C;
  float score_cand_O;
  Closed *max_C;
  Open *max_O;
  /* test: default edge label */
  /* int dft_label = maca_tags_get_code(ctx->cfg, "morpho", "fct", "__JOKER__"); */
  int dft_label = maca_alphabet_get_code(ctx->labels_alphabet, "__JOKER__");
  int length_class;
  /* feature vectors passed back into the scoring functions on each call,
     freed once at the end */
  feat_vector *fv_basic = NULL;
  feat_vector *fv_first = NULL;
  float pl_score; /* score of the unlabelled arc */

  for(span = 1; span <= n; span++){
    for(start = 0; start+span < n; start++){
      end = start + span;
      if(ctx->verbose_flag > 1) fprintf(stderr, "start = %d end = %d\n",start,end);
      length_class = maca_graph_parser_dep_count_table_compute_length_class(start,end);
      /* fill OPEN table*/
      for(dir=0; dir<2; dir++){
	/* la: the governor is the right end of the span; otherwise the left end */
	gov = (dir == la)? end: start;
	dep = (dir == la)? start: end;

	for(label=0; label<labels_nb; label++){

	  /* filter with min_dep_count */
	  if((ctx->dep_count_table[s->pos[gov]][s->pos[dep]][label][length_class][dir] >= ctx->min_dep_count) ||
	     (label == dft_label)){
	    /* init */
	    nb_cands = 0;
	    score_max = MINF;
	    max_LC = NULL;
	    max_RC = NULL;
	    m_argmax = -1; /* for debug msgs */

	    /* find best pair of CLOSED substructures */
	    for(m=start; m<end; m++){
	      /* if (start == 0 && m != 0) continue; */ /* mate */

	      /* left closed */
	      /* a missing item is accepted (score 0) only for the empty span m == start */
	      cand_LC = ctx->CLOSED[start][m][ra];
	      if((cand_LC != NULL) || (m == start)){
		score_cand_LC = cand_LC ? cand_LC->score : 0;
	      } else
		continue;

	      /* right closed */
	      /* a missing item is accepted (score 0) only for the empty span m+1 == end */
	      cand_RC = ctx->CLOSED[m+1][end][la];
	      if((cand_RC != NULL) || (m == end-1)){
		score_cand_RC = cand_RC ? cand_RC->score : 0;
	      } else
		continue;

	      /* only candidates reach this point */
	      nb_cands++;

	      if (ctx->verbose_flag > 1) fprintf(stderr, "\t\tcand = CLOSED[%d][%d](%f) + CLOSED[%d][%d](%f)\n",
			       start, m, score_cand_LC,
			       m+1, end, score_cand_RC);

	      /* update max */
	      if(nb_cands == 1){
		/* first candidate */
		score_max = score_cand_LC + score_cand_RC;
		max_LC = cand_LC;
		max_RC = cand_RC;
		m_argmax = m;
	      } else {
		if (score_cand_LC + score_cand_RC > score_max){
		  score_max = score_cand_LC + score_cand_RC;
		  max_LC = cand_LC;
		  max_RC = cand_RC;
		  m_argmax = m; /* for debug msgs */
		}
	      } /* end update max */
	    } /* end find best pair of CLOSED substructures */

	    /* create an OPEN if there is at least a candidate pair of CLOSED */
	    if(nb_cands > 0){
	      /* label */

	      if(ctx->store_in_feature_table){
		w = feat_table->lab[start][end][label][dir];
	      }
	      else{/*-- compute scores on the fly --*/
		fv_first = maca_graph_parser_first_score(gov, dep, label, ctx, fv_first, &w);
	      }
	      if(ctx->verbose_flag > 1){
		char bstart[128];
		char bend[128];
		maca_alphabet_get_symbol(ctx->words_alphabet,s->words[start], bstart, sizeof(bstart));
		maca_alphabet_get_symbol(ctx->words_alphabet,s->words[end], bend, sizeof(bend));
		fprintf(stderr, "\tscore(%s(%d) %s-%d-%s %s(%d)) = %f\n",
			bstart,
			start,
			(dir==la)?"<":"",
			label,
			(dir==la)?"":">",
			bend,
			end,
			w
			);
	      }

	      /* create an OPEN from the pair of CLOSED */

	      if(ctx->store_in_feature_table){
		pl_score = feat_table->pl[start][end][dir];
	      }
	      else{
		/*-- compute scores on the fly --*/
		fv_basic = maca_graph_parser_basic_score(gov, dep, ctx, fv_basic, &pl_score);
	      }
	      /* fprintf(stderr, "pl score = %f\n", pl_score); */

	      /* item score = best pair of sub-items + unlabelled arc + labelled arc */
	      ctx->OPEN[start][end][dir][label] = alloc_open(score_max + pl_score + w,
							     start, end, label, dir,
							     max_LC, max_RC);
	      if (ctx->verbose_flag > 1)
		fprintf(stderr, "\tOPEN[%d][%d][%d][%d](%f) = CLOSED[%d][%d][1], CLOSED[%d][%d][0]\n",
			start, end, dir, label, (score_max + pl_score + w), start, m_argmax, m_argmax+1, end);
	    } /* end check there is at least a candidate */

	  } /* end min_dep_count */
	} /* end for label */
      } /* end for dir */
      /* end fill OPEN table */


      /* fill CLOSED table */
      /* codes for ra and la are almost duplicates, but unrolled to:
	 - avoid having ternary operators on every line,
	 - and improve cache locality while accessing to OPEN[][],
	 thus saving CPU time.
      */

      /* RA */
      /* init */
      nb_cands = 0;
      score_max = MINF;
      max_C = NULL;
      max_O = NULL;
      /* m_argmax is also stored as the breakpoint of the created item */
      m_argmax = -1; /* for DBG */ /* MM: should be start+1 ? */
      label_argmax = -1; /* for DBG */

      for(m=start+1; m<=end; m++){

	for(label=0; label<labels_nb; label++){

	  cand_O = ctx->OPEN[start][m][ra][label];
	  if(cand_O != NULL)
	    score_cand_O = cand_O->score;
	  else
	    continue;

	  /* a missing closed item is accepted (score 0) only for the empty span m == end */
	  cand_C = ctx->CLOSED[m][end][ra];
	  if((cand_C != NULL) || (m == end))
	    score_cand_C = cand_C ? cand_C->score : 0;
	  else
	    continue;

	  /* only candidates reach this point */
	  nb_cands++;

	  if (ctx->verbose_flag > 1)
	    fprintf(stderr, "\t\tcand = OPEN[%d][%d][ra][%d](%f) + CLOSED[%d][%d][ra](%f)\n",
		    start, m, label, score_cand_O,
		    m, end, score_cand_C);

	  /* update max */
	  if(nb_cands == 1){
	    /* first candidate */
	    score_max = (score_cand_C + score_cand_O);
	    max_C = cand_C;
	    max_O = cand_O;
	    m_argmax = m; /* DBG */
	    label_argmax = label; /* DBG */
	  } else {
	    if((score_cand_C + score_cand_O) > score_max){
	      score_max = (score_cand_C + score_cand_O);
	      max_C = cand_C;
	      max_O = cand_O;
	      m_argmax = m; /* DBG */
	      label_argmax = label; /* DBG */
	    }
	  } /* end update max */
	} /* end for label */
      } /* end for m */

      /* create best closed */
      if(nb_cands > 0){
	ctx->CLOSED[start][end][ra] = alloc_closed(score_max, start, end, m_argmax, ra,
						   max_C, max_O);
	if (ctx->verbose_flag > 1)
	  fprintf(stderr, "\tCLOSED[%d][%d][ra](%f) = OPEN[%d][%d][ra][%d], CLOSED[%d][%d][ra]\n",
		  start,end,score_max,start,m_argmax,label_argmax,m_argmax,end);
      }
      /* end RA */

      /* LA */
      /* init */
      nb_cands = 0;
      score_max = MINF;
      max_C = NULL;
      max_O = NULL;
      m_argmax = -1; /* DBG */ /* MM: should be start ? */
      label_argmax = -1; /* DBG */

      for(m=start; m<end; m++){

	for(label=0; label<labels_nb; label++){

	  cand_O = ctx->OPEN[m][end][la][label];
	  if(cand_O != NULL)
	    score_cand_O = cand_O->score;
	  else
	    continue;

	  /* a missing closed item is accepted (score 0) only for the empty span m == start */
	  cand_C = ctx->CLOSED[start][m][la];
	  if((cand_C != NULL) || (m == start))
	    score_cand_C = cand_C ? cand_C->score : 0;
	  else
	    continue;

	  /* only candidates reach this point */
	  nb_cands++;

	  if (ctx->verbose_flag > 1)
	    fprintf(stderr, "\t\tcand = CLOSED[%d][%d][la](%f) + OPEN[%d][%d][la][%d](%f)\n",
		    start, m, score_cand_C,
		    m, end, label, score_cand_O);

	  /* update max */
	  if(nb_cands == 1){
	    score_max = (score_cand_C + score_cand_O);
	    max_C = cand_C;
	    max_O = cand_O;
	    m_argmax = m; /* DBG */
	    label_argmax = label; /* DBG */
	  } else {
	    if ( (score_cand_C + score_cand_O) > score_max){
	      score_max = (score_cand_C + score_cand_O);
	      max_C = cand_C;
	      max_O = cand_O;
	      m_argmax = m; /* DBG */
	      label_argmax = label; /* DBG */
	    }
	  } /* end update max */

	} /* end for label */
      } /* end for m */

      /* create best closed */
      if(nb_cands > 0){
	ctx->CLOSED[start][end][la] = alloc_closed(score_max, start, end, m_argmax, la,
						   max_C, max_O);

	if (ctx->verbose_flag > 1)
	  fprintf(stderr, "\tCLOSED[%d][%d][la](%f) = CLOSED[%d][%d][la], OPEN[%d][%d][la][%d]\n",
		  start,end,score_max,start,m_argmax,m_argmax,end,label_argmax);
      }
      /* end LA */
      /* end fill CLOSED table */

    } /* end start */
  } /* end span */

  /* release the feature vectors used for on-the-fly scoring */
  if(fv_basic){
    free_feat_vector(fv_basic);
    fv_basic = NULL;
  }
  if(fv_first){
    free_feat_vector(fv_first);
    fv_first = NULL;
  }
}

/*-------------------------------------------------------------------------------------------*/
maca_graph_parser_decoder1_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s)
{
  /*
   * Release the first-order decoder charts built by
   * maca_graph_parser_decoder1_init() for sentence `s`.
   *
   * ctx : parser context owning the OPEN and CLOSED charts.
   * s   : the sentence the charts were sized for (s->l positions).
   *
   * Every chart item is freed and its cell reset to NULL, then the
   * backing arrays are released.  The top-level arrays are released even
   * for an empty sentence, because init allocates them unconditionally.
   */
  int labels_nb = ctx->labels_nb;
  int sentence_length = s->l;
  int start;
  int end;
  int dir;   /* left or right */
  int label; /* label */

  /* free the items stored in the charts */
  for(start = 0; start < sentence_length; start++){
    for(end = 0; end < sentence_length; end++){
      for(dir = 0; dir < 2; dir++){
        /* closed */
        free(ctx->CLOSED[start][end][dir]);
        ctx->CLOSED[start][end][dir] = NULL;
        /* open */
        for(label = 0; label < labels_nb; label++){
          free(ctx->OPEN[start][end][dir][label]);
          ctx->OPEN[start][end][dir][label] = NULL;
        }
      }
    }
  }

  /* each dimension lives in one contiguous allocation whose address is
     stored in the first cell of the dimension above it */
  if(sentence_length > 0){
    free(ctx->OPEN[0][0][0]);
    free(ctx->OPEN[0][0]);
    free(ctx->OPEN[0]);
    free(ctx->CLOSED[0][0]);
    free(ctx->CLOSED[0]);
  }
  /* allocated by init whatever the sentence length */
  free(ctx->OPEN);
  free(ctx->CLOSED);
  ctx->OPEN = NULL;
  ctx->CLOSED = NULL;
}

/*-------------------------------------------------------------------------------------------*/

/*
 * malloc() wrapper used by the chart allocation below: prints a message
 * and exits when a non-empty request cannot be satisfied.  A NULL result
 * for a zero-byte request is not treated as an error.
 */
static void *decoder1_alloc_or_die(size_t size)
{
  void *p = malloc(size);

  if((p == NULL) && (size > 0)){
    fprintf(stderr, "maca_graph_parser_decoder1: cannot allocate %lu bytes\n", (unsigned long) size);
    exit(1);
  }
  return p;
}

/*-------------------------------------------------------------------------------------------*/

/*
 * Allocate and clear the first-order decoder charts for sentence `s`:
 *   ctx->CLOSED[start][end][dir]         (Closed *)
 *   ctx->OPEN[start][end][dir][label]    (Open *)
 * with start, end in [0, s->l), dir in {0, 1} and label in
 * [0, ctx->labels_nb).
 *
 * Each dimension is one contiguous allocation; the pointer tables of the
 * upper dimensions are wired to offsets inside the allocation of the
 * dimension below (same indexing as a nested per-row allocation, with a
 * constant number of malloc calls).  All cells start out NULL.
 * Must be paired with maca_graph_parser_decoder1_cleanup().
 */
void maca_graph_parser_decoder1_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s)
{
  int labels_nb = ctx->labels_nb;
  int sentence_length = s->l;
  int start;
  int end;
  int dir;   /* left or right */
  int label; /* label */
  size_t n = (sentence_length > 0) ? (size_t) sentence_length : 0;
  size_t nb_spans = n * n;                          /* (start, end) pairs */
  size_t nb_dirs = nb_spans * 2;                    /* (start, end, dir) triples */
  size_t nb_open = nb_dirs * (size_t) labels_nb;    /* OPEN leaf cells */
  size_t span_idx;                                  /* flat index of (start, end) */

  ctx->CLOSED = decoder1_alloc_or_die(sizeof(*ctx->CLOSED) * n);
  ctx->OPEN = decoder1_alloc_or_die(sizeof(*ctx->OPEN) * n);

  if(sentence_length > 0){
    ctx->CLOSED[0] = decoder1_alloc_or_die(sizeof(*ctx->CLOSED[0]) * nb_spans);
    ctx->CLOSED[0][0] = decoder1_alloc_or_die(sizeof(*ctx->CLOSED[0][0]) * nb_dirs);

    ctx->OPEN[0] = decoder1_alloc_or_die(sizeof(*ctx->OPEN[0]) * nb_spans);
    ctx->OPEN[0][0] = decoder1_alloc_or_die(sizeof(*ctx->OPEN[0][0]) * nb_dirs);
    ctx->OPEN[0][0][0] = decoder1_alloc_or_die(sizeof(*ctx->OPEN[0][0][0]) * nb_open);

    /* cell [0]...[0] of every table keeps pointing at the base of its
       allocation (offset 0), which is what cleanup relies on */
    for(start = 0; start < sentence_length; start++){
      ctx->CLOSED[start] = ctx->CLOSED[0] + (size_t) start * n;
      ctx->OPEN[start] = ctx->OPEN[0] + (size_t) start * n;
      for(end = 0; end < sentence_length; end++){
        span_idx = (size_t) start * n + (size_t) end;
        ctx->CLOSED[start][end] = ctx->CLOSED[0][0] + span_idx * 2;
        ctx->OPEN[start][end] = ctx->OPEN[0][0] + span_idx * 2;
        for(dir = 0; dir < 2; dir++){
          /* closed */
          ctx->CLOSED[start][end][dir] = NULL;
          /* open */
          ctx->OPEN[start][end][dir] = ctx->OPEN[0][0][0] + (span_idx * 2 + (size_t) dir) * (size_t) labels_nb;
          for(label = 0; label < labels_nb; label++){
            ctx->OPEN[start][end][dir][label] = NULL;
          }
        }
      }
    }
  }
}

+/*-------------------------------------------------------------------------------------------*/ + +void maca_graph_parser_decoder1_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table) +{ + Closed *bestSpan = NULL; + + maca_graph_parser_decoder1_decode(ctx, s, ctx->feature_table); + + bestSpan = ctx->CLOSED[0][s->l-1][ra]; + if(bestSpan != NULL){ + create_closed(bestSpan, s); + s->score = bestSpan->score; + } + +} diff --git a/maca_graph_parser/maca_graph_parser_decoder1.h b/maca_graph_parser/maca_graph_parser_decoder1.h new file mode 100644 index 0000000000000000000000000000000000000000..3354a3b7f50ae2e43dd54c75de661ffcdd1770f3 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_decoder1.h @@ -0,0 +1,30 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_DECODER_1__ +#define __MACA_GRAPH_PARSER_DECODER_1__ + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_graph_parser.h" +#include "maca_graph_parser_sentence.h" + + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_decoder2.c b/maca_graph_parser/maca_graph_parser_decoder2.c new file mode 100644 index 0000000000000000000000000000000000000000..6320294cb682a93aedd3bbba627cf6ac67b39dc5 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_decoder2.c @@ -0,0 +1,768 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#include<stdlib.h> +#include<stdio.h> +#include<time.h> +#include"maca_constants.h" +#include"maca_msg.h" +#include"maca_graph_parser_decoder.h" +#include"maca_graph_parser_features.h" +#include"maca_graph_parser.h" +#include "maca_graph_parser_dep_count_table.h" +#include "maca_graph_parser_feature_table.h" + + +/*-------------------------------------------------------------------------------------------*/ + +//Closed *CLOSED2 +//[MACA_MAX_LENGTH_SENTENCE] /* start */ +//[MACA_MAX_LENGTH_SENTENCE] /* end */ +//[2] /* position of the root (left or right)*/ +//[MACA_MAX_LENGTH_SENTENCE] /* position of a son of the root */ +//; + +//Open *OPEN2 +//[MACA_MAX_LENGTH_SENTENCE] /* start */ +//[MACA_MAX_LENGTH_SENTENCE] /* end */ +//[2] /* position of the root (left or right) */ +//[NB_LABELS] /* label */ +//; +/*-------------------------------------------------------------------------------------------*/ + +/*-------------------------------------------------------------------------------------------*/ +void maca_graph_parser_decoder2_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s) +{ + int labels_nb = ctx->labels_nb; + int sentence_length = s->l; + int start; + int end; + int dir; /* left or right */ + int label; /*label*/ + int m; + + for(start=0; start < sentence_length; start++){ + for(end=0; end < sentence_length; end++){ + for(dir=0; dir<2; dir++){ + /* closed */ + for(m=0; m<sentence_length; m++){ + free(ctx->CLOSED2[start][end][dir][m]); + ctx->CLOSED2[start][end][dir][m] = NULL; + } + /* open */ + for(label=0; label<labels_nb; label++){ + free(ctx->OPEN2[start][end][dir][label]); + ctx->OPEN2[start][end][dir][label] = NULL; + } + } + } + } + + // faster implementation + if(sentence_length > 0) { + free(ctx->OPEN2[0][0][0]); + free(ctx->OPEN2[0][0]); + free(ctx->OPEN2[0]); + free(ctx->OPEN2); + free(ctx->CLOSED2[0][0][0]); + free(ctx->CLOSED2[0][0]); + 
free(ctx->CLOSED2[0]); + free(ctx->CLOSED2); + } + // reference implementation + /*for(start = 0; start < sentence_length; start++) { + for(end = 0; end < sentence_length; end++) { + for(dir = 0; dir < 2; dir++) { + free(ctx->OPEN2[start][end][dir]); + } + free(ctx->OPEN2[start][end]); + } + free(ctx->OPEN2[start]); + } + free(ctx->OPEN2); + for(start = 0; start < sentence_length; start++) { + for(end = 0; end < sentence_length; end++) { + for(dir = 0; dir < 2; dir++) { + free(ctx->CLOSED2[start][end][dir]); + } + free(ctx->CLOSED2[start][end]); + } + free(ctx->CLOSED2[start]); + } + free(ctx->CLOSED2);*/ +} + +/*-------------------------------------------------------------------------------------------*/ +void maca_graph_parser_decoder2_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s) +{ + int labels_nb = ctx->labels_nb; + int sentence_length = s->l; + int start; + int end; + int dir; /* left or right */ + int label; /*label*/ + int m; /* breakpoint */ + + // faster implementation + ctx->CLOSED2 = malloc(sizeof(Closed****) * sentence_length); + ctx->OPEN2 = malloc(sizeof(Open****) * sentence_length); + if(sentence_length > 0) { + ctx->CLOSED2[0] = malloc(sizeof(Closed**) * sentence_length * sentence_length); + ctx->CLOSED2[0][0] = malloc(sizeof(Closed*) * sentence_length * sentence_length * 2); + ctx->CLOSED2[0][0][0] = malloc(sizeof(Closed*) * sentence_length * sentence_length * 2 * sentence_length); + for(start = 0; start < sentence_length; start++) { + ctx->CLOSED2[start] = ctx->CLOSED2[0] + start * sentence_length; + for(end = 0; end < sentence_length; end++) { + ctx->CLOSED2[start][end] = ctx->CLOSED2[0][0] + ((start * sentence_length) + end) * 2; + for(dir = 0; dir < 2; dir++) { + ctx->CLOSED2[start][end][dir] = ctx->CLOSED2[0][0][0] + ((((start * sentence_length) + end) * 2) + dir) * sentence_length; + } + } + } + ctx->OPEN2[0] = malloc(sizeof(Open***) * sentence_length * sentence_length); + ctx->OPEN2[0][0] = malloc(sizeof(Open**) * 
sentence_length * sentence_length * 2); + ctx->OPEN2[0][0][0] = malloc(sizeof(Open*) * sentence_length * sentence_length * 2 * ctx->labels_nb); + for(start = 0; start < sentence_length; start++) { + ctx->OPEN2[start] = ctx->OPEN2[0] + start * sentence_length; + for(end = 0; end < sentence_length; end++) { + ctx->OPEN2[start][end] = ctx->OPEN2[0][0] + ((start * sentence_length) + end) * 2; + for(dir = 0; dir < 2; dir++) { + ctx->OPEN2[start][end][dir] = ctx->OPEN2[0][0][0] + ((((start * sentence_length) + end) * 2) + dir) * ctx->labels_nb; + } + } + } + } + // reference implementation + /*ctx->CLOSED2 = malloc(sizeof(Closed****) * sentence_length); + for(start = 0; start < sentence_length; start++) { + ctx->CLOSED2[start] = malloc(sizeof(Closed***) * sentence_length); + for(end = 0; end < sentence_length; end++) { + ctx->CLOSED2[start][end] = malloc(sizeof(Closed**) * 2); + for(dir = 0; dir < 2; dir++) { + ctx->CLOSED2[start][end][dir] = malloc(sizeof(Closed*) * sentence_length); + } + } + } + ctx->OPEN2 = malloc(sizeof(Open****) * sentence_length); + for(start = 0; start < sentence_length; start++) { + ctx->OPEN2[start] = malloc(sizeof(Open***) * sentence_length); + for(end = 0; end < sentence_length; end++) { + ctx->OPEN2[start][end] = malloc(sizeof(Open**) * 2); + for(dir = 0; dir < 2; dir++) { + ctx->OPEN2[start][end][dir] = malloc(sizeof(Open*) * ctx->labels_nb); + } + } + }*/ + + for(start=0; start < sentence_length; start++){ + for(end=0; end < sentence_length; end++){ + for(dir=0; dir < 2; dir++){ + /* closed */ + for(m=0; m < sentence_length; m++){ + ctx->CLOSED2[start][end][dir][m] = NULL; + } + /* open */ + for(label=0; label < labels_nb; label++){ + ctx->OPEN2[start][end][dir][label] = NULL; + } + } + } + } + + /* create base substructures for dynamic programming: closed with span 0 */ + /* right attachments only for the fake root node at index 0 */ + ctx->CLOSED2[0][0][ra][0] = alloc_closed(0, 0, 0, 0, ra, NULL, NULL); + /* left and right attachments 
for the other nodes */ + for(start = 1; start < sentence_length; start++){ + ctx->CLOSED2[start][start][la][start] = alloc_closed(0, start, start, start, la, NULL, NULL); + ctx->CLOSED2[start][start][ra][start] = alloc_closed(0, start, start, start, ra, NULL, NULL); + } +} + +/*-------------------------------------------------------------------------------------------*/ +void maca_graph_parser_decoder2_decode(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table) +{ + int labels_nb = ctx->labels_nb; + int n = s->l; + int span; + int start; + int end; + int label; + int dir; /* left or right */ + int gov; + int dep; + int nb_cands; + float score_max; + Closed *max_LC; + Closed *max_RC; + int m_argmax; + int argmax_i_left; + int argmax_j_right; + int m; /*breakpoint (middle)*/ + int nb_cands_cands; + /* */ + Closed *cand_LC; + float score_cand_LC; + int i_LC; + int i; /* position of grandchildren or sibling for left closed*/ + Closed *cand_cand_LC; + float score_cand_cand_LC; + /* */ + Closed *cand_RC; + float score_cand_RC; + int j_RC; + int j; /* position of grandchildren or sibling for right closed*/ + Closed *cand_cand_RC; + float score_cand_cand_RC; + /* */ + float w; + float w_first; + float w_basic; + /* */ + Closed *max_C; + Open *max_O; + int argmax_i; + int label_argmax; + Open *cand_O; + float score_cand_O; + Closed *cand_C; + float score_cand_C; + float score_cand; + /* test: default edge label */ + /* int dft_label = maca_tags_get_code(ctx->cfg, "morpho", "fct", "__JOKER__"); */ + int dft_label = maca_alphabet_get_code(ctx->labels_alphabet, "__JOKER__"); + int length_class; + feat_vector *fv_basic = NULL; + feat_vector *fv_first = NULL; + feat_vector *fv_sibling = NULL; + feat_vector *fv_grandchildren = NULL; + + for(span = 1; span < n; span++){ + for(start=0; start < n-span; start++){ + end = start + span; + if (PRINT) printf("start = %d end = %d\n",start,end); + length_class = 
maca_graph_parser_dep_count_table_compute_length_class(start, end); + /* fill OPEN2 table*/ + for(label=0; label<labels_nb; label++){ + if (PRINT) printf("\t\t\tlabel = %d\n",label); + + for(dir=0; dir<2; dir++){ + gov = (dir == la)? end: start; + dep = (dir == la)? start: end; + + if((ctx->dep_count_table[s->pos[gov]][s->pos[dep]][label][length_class][dir] >= ctx->min_dep_count) || + (label == dft_label)){ + /* init */ + nb_cands = 0; + score_max = (float) MINF; + max_LC = NULL; + max_RC = NULL; + m_argmax = start; + argmax_i_left = start; + argmax_j_right = end; + + /* find best pair of CLOSED2 substructures */ + for(m=start; m<end; m++){ + + /* left closed */ + /* find the best candidate */ + nb_cands_cands = 0; + cand_LC = NULL; + score_cand_LC = (float) MINF; + i_LC = start; + for(i=start; i <= m; i++){ + cand_cand_LC = ctx->CLOSED2[start][m][ra][i]; + if((cand_cand_LC != NULL) || (m == start)){ + score_cand_cand_LC = cand_cand_LC ? cand_cand_LC->score : 0; + + if(ctx->store_in_feature_table){ + score_cand_cand_LC += (dir == la) ? 
feat_table->gra[start][end][i][dir][label] : feat_table->sib[start][end][i][dir][label]; + } + else{ + /*--compute scores on the fly ---*/ + if(dir == la){ + fv_grandchildren = maca_graph_parser_grandchildren_score(gov, dep, i, label, ctx, fv_grandchildren, &w); + score_cand_cand_LC += w; + } + else{ + fv_sibling = maca_graph_parser_sibling_score(gov, dep, i, label, ctx, fv_sibling, &w); + score_cand_cand_LC += w; + } + } + + + + nb_cands_cands++; + + /* update cand LC */ + if(nb_cands_cands == 1){ + score_cand_LC = score_cand_cand_LC; + cand_LC = cand_cand_LC; + i_LC = i; + } else { + if(score_cand_cand_LC > score_cand_LC){ + score_cand_LC = score_cand_cand_LC; + cand_LC = cand_cand_LC; + i_LC = i; + } + } + } + } + /* for this m, no MLC => no OPEN2 */ + if(nb_cands_cands == 0) + continue; + + /* right closed */ + /* find the best candidate */ + nb_cands_cands = 0; + cand_RC = NULL; + score_cand_RC = (float) MINF; + j_RC = end; + for(j=m+1; j <= end; j++){ + cand_cand_RC = ctx->CLOSED2[m+1][end][la][j]; + if((cand_cand_RC != NULL) || (m+1 == end)){ + score_cand_cand_RC = cand_cand_RC ? cand_cand_RC->score : 0; + + if(ctx->store_in_feature_table){ + score_cand_cand_RC += (dir == la) ? 
feat_table->sib[start][end][j][dir][label] : feat_table->gra[start][end][j][dir][label]; + } + else { + /* --- compute scores on the fly */ + if(dir == la){ + fv_sibling = maca_graph_parser_sibling_score(start, end, j, label, ctx, fv_sibling, &w); + score_cand_cand_RC += w; + } + else{ + fv_grandchildren = maca_graph_parser_grandchildren_score(start, end, j, label, ctx, fv_grandchildren, &w); + score_cand_cand_RC += w; + } + } + + nb_cands_cands++; + + /* update cand RC */ + if(nb_cands_cands == 1){ + score_cand_RC = score_cand_cand_RC; + cand_RC = cand_cand_RC; + j_RC = j; + } else { + if(score_cand_cand_RC > score_cand_RC){ + score_cand_RC = score_cand_cand_RC; + cand_RC = cand_cand_RC; + j_RC = j; + } + } + } + } + /* for this m, no MRC => no OPEN2 */ + if(nb_cands_cands == 0) + continue; + + /* only candidates reach this point */ + nb_cands++; + + if (ctx->verbose_flag > 4) fprintf(stderr, "\t\tcand = CLOSED2[%d][%d](%f) + CLOSED2[%d][%d](%f)\n", + start, m, score_cand_LC, + m+1, end, score_cand_RC); + + /* update max */ + if(nb_cands == 1){ + /* first candidate */ + score_max = score_cand_LC + score_cand_RC; + max_LC = cand_LC; + max_RC = cand_RC; + m_argmax = m; + argmax_i_left = i_LC; + argmax_j_right = j_RC; + } else { + if(score_cand_LC + score_cand_RC > score_max){ + score_max = score_cand_LC + score_cand_RC; + max_LC = cand_LC; + max_RC = cand_RC; + m_argmax = m; + argmax_i_left = i_LC; + argmax_j_right = j_RC; + } + } + + } /* end for m */ + + /* create OPEN2 if there is at least a candidate */ + if(nb_cands > 0){ + /* score from basic and first order features */ + if(ctx->store_in_feature_table){ + w = feat_table->pl[start][end][dir] + feat_table->lab[start][end][label][dir]; + } + else{ + /* compute scores on the fly */ + fv_first = maca_graph_parser_first_score(start, end, label, ctx, fv_first, &w_first); + fv_basic = maca_graph_parser_basic_score(gov, dep, ctx, fv_basic, &w_basic); + w = w_first + w_basic; + } + + if(ctx->verbose_flag > 4){ + char 
bstart[128]; + char bend[128]; + maca_alphabet_get_symbol(ctx->words_alphabet,s->words[start], bstart, sizeof(bstart)); + maca_alphabet_get_symbol(ctx->words_alphabet,s->words[end], bend, sizeof(bend)); + fprintf(stderr, "\tscore(%s(%d) %s-%d-%s %s(%d)) = %f\n", + bstart, start, + (dir==la)?"<":"", label, (dir==la)?"":">", + bend, end, + w); + } + + ctx->OPEN2[start][end][dir][label] = alloc_open(score_max + w, + start, end, label, dir, + max_LC, max_RC); + + if (ctx->verbose_flag > 4) + fprintf(stderr, "\tOPEN2[%d][%d][%d][%d](%f) = CLOSED2[%d][%d][1][%d], CLOSED2[%d][%d][0][%d]\n", + start, end, dir, label, (score_max + w), + start, m_argmax, argmax_i_left, + m_argmax+1, end, argmax_j_right); + } /* end check there is at least one candidate */ + + } /* end filter min_dep_count */ + } /* end dir */ + } /* end label */ + /* end OPEN2 table */ + + /* fill CLOSED2 table */ + int argmax_i; + for(m=start; m<=end; m++){ + /* la */ + dir = la; + if ((m < end) && (start != 0)){ + /* init */ + nb_cands = 0; + score_max = MINF; + max_C = NULL; + max_O = NULL; + argmax_i = start; // MM: unsure + label_argmax = -1; + + for(label=0; label<labels_nb; label++){ + for(i=m; i >= start; i--){ + + cand_O = ctx->OPEN2[m][end][dir][label]; + if(cand_O != NULL) + score_cand_O = cand_O->score; + else + continue; + + cand_C = ctx->CLOSED2[start][m][dir][i]; + if((cand_C != NULL) || (m == start)) + score_cand_C = cand_C ? 
cand_C->score : 0; + else + continue; + + /* only candidates reach this point */ + nb_cands++; + + score_cand = score_cand_O + score_cand_C; + if(ctx->store_in_feature_table){ + score_cand += feat_table->gra[m][end][i][dir][label]; + } + else{ + /* compute scores on the fly */ + fv_grandchildren = maca_graph_parser_grandchildren_score(m, end, i, label, ctx, fv_grandchildren, &w); + score_cand += w; + } + + + /* update max */ + if(nb_cands == 1){ + score_max = score_cand; + max_C = cand_C; + max_O = cand_O; + argmax_i = i; + label_argmax = label; + } else { + if(score_cand > score_max){ + score_max = score_cand; + max_C = cand_C; + max_O = cand_O; + argmax_i = i; + label_argmax = label; + } + } + + } + } /* end for label */ + + /* create best closed2 */ + if(nb_cands > 0){ + ctx->CLOSED2[start][end][dir][m] = alloc_closed(score_max, + start, end, m, dir, + max_C, max_O); + + if (ctx->verbose_flag > 4) + fprintf(stderr, "\tCLOSED2[%d][%d][%d][%d](%f) = OPEN2[%d][%d][%d][%d], CLOSED2[%d][%d][%d][%d]\n", + start, end, dir, m, score_max, + m, end, dir, label_argmax, + start, m, dir, argmax_i); + } + } + /* end la */ + + /* ra */ + dir = ra; + if (m > start){ + /* init */ + nb_cands = 0; + score_max = MINF; + max_C = NULL; + max_O = NULL; + argmax_i = end; // MM: unsure + label_argmax = -1; + + for(label=0; label<labels_nb; label++){ + for(i = m; i <= end; i++){ + + cand_O = ctx->OPEN2[start][m][dir][label]; + if(cand_O != NULL) + score_cand_O = cand_O->score; + else + continue; + + cand_C = ctx->CLOSED2[m][end][dir][i]; + if((cand_C != NULL) || (m == end)) + score_cand_C = cand_C ? 
cand_C->score : 0; + else + continue; + + /* only candidates reach this point */ + nb_cands++; + + score_cand = score_cand_O + score_cand_C; + if(ctx->store_in_feature_table){ + score_cand += feat_table->gra[start][m][i][dir][label]; + } + else{ + /* compute scores on the fly */ + fv_grandchildren = maca_graph_parser_grandchildren_score(start, m, i, label, ctx, fv_grandchildren, &w); + score_cand += w; + } + + /* update max */ + if(nb_cands == 1){ + score_max = score_cand; + max_C = cand_C; + max_O = cand_O; + argmax_i = i; + label_argmax = label; + } else { + if(score_cand > score_max){ + score_max = score_cand; + max_C = cand_C; + max_O = cand_O; + argmax_i = i; + label_argmax = label; + } + } + + } + } /* end for label */ + + /* create best closed2 */ + if(nb_cands > 0){ + ctx->CLOSED2[start][end][ra][m] = alloc_closed(score_max, + start, end, m, dir, + max_C, max_O); + + if (ctx->verbose_flag > 4) + fprintf(stderr, "\tCLOSED2[%d][%d][%d][%d](%f) = OPEN2[%d][%d][%d][%d], CLOSED2[%d][%d][%d][%d]\n", + start, end, dir, m, score_max, + start, m, dir, label_argmax, + m, end, dir, argmax_i); + } + } /* end ra */ + + } /* end m */ + } /* end start */ + } /* end span */ +} + +/*-------------------------------------------------------------------------------------------*/ + +void maca_graph_parser_decoder2_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table){ + Closed *bestSpan = NULL; + float max = (float) MINF; + float cand = (float) MINF; + int nb_cands; + int argmax; + int i; + + maca_graph_parser_decoder2_decode(ctx, s, ctx->feature_table); + + /* find best structure for the sentence */ + nb_cands = 0; + for(i=1; i < s->l; i++){ + + if(ctx->CLOSED2[0][s->l-1][ra][i] != NULL){ + cand = ctx->CLOSED2[0][s->l-1][ra][i]->score; + nb_cands++; + + /* update max */ + if(nb_cands == 1){ + max = cand; + argmax = i; + } else { + if(cand > max){ + max = cand; + argmax = i; + } + } + } + } + + if(nb_cands > 0){ + bestSpan 
= ctx->CLOSED2[0][s->l-1][ra][argmax]; + create_closed(bestSpan, s); + s->score = bestSpan->score; + } +} + + + /*fill CLOSED2 table ALEXIS*/ + // max = MINF; + // m_argmax = start; + // for(m=start; m<=end; m++){ + // for(label=0; label<L; label++){ + // cand = MINF; + // if(dir == la){ + // if(m < end){ + // cand = (OPEN2[m][end][dir][label]? OPEN2[m][end][dir][label]->score : 0) + // + (CLOSED2[start][m][dir]? CLOSED2[start][m][dir]->score: 0); + // if (PRINT) printf("\t\t\tcand = %f + %f\n", + // (OPEN2[m][end][dir][label]? OPEN2[m][end][dir][label]->score : 0), + // (CLOSED2[start][m][dir]? CLOSED2[start][m][dir]->score: 0)); + // } + // } + // else{ /* dir == ra */ + // if(m > start){ + // cand = (OPEN2[start][m][dir][label]? OPEN2[start][m][dir][label]->score: 0) + // + (CLOSED2[m][end][dir]? CLOSED2[m][end][dir]->score : 0); + // if (PRINT) printf("\t\t\tcand = %f + %f\n", + // (OPEN2[start][m][dir][label]? OPEN2[start][m][dir][label]->score: 0) , + // (CLOSED2[m][end][dir]? CLOSED2[m][end][dir]->score : 0)); + // } + // } + // if(max < cand){ + // max = cand; + // m_argmax = m; + // label_argmax = label; + // } + // } + // } + // if(dir == la){ + // CLOSED2[start][end][dir] = alloc_closed(max, + // start, + // end, + // m_argmax, + // dir, + // CLOSED2[start][m_argmax][dir], + // OPEN2[m_argmax][end][dir][label_argmax]); + // + // if (PRINT) printf("\t\t\t\tCLOSED2[%d][%d][%d](%f) = CLOSED2[%d][%d][%d], OPEN2[%d][%d][%d][%d]\n", start,end,dir,max,start,m_argmax,dir,m_argmax,end,dir,label_argmax); + // } + // else /* dir == ra */{ + // CLOSED2[start][end][dir] = alloc_closed(max, + // start, + // end, + // m_argmax, + // dir, + // CLOSED2[m_argmax][end][dir], + // OPEN2[start][m_argmax][dir][label_argmax]); + // if (PRINT) printf("\t\t\t\tCLOSED2[%d][%d][%d](%f) = CLOSED2[%d][%d][%d], OPEN2[%d][%d][%d][%d]\n", start,end,dir,max,m_argmax,end,dir,start,m_argmax,dir,label_argmax); + // } + // } + + 
/*-------------------------------------------------------------------------------------------*/ +// +//void parser(int n) +//{ +// int t; /*end*/ +// int s; /*start*/ +// int d; /*direction*/ +// int l; /*label*/ +// int m; /*breakpoint (middle)*/ +// int i; /* */ +// double MO; /*max open*/ +// double MLC; /*max left closed*/ +// double MRC; /*max right closed*/ +// double MC; /*max closed*/ +// double cand; +// +// for(t=0; t<n; t++){ +// for(s=t; s >=0; s--){ +// for(d=0; d<2; d++){ +// /* fill OPEN table*/ +// for(l=0; l<labels_nb; l++){ +// MO = MINF; +// for(m=s; m<=t; m++){ +// MLC = MINF; +// for(i=s; s<m; i++){ +// if(d==1) cand = CLOSED2[s][m][1][i] + SIB(s,t,i,0,l); +// else cand = CLOSED2[s][m][1][i] + GRA(t,s,i,1,l); +// if (cand > MLC) MLC=cand; +// } +// MRC = MINF; +// for(i=m; s<t; i++){ +// if(d==1) cand = CLOSED2[i][t][0][i] + GRA(s,t,i,0,l); +// else cand = CLOSED2[m][t][0][i] + SIB(t,s,i,1,l); +// if (cand > MRC) MRC=cand; +// } +// cand = MLC + MRC; +// if(MO < cand) MO = cand; +// } +// OPEN2[s][t][d][l]=MO; +// } +// +// +// /*fill CLOSED table*/ +// for(m=s; m<=t; m++){ +// CLOSED2[s][t][d][m] = MINF; +// for(l=0; l<L; l++){ +// if(d == 0){ +// MO = OPEN2[m][t][d][l]; +// MC = MINF; +// for(i=s; s<m; i++){ +// cand = CLOSED2[s][m][0][i] + GRA(t,m,i,1,l); +// if (cand > MC) MC=cand; +// } +// } +// else{ /* d == 1*/ +// MO = OPEN2[s][m][d][l]; +// MC = MINF; +// for(i=m; s<t; i++){ +// cand = CLOSED2[m][t][1][i] + GRA(s,m,i,0,l); +// if (cand > MC) MC=cand; +// } +// } +// cand = MO + MC; +// if(CLOSED2[s][t][d][m] < cand) CLOSED2[s][t][d][m] = cand; +// } +// } +// } +// } +// } +//} + diff --git a/maca_graph_parser/maca_graph_parser_decoder2.h b/maca_graph_parser/maca_graph_parser_decoder2.h new file mode 100644 index 0000000000000000000000000000000000000000..3b5e29676995ba9ef7b1b430167c13f82d0c51a6 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_decoder2.h @@ -0,0 +1,29 @@ 
+/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_DECODER2__ +#define __MACA_GRAPH_PARSER_DECODER2__ + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_graph_parser.h" +#include "maca_graph_parser_sentence.h" + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_dep_count_table.c b/maca_graph_parser/maca_graph_parser_dep_count_table.c new file mode 100644 index 0000000000000000000000000000000000000000..93e7c3c4e44a8822f30929095065ddea405cb07a --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_dep_count_table.c @@ -0,0 +1,154 @@ +/*************************************************************************** + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. 
+ + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +**************************************************************************/ + +#include "maca_graph_parser_dep_count_table.h" +#include "maca_msg.h" + +#include<stdio.h> +#include<stdlib.h> + +#define MAX_LENGTH_CLASSES 10 +#define LENGTH_CLASS_UNIT 4 + +int maca_graph_parser_dep_count_table_compute_length_class(int gov, int dep){ + int len; + if(gov > dep){ + len = (gov - dep) / LENGTH_CLASS_UNIT; + if (len < MAX_LENGTH_CLASSES) return len; else return MAX_LENGTH_CLASSES - 1; + } + else{ + len = (dep - gov) / LENGTH_CLASS_UNIT; + if (len < MAX_LENGTH_CLASSES) return len; else return MAX_LENGTH_CLASSES - 1; + } +} + +maca_graph_parser_dep_count_table maca_graph_parser_dep_count_table_allocate(int pos_nb, int synt_labels_nb) +{ + maca_graph_parser_dep_count_table t; + int gov, dep, label, length_class; + + t = malloc((size_t)pos_nb * sizeof(int ****)); + for(gov = 0; gov < pos_nb; gov++){ + t[gov] = malloc((size_t)pos_nb * sizeof(int ***)); + for(dep = 0; dep < pos_nb; dep++){ + t[gov][dep] = malloc((size_t)synt_labels_nb * sizeof(int **)); + for(label = 0; label < synt_labels_nb; label++){ + t[gov][dep][label] = malloc(MAX_LENGTH_CLASSES * sizeof (int *)); + for(length_class=0; length_class < MAX_LENGTH_CLASSES; length_class++){ + t[gov][dep][label][length_class] = malloc(2 * sizeof(int)); + t[gov][dep][label][length_class][0] 
= 0; + t[gov][dep][label][length_class][1] = 0; + } + } + } + } + return t; +} + +void maca_graph_parser_dep_count_table_free(int pos_nb, int synt_labels_nb, maca_graph_parser_dep_count_table t) +{ + int gov, dep, label, length_class; + + for(gov = 0; gov < pos_nb; gov++){ + for(dep = 0; dep < pos_nb; dep++){ + for(label = 0; label < synt_labels_nb; label++){ + for(length_class=0; length_class < MAX_LENGTH_CLASSES; length_class++){ + free(t[gov][dep][label][length_class]); + } + free(t[gov][dep][label]); + } + free(t[gov][dep]); + } + free(t[gov]); + } + free(t); +} + +maca_graph_parser_dep_count_table maca_graph_parser_dep_count_table_read(maca_graph_parser_ctx * ctx, char *filename) +{ + FILE *f; + maca_graph_parser_dep_count_table t; + int gov, dep, label, dir, count, length_class; + int fields_nb; + t = maca_graph_parser_dep_count_table_allocate(ctx->pos_nb, ctx->labels_nb); + f = fopen(filename, "rb"); + if(f == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "cannot open file %s\n", filename); + exit(1); + } + while((fields_nb = fscanf(f, "%d\t%d\t%d\t%d\t%d\t%d\n", &gov, &dep, &label, &length_class, &dir, &count)) != EOF){ + if(fields_nb != 6){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "wrong number of fields in file %s (expected %d, found %d)\n", filename, 6, fields_nb); + exit(1); + } + //fprintf(stderr, "%d %d %d %d %d %d\n", gov, dep, label, length_class, dir, count); + t[gov][dep][label][length_class][dir] = count; + } + fclose(f); + return t; +} + +void maca_graph_parser_dep_count_table_print(maca_graph_parser_ctx * ctx, char *filename) +{ + FILE *f; + maca_graph_parser_dep_count_table t = ctx->dep_count_table; + int gov, dep, label, dir, count, length_class; + + if(filename == NULL) + f = stdout; + else{ + f = fopen(filename, "w"); + if(f == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "cannot open file %s\n", filename); + exit(1); + } + } + for(gov = 0; gov < ctx->pos_nb; gov++){ + for(dep = 0; dep < 
ctx->pos_nb; dep++){ + for(label = 0; label < ctx->labels_nb; label++){ + for(length_class=0; length_class < MAX_LENGTH_CLASSES; length_class++){ + if(t[gov][dep][label][length_class][la])fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\n", gov, dep, label, length_class, la, t[gov][dep][label][length_class][la]); + if(t[gov][dep][label][length_class][ra])fprintf(f, "%d\t%d\t%d\t%d\t%d\t%d\n", gov, dep, label, length_class, ra, t[gov][dep][label][length_class][ra]); + } + } + } + } + if(filename) + fclose(f); +} + +void maca_graph_parser_dep_count_table_update(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s) +{ + int i; + int dir; + maca_graph_parser_dep_count_table t = ctx->dep_count_table; + int length_class; + + for(i=1; i < s->l; i++){ + dir = (s->gov[i] < i) ? ra : la; + if((s->pos[i] >= 0) && (s->label[i] >= 0) && (s->pos[s->gov[i]] >= 0)){ + length_class = maca_graph_parser_dep_count_table_compute_length_class(s->gov[i], i); + /* fprintf(stderr, "length class = %d\n", length_class); */ + t[s->pos[s->gov[i]]][s->pos[i]][s->label[i]][length_class][dir]++; + } + } +} diff --git a/maca_graph_parser/maca_graph_parser_dep_count_table.h b/maca_graph_parser/maca_graph_parser_dep_count_table.h new file mode 100644 index 0000000000000000000000000000000000000000..2e612131a73b2074626e4c1cfbdc953d9d52f754 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_dep_count_table.h @@ -0,0 +1,41 @@ +/*************************************************************************** + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +**************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_DEP_COUNT_TABLE__ +#define __MACA_GRAPH_PARSER_DEP_COUNT_TABLE__ + +#include "maca_graph_parser.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +maca_graph_parser_dep_count_table maca_graph_parser_dep_count_table_allocate(int pos_nb, int synt_labels_nb); +void maca_graph_parser_dep_count_table_free(int pos_nb, int synt_labels_nb, maca_graph_parser_dep_count_table t); +maca_graph_parser_dep_count_table maca_graph_parser_dep_count_table_read(maca_graph_parser_ctx *ctx, char *filename); +void maca_graph_parser_dep_count_table_print(maca_graph_parser_ctx * ctx, char *filename); +void maca_graph_parser_dep_count_table_update(maca_graph_parser_ctx * ctx, maca_graph_parser_sentence *s); +int maca_graph_parser_dep_count_table_compute_length_class(int gov, int dep); +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_eval_main.c b/maca_graph_parser/maca_graph_parser_eval_main.c new file mode 100644 index 0000000000000000000000000000000000000000..a2e806829ff1f7f5bc7a925c988caafbe5424bfd --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_eval_main.c @@ -0,0 +1,95 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. 
+ + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +**************************************************************************/ + +#include<getopt.h> +#include<unistd.h> +#include<stdio.h> +#include<string.h> + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_graph_parser.h" +#include "maca_graph_parser_sentence.h" +#include "maca_graph_parser_hash.h" +#include "maca_graph_parser_features.h" +#include "maca_graph_parser_decoder1.h" +#include "maca_graph_parser_conll2007_format.h" +#include "maca_graph_parser_metrics.h" +#include "maca_graph_parser_decoder.h" +#include "maca_graph_parser.h" + +/*-------------------------------------------------------------------------------------------*/ + +int main(int argc, char **argv) +{ + maca_graph_parser_ctx * ctx; + maca_graph_parser_sentence *ref = NULL; + maca_graph_parser_sentence *hyp = NULL; + int sent_num; + double sentence_las; + int total_correct_dep = 0; + int total_dep = 0; + double corpus_las; + /* kbest: oracle LAS */ + double sentence_las_oracle; + int total_correct_dep_oracle = 0; + double corpus_las_oracle; + + ctx = maca_graph_parser_LoadCTX(argc,argv); + + ref = maca_graph_parser_allocate_sentence(ctx); + hyp = maca_graph_parser_allocate_sentence(ctx); + FILE *conll_file = NULL; // TODO: fix + if(conll_file){ + sent_num = 0; + while(1){ + if(sent_num >= ctx->sent_nb) break; + + ref = 
maca_graph_parser_read_conll_sentence(ctx, conll_file, ref); + if(ref == NULL) break; + if(ref->l == 1) break; + if(ref->l >= ctx->max_sent_length) continue; + + hyp = maca_graph_parser_duplicate_sentence(ctx, ref, hyp); + maca_graph_parser_decoder_parse(ctx, hyp); + + /* metrics */ + sentence_las = maca_graph_parser_sentence_compute_las(ref, hyp, 0); + total_correct_dep += maca_graph_parser_sentence_compute_las(ref, hyp, 1); + total_dep += (hyp->l -1); /* MM: bugfix for compute_las: -1 to discount fake root */ + corpus_las = (double) total_correct_dep / (double) total_dep; + fprintf(stderr, "%d\t%f\t%f", sent_num, sentence_las, corpus_las); + if(ctx->k > 1){ + sentence_las_oracle = maca_graph_parser_sentence_compute_las_oracle(ref, hyp, ctx->k, 0); + total_correct_dep_oracle += maca_graph_parser_sentence_compute_las_oracle(ref, hyp, ctx->k, 1); + corpus_las_oracle = (double) total_correct_dep_oracle / (double) total_dep; + fprintf(stderr, "\t%f\t%f", sentence_las_oracle, corpus_las_oracle); + } + fprintf(stderr, "\n"); + + sent_num++; + } + printf("LAS = %f\n", corpus_las); + } + maca_graph_parser_free_sentence(hyp); + maca_graph_parser_free_sentence(ref); + + maca_graph_parser_free_all(ctx); + return 0; +} diff --git a/maca_graph_parser/maca_graph_parser_feature_counter.c b/maca_graph_parser/maca_graph_parser_feature_counter.c new file mode 100644 index 0000000000000000000000000000000000000000..f3ffd06b3d2a196abf8a3dceb763f4d500e7e699 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_counter.c @@ -0,0 +1,232 @@ +#include "maca_graph_parser_feature_counter.h" + +#define hash_func(key,size) (int)((key) % (feature_t)(size)) + + +feature_counter *feature_counter_new(int nb_elts, float coeff){ + /** + * Create a feature counter of size (nb_elts / coeff). 
+ */ + + feature_counter *c = malloc(sizeof(feature_counter)); + + c->size = (int)((double) nb_elts / coeff); + c->nb_elts = 0; + c->keys = malloc(sizeof(feature_t) * (size_t) c->size); + c->values = malloc(sizeof(int) * (size_t) c->size); + + if((c->keys == NULL) || (c->values == NULL)){ + fprintf(stderr, "feature_counter_new: mem alloc error\n"); + exit(1); + } + + int i; + for(i = 0; i < c->size; i++){ + c->keys[i] = 0; + c->values[i] = 0; + } + return c; +} + + +void feature_counter_destroy(feature_counter *c){ + /** + * Destroy feature counter. + */ + if(c == NULL) + return; + + free(c->keys); + c->keys = NULL; + free(c->values); + c->values = NULL; + free(c); +} + + +int feature_counter_get_index(feature_counter *c, feature_t f){ + /** + * (private function) get the index where a feature should be. + */ + + int index = hash_func(f, c->size); + /* open addressing, linear probing */ + int i; + for(i=0; i < c->size; i++){ + /* new key */ + if(c->keys[index] == 0) return index; + /* existing key */ + if(c->keys[index] == f) return index; + /* collision */ + index = (index + 1) % (c->size); + } + /* no index found iff the feature_counter is full */ + /* TODO: dynamically extend the table */ + fprintf(stderr, "Feature counter full (size = %d)\n", c->size); + exit(1); +} + + +void feature_counter_update_vector(feature_counter *c, feat_vector *v){ + /** + * Add counts from elements in v. + */ + + if(v == NULL) + return; + + int i; + for(i = 0; i < v->elt_nb; i++){ + feature_t f = v->array[i]; + int index = feature_counter_get_index(c, f); + /* insert new key */ + if(c->keys[index] == 0){ + c->keys[index] = f; + c->nb_elts += 1; + } + /* increment value for new or existing key */ + if(c->keys[index] == f){ + c->values[index] += 1; + } + } +} + + +void feature_counter_subtract_vector(feature_counter *c, feat_vector *v){ + /** + * Subtract counts from elements in v. 
+ */ + + if(v == NULL) + return; + + int i; + for(i = 0; i < v->elt_nb; i++){ + feature_t f = v->array[i]; + int index = feature_counter_get_index(c, f); + /* insert new key */ + if(c->keys[index] == 0){ + c->keys[index] = f; + c->nb_elts += 1; + } + /* decrement value for new or existing key */ + if(c->keys[index] == f){ + c->values[index] -= 1; + } + } +} + + +void feature_counter_update(feature_counter *c, feature_counter *other){ + /** + * Update the feature counter with feature counts from other. + */ + + if((c == NULL) || (other == NULL)) + return; + + int i; + for(i=0; i < other->size; i++){ + if(other->keys[i] != 0){ + /* get feature count */ + feature_t f = other->keys[i]; + int count = other->values[i]; + /* get index in c */ + int index = feature_counter_get_index(c, f); + /* add feature to c if necessary */ + if(c->keys[index] == 0){ + c->keys[index] = f; + c->nb_elts += 1; + } + /* add count from other */ + c->values[index] += count; + } + } +} + + +void feature_counter_subtract(feature_counter *c, feature_counter *other){ + /** + * Subtract from the feature counter feature counts from other. + */ + + if((c == NULL) || (other == NULL)) + return; + + int i; + for(i=0; i < other->size; i++){ + if(other->keys[i] != 0){ + /* get feature count */ + feature_t f = other->keys[i]; + int count = other->values[i]; + /* get index in c */ + int index = feature_counter_get_index(c, f); + /* add feature to c if necessary */ + if(c->keys[index] == 0){ + c->keys[index] = f; + c->nb_elts += 1; + } + /* add count from other */ + c->values[index] -= count; + } + } +} + + +int feature_counter_squared_norm(feature_counter *c){ + /** + * Compute the squared norm of c. + */ + + int result = 0; + + int i; + for(i=0; i < c->size; i++){ + int v = c->values[i]; + result += (v * v); + } + + return result; +} + + +feature_count_vector *feature_counter_items(feature_counter *c){ + /** + * Get a vector of feature counts. 
+ */ + + feature_count_vector *v = feature_count_vector_allocate(c->nb_elts); + + int i; + for(i = 0; i < c->size; i++){ + if(c->keys[i] != 0){ + /* append to vector */ + v->counts[v->nb_elts].key = c->keys[i]; + v->counts[v->nb_elts].value = c->values[i]; + v->nb_elts += 1; + } + } + + return v; +} + + +/* feature count vector */ +feature_count_vector *feature_count_vector_allocate(int size){ + feature_count_vector *v = malloc(sizeof(feature_count_vector)); + v->size = size; + v->nb_elts = 0; + v->counts = malloc(size * sizeof(feature_count)); + return v; +} + + +void feature_count_vector_free(feature_count_vector *v){ + if(v == NULL) + return; + + free(v->counts); + v->counts = NULL; + free(v); +} +/* end feature count vector */ diff --git a/maca_graph_parser/maca_graph_parser_feature_counter.h b/maca_graph_parser/maca_graph_parser_feature_counter.h new file mode 100644 index 0000000000000000000000000000000000000000..280ba9d5afdfc1c7febcb28b927037978f122f04 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_counter.h @@ -0,0 +1,43 @@ +#include "maca_graph_parser.h" +#include "maca_graph_parser_feature_vector.h" + + +typedef struct { + int size; + int nb_elts; + feature_t *keys; + int *values; +} feature_counter; + +typedef struct { + feature_t key; + int value; +} feature_count; + +typedef struct { + int size; + int nb_elts; + feature_count *counts; +} feature_count_vector; + + +#ifdef __cplusplus +extern "C"{ +#endif + + + +feature_counter *feature_counter_new(int nb_elts, float coeff); +void feature_counter_destroy(feature_counter *c); +void feature_counter_update_vector(feature_counter *c, feat_vector *v); +void feature_counter_subtract_vector(feature_counter *c, feat_vector *v); +void feature_counter_update(feature_counter *c, feature_counter *other); +void feature_counter_subtract(feature_counter *c, feature_counter *other); +int feature_counter_squared_norm(feature_counter *c); +feature_count_vector *feature_counter_items(feature_counter *c); + 
+feature_count_vector *feature_count_vector_allocate(int size); +void feature_count_vector_free(feature_count_vector *v); +#ifdef __cplusplus +} +#endif diff --git a/maca_graph_parser/maca_graph_parser_feature_counter_array.c b/maca_graph_parser/maca_graph_parser_feature_counter_array.c new file mode 100644 index 0000000000000000000000000000000000000000..074cdc6d67d77daffabf51130d1859a21f9e7dba --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_counter_array.c @@ -0,0 +1,221 @@ +#include "maca_graph_parser_hash.h" +#include "maca_graph_parser_features.h" +#include "maca_graph_parser_feature_vector.h" +#include "maca_graph_parser_feature_counter_array.h" + + +feature_counter_array *allocate_feature_counter_array(maca_graph_parser_ctx *ctx, int nb_words){ + /** + * Allocate an array of feature counters. + */ + + feature_counter_array *a = malloc(sizeof(feature_counter_array)); + a->size = 4; + int i; + for(i=0; i < a->size; i++){ + a->array[i] = NULL; + } + + a->nb_words = nb_words; + + if(ctx->basic_features){ + a->array[0] = a->basic_feature_counter = feature_counter_new(2 * nb_words * BasicFeatNb, 0.8); + } + if(ctx->first_features){ + a->array[1] = a->first_feature_counter = feature_counter_new(2 * nb_words * FirstOrderFeatNb, 0.8); + } + if(ctx->basic_features){ + a->array[2] = a->grandchildren_feature_counter = feature_counter_new(2 * nb_words * 2 * GrandchildrenFeatNb, 0.8); + } + if(ctx->basic_features){ + a->array[3] = a->sibling_feature_counter = feature_counter_new(2 * nb_words * SiblingFeatNb, 0.8); + } + + return a; +} + +void free_feature_counter_array(feature_counter_array *a){ + /** + * Free a. + */ + + if(a == NULL) + return; + + int i; + for(i = 0; i < a->size; i++){ + if(a->array[i]){ + feature_counter_destroy(a->array[i]); + } + } + free(a); +} + + +int feature_counter_array_squared_norm(feature_counter_array *a){ + /** + * Return the squared norm of the feature counter array a. 
+ */ + + int result = 0; + + int i; + for(i=0; i < a->size; i++){ + if (a->array[i] == NULL) + continue; + + result += feature_counter_squared_norm(a->array[i]); + } + + return result; +} + + +feature_counter_array *feature_counter_array_difference(maca_graph_parser_ctx *ctx, feature_counter_array *a, feature_counter_array *b){ + /** + * Return a new feature_counter_array with elements in a not in b. + */ + + feature_counter_array *result = allocate_feature_counter_array(ctx, a->nb_words); + + int i; + for(i=0; i < a->size; i++){ + feature_counter *c_a = a->array[i]; + feature_counter *c_b = b->array[i]; + if(c_a == NULL) + continue; + + feature_counter *c_res = result->array[i]; + feature_counter_update(c_res, c_a); + feature_counter_subtract(c_res, c_b); + } + + return result; +} + + +feature_counter_array *extract_features_from_parse_fca(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, feature_counter_array *a){ + /** + * Extract features from s and put them in an array of feature counters. + * + * Create a new array of feature counters if a is NULL. 
+ */ + + if(a == NULL){ + a = allocate_feature_counter_array(ctx, (s->l - 1)); + } else { /* reset a */ + free_feature_counter_array(a); + a = allocate_feature_counter_array(ctx, (s->l - 1)); + } + + feature_counter *c; + feat_vector *v; + int dep, gdep, sbl; + int g; + int i; + + if(ctx->basic_features){ + c = a->basic_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + v = basic(s, ctx, s->gov[dep], dep, v); + feature_counter_update_vector(c, v); + } + free_feat_vector(v); + v = NULL; + } + + if(ctx->first_features){ + c = a->first_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + v = first(s, ctx, s->gov[dep], dep, s->label[dep], v); + feature_counter_update_vector(c, v); + } + free_feat_vector(v); + v = NULL; + } + + if(ctx->sibling_features){ + c = a->sibling_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + // MM + sbl = -1; + g = s->gov[dep]; // governor + /* wanted sibling: child of g in [g..dep] that is closest to dep */ + if (g < dep) { /* ra */ + for(i=dep-1; i > g; i--){ + if(g == s->gov[i]){ // && (dep != i) + sbl = i; + break; + } + } + } else { /* la */ + for(i=dep+1; i < g; i++){ + if(g == s->gov[i]){ // (dep != i) && + sbl = i; + break; + } + } + } + /* sbl == -1 if no sibling */ + v = sibling(s, ctx, s->gov[dep], dep, sbl, s->label[dep], v); + feature_counter_update_vector(c, v); + } + free_feat_vector(v); + v = NULL; + } + + if(ctx->grandchildren_features){ + c = a->grandchildren_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + // MM + g = s->gov[dep]; + if (g < dep){ /* ra */ + /* cmi: inside [g;dep] */ + gdep = -1; + for(i=dep-1; i > g; i--){ + if(s->gov[i] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->label[dep], v); + feature_counter_update_vector(c, v); + /* cmo: outside [g;dep] */ + gdep = -1; + for(i=dep+1; i<s->l; i++){ + if(s->gov[i] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->label[dep], v); + feature_counter_update_vector(c, 
v); + } else { /* la */ + /* cmi: inside [dep;g] */ + gdep = -1; + for(i=dep+1; i < g; i++){ + if(s->gov[i] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->label[dep], v); + feature_counter_update_vector(c, v); + /* cmo: outside [dep;g] */ + gdep = -1; + for(i=dep-1; i>0; i--){ + if(s->gov[i] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->label[dep], v); + feature_counter_update_vector(c, v); + } + } + free_feat_vector(v); + v = NULL; + } + + return a; +} diff --git a/maca_graph_parser/maca_graph_parser_feature_counter_array.h b/maca_graph_parser/maca_graph_parser_feature_counter_array.h new file mode 100644 index 0000000000000000000000000000000000000000..157d901d5fd9b79556b06f2fcacf420f6d81d34b --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_counter_array.h @@ -0,0 +1,28 @@ +#include "maca_graph_parser_feature_counter.h" + + +typedef struct { + feature_counter *basic_feature_counter; + feature_counter *first_feature_counter; + feature_counter *grandchildren_feature_counter; + feature_counter *sibling_feature_counter; + feature_counter *array[4]; + int size; + int nb_words; /* nb of words of the sentence (extremely ad-hoc field) */ +} feature_counter_array; + +#ifdef __cplusplus +extern "C"{ +#endif + + +feature_counter_array *allocate_feature_counter_array(maca_graph_parser_ctx *ctx, int nb_words); +void free_feature_counter_array(feature_counter_array *a); + +int feature_counter_array_squared_norm(feature_counter_array *a); +feature_counter_array *feature_counter_array_difference(maca_graph_parser_ctx *ctx, feature_counter_array *a, feature_counter_array *b); +feature_counter_array *extract_features_from_parse_fca(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, feature_counter_array *a); + + #ifdef __cplusplus +} +#endif diff --git a/maca_graph_parser/maca_graph_parser_feature_table.c b/maca_graph_parser/maca_graph_parser_feature_table.c new file mode 100644 index 
0000000000000000000000000000000000000000..50166231677f14a2a11ffe202fbeff1de8d7f55f --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_table.c @@ -0,0 +1,461 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#include<stdlib.h> +#include <string.h> +#include"maca_graph_parser_feature_table.h" +#include"maca_graph_parser_features.h" +#include"maca_graph_parser.h" +#include"maca_graph_parser_feature_vector.h" +#include"maca_graph_parser_model.h" +#include "maca_graph_parser_dep_count_table.h" + + +/* compute on the fly the score of a basic configuration */ + +feat_vector *maca_graph_parser_basic_score(int gov, int dep, maca_graph_parser_ctx *ctx, feat_vector *fv, float *score) +{ + fv = basic(ctx->s, ctx, gov, dep, fv); + *score = score_feat_vector(fv, ctx->model); + return fv; +} + +/* compute on the fly the score of a first order configuration */ + +feat_vector *maca_graph_parser_first_score(int gov, int dep, int label, maca_graph_parser_ctx *ctx, feat_vector *fv, float *score) +{ + fv = first(ctx->s, ctx, gov, dep, label, fv); + *score = score_feat_vector(fv, ctx->model); + return fv; +} + +/* compute on the fly the score of a grandchildren configuration */ + +feat_vector *maca_graph_parser_grandchildren_score(int gov, int dep, int gdep, int label, maca_graph_parser_ctx *ctx, feat_vector *fv, float *score) +{ + fv = grandchildren(ctx->s, ctx, gov, dep, gdep, label, fv); + *score = score_feat_vector(fv, ctx->model); + return fv; +} + +/* compute on the fly the score of a sibling configuration */ + +feat_vector *maca_graph_parser_sibling_score(int gov, int dep, int sib, int label, maca_graph_parser_ctx *ctx, feat_vector *fv, float *score) +{ + fv = sibling(ctx->s, ctx, gov, dep, sib, label, fv); + *score = score_feat_vector(fv, ctx->model); + return fv; +} + + +void maca_graph_parser_feature_table_fill(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *d){ + + feat_vector *fv_basic = NULL; + feat_vector *fv_first = NULL; + feat_vector *fv_grandchildren = NULL; + feat_vector *fv_sibling = NULL; + + int labels_nb = ctx->labels_nb; + /* default edge 
label: joker */ + int dft_label = ctx->fct_joker; + + int w1,w2,w3,label,dir,gov,dep; + float score; + char s_na[3] = "NA"; + + int length_class; /* length class of the dependency beween gov and dep */ + + if(ctx->verbose_flag > 2){ + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "computing first order features\n"); + } + + + for(w1 = 0; w1 < s->l; w1++){ + for(w2 = w1+1; w2 < s->l; w2++){ + for(dir = 0; dir < 2; dir++){ + gov = (dir == ra) ? w1 : w2; + dep = (dir == ra) ? w2 : w1; + length_class = maca_graph_parser_dep_count_table_compute_length_class(gov, dep); + /* basic */ + if(ctx->basic_features){ + fv_basic = basic(s, ctx, gov, dep, fv_basic); + score = score_feat_vector(fv_basic, ctx->model); + d->pl[w1][w2][dir] = score; + + if(ctx->verbose_flag > 3){ + char bgov[128]; + char bdep[128]; + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[gov], bgov, sizeof(bgov)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[dep], bdep, sizeof(bdep)); + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "basic (%s, %s) ", bgov, bdep); + fprintf(stderr, "%d features, score = %f\n", fv_basic->elt_nb, score); + } + } /* end basic */ + + for(label = 0; label < ctx->labels_nb; label++){ + /* printf("pos w1 = %d w2 = %d pos w2 = %d label = %d \n", s->pos[w1], w2, s->pos[w2], label ); */ + if((ctx->dep_count_table[s->pos[gov]][s->pos[dep]][label][length_class][dir] >= ctx->min_dep_count) || + (label == dft_label)){ + /* first */ + if(ctx->first_features){ + fv_first = first(s, ctx, gov, dep, label, fv_first); + score = score_feat_vector(fv_first, ctx->model); + d->lab[w1][w2][label][dir] = score; + + if(ctx->verbose_flag > 3){ + char bw1[128]; + char bw2[128]; + char bpos1[128]; + char bpos2[128]; + char blabel[128]; + char blem1[128]; + char blem2[128]; + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w1], bw1, sizeof(bw1)); + if (s->lemmas[w1] != -1) { + maca_alphabet_get_symbol(ctx->words_alphabet, s->lemmas[w1], blem1, sizeof(blem1)); + } 
else { + strcpy(blem1, s_na); + } + maca_alphabet_get_symbol(ctx->pos_alphabet, s->pos[w1], bpos1, sizeof(bpos1)); + maca_alphabet_get_symbol(ctx->labels_alphabet, label, blabel, sizeof(blabel)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w2], bw2, sizeof(bw2)); + if (s->lemmas[w2] != -1) { + maca_alphabet_get_symbol(ctx->words_alphabet, s->lemmas[w2], blem2, sizeof(blem2)); + } else { + strcpy(blem2, s_na); + } + maca_alphabet_get_symbol(ctx->pos_alphabet, s->pos[w2], bpos2, sizeof(bpos2)); + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "first (%s,%s,%s) %s-%s-%s, (%s,%s,%s) ", + bw1, blem1, bpos1, + (dir == ra) ? "" : "<", blabel, (dir == ra) ? ">" : "", + bw2, blem2, bpos2); + fprintf(stderr, "%d features, score = %f\n", fv_first->elt_nb, score); + } + } /* end first */ + } /* min_dep_count pruning */ + } /* end label */ + + } /* end dir */ + } /* end for w2 */ + } /* end for w1 */ + + + //GHASEM: in generating grandchildren information, we need w3 to be inside and outside of the + //range between w1 and w2!! + //for sibling the situation won't be changed, and w3 will be just inside the range!! 
+ if(ctx->verbose_flag > 2){ + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "computing second order features\n"); + } + + + if(ctx->sibling_features || ctx->grandchildren_features){ + for(w1 = 0; w1 < s->l; w1++){ + for(w2 = w1+1; w2 < s->l; w2++){ + + length_class = maca_graph_parser_dep_count_table_compute_length_class(w1, w2); + + for(label = 0; label < ctx->labels_nb; label++){ + /* ra */ + if((ctx->dep_count_table[s->pos[w1]][s->pos[w2]][label][length_class][ra] >= ctx->min_dep_count) || + (label == dft_label)){ + /* for projectivity to be enforced, w3 cannot be before w1 */ + for(w3 = w1; w3 < s->l; w3++){ //GHASEM OLD --> for(w3 = w1; w3 <= w2; w3++){ + + /* grandchildren */ + if(ctx->grandchildren_features){ + if ((w3 != w1) && (w3 != w2)) + fv_grandchildren = grandchildren(s, ctx, w1, w2, w3, label, fv_grandchildren); + else + fv_grandchildren = grandchildren(s, ctx, w1, w2, -1, label, fv_grandchildren); + score = score_feat_vector(fv_grandchildren, ctx->model); + d->gra[w1][w2][w3][ra][label] = score; + + if(ctx->verbose_flag > 3){ + char bw1[128]; + char bw2[128]; + char bw3[128]; + char blabel[128]; + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w1], bw1, sizeof(bw1)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w2], bw2, sizeof(bw2)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w3], bw2, sizeof(bw3)); + maca_alphabet_get_symbol(ctx->words_alphabet, label, blabel, sizeof(blabel)); + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "grand children ([%s], %s, %s, %s) ", bw1, blabel, bw2, bw3); + fprintf(stderr, "%d features, score = %f\n", fv_grandchildren->elt_nb, score); + } + } + /* end grandchildren */ + + /* siblings */ + // if((w1 <= w3) && (w3 <= w2)){//GHASEM : here I just checked if w3 is between w1 and w2 or not? 
+ if((ctx->sibling_features) && (w3 <= w2)){ + if ((w3 != w1) && (w3 != w2)) + fv_sibling = sibling(s, ctx, w1, w2, w3, label, fv_sibling); + else + fv_sibling = sibling(s, ctx, w1, w2, -1, label, fv_sibling); + score = score_feat_vector(fv_sibling, ctx->model); + d->sib[w1][w2][w3][ra][label] = score; + + if(ctx->verbose_flag > 3){ + char bw1[128]; + char bw2[128]; + char bw3[128]; + char blabel[128]; + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w1], bw1, sizeof(bw1)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w2], bw2, sizeof(bw2)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w3], bw2, sizeof(bw3)); + maca_alphabet_get_symbol(ctx->words_alphabet, label, blabel, sizeof(blabel)); + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "sibling ([%s], %s, %s, %s) ", bw1, blabel, bw2, bw3); + fprintf(stderr, "%d features, score = %f\n", fv_sibling->elt_nb, score); + } + } + /* end siblings */ + } /* end for w3 */ + } /* end ra */ + + /* la */ + if((ctx->dep_count_table[s->pos[w2]][s->pos[w1]][label][length_class][la] >= ctx->min_dep_count) || + (label == dft_label)){ + /* for projectivity to be enforced, w3 cannot be after w2 */ + for(w3 = 0; w3 <= w2; w3++){ //GHASEM OLD --> for(w3 = w1; w3 <= w2; w3++){ + /* grandchildren */ + if(ctx->grandchildren_features){ // && (w3 < w2) + if ((w3 != w1) && (w3 != w2)) + fv_grandchildren = grandchildren(s, ctx, w2, w1, w3, label, fv_grandchildren); + else + fv_grandchildren = grandchildren(s, ctx, w2, w1, -1, label, fv_grandchildren); + score = score_feat_vector(fv_grandchildren, ctx->model); + d->gra[w1][w2][w3][la][label] = score; + + if(ctx->verbose_flag > 3){ + char bw1[128]; + char bw2[128]; + char bw3[128]; + char blabel[128]; + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w1], bw1, sizeof(bw1)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w2], bw2, sizeof(bw2)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w3], bw2, sizeof(bw3)); + 
maca_alphabet_get_symbol(ctx->words_alphabet, label, blabel, sizeof(blabel)); + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "grand children (%s, %s, [%s], %s) ", bw1, blabel, bw2, bw3); + fprintf(stderr, "%d features, score = %f\n", fv_grandchildren->elt_nb, score); + } + } + /* end grandchildren */ + + /* siblings */ + // GHASEM : here I just checked if w3 is between w1 and w2 or not? + if((ctx->sibling_features) && (w3 >= w1)){ + if ((w3 != w1) && (w3 != w2)) + fv_sibling = sibling(s, ctx, w2, w1, w3, label, fv_sibling); + else + fv_sibling = sibling(s, ctx, w2, w1, -1, label, fv_sibling); + score = score_feat_vector(fv_sibling, ctx->model); + d->sib[w1][w2][w3][la][label] = score; + + if(ctx->verbose_flag > 3){ + char bw1[128]; + char bw2[128]; + char bw3[128]; + char blabel[128]; + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w1], bw1, sizeof(bw1)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w2], bw2, sizeof(bw2)); + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[w3], bw2, sizeof(bw3)); + maca_alphabet_get_symbol(ctx->words_alphabet, label, blabel, sizeof(blabel)); + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "sibling (%s, %s, [%s], %s) ", bw1, blabel, bw2, bw3); + fprintf(stderr, "%d features, score = %f\n", fv_sibling->elt_nb, score); + } + } + /* end siblings */ + + } /* end for w3 */ + } /* end la */ + } /* end for label */ + } /* end for w2 */ + } /* end for w1 */ + } /* end second order features */ + + /* house cleaning */ + if(fv_basic){ + free_feat_vector(fv_basic); + fv_basic = NULL; + } + if(fv_first){ + free_feat_vector(fv_first); + fv_first = NULL; + } + if(fv_grandchildren){ + free_feat_vector(fv_grandchildren); + fv_grandchildren = NULL; + } + if(fv_sibling){ + free_feat_vector(fv_sibling); + fv_sibling = NULL; + } + +} + + + +/*-------------------------------------------------------------------------------------------*/ + + +void maca_graph_parser_feature_table_free(maca_graph_parser_ctx 
*ctx) +{ + int i,j,k,l; + maca_graph_parser_feature_table *d = ctx->feature_table; + int length = d->len; + int types = d->typesLen; + + if(ctx->basic_features){ + free(d->pl[0][0]); + free(d->pl[0]); + free(d->pl); + } + + if(ctx->first_features){ + free(d->lab[0][0][0]); + free(d->lab[0][0]); + free(d->lab[0]); + free(d->lab); + } + + if(ctx->grandchildren_features){ + free(d->gra[0][0][0][0]); + free(d->gra[0][0][0]); + free(d->gra[0][0]); + free(d->gra[0]); + free(d->gra); + } + + if(ctx->sibling_features){ + free(d->sib[0][0][0][0]); + free(d->sib[0][0][0]); + free(d->sib[0][0]); + free(d->sib[0]); + free(d->sib); + } + + free(d); +} + +/*-------------------------------------------------------------------------------------------*/ + +void maca_graph_parser_feature_table_allocator(maca_graph_parser_ctx *ctx) +{ + maca_graph_parser_feature_table *d = malloc(sizeof(maca_graph_parser_feature_table)); + if(d == NULL){ + fprintf(stderr, "memory allocation error\n"); + exit(1); + } + + int types = d->typesLen = ctx->labels_nb; + int length = d->len = ctx->max_sent_length; + + int i,j,k,l; + d->pl = NULL; + d->lab = NULL; + d->sib = NULL; + d->gra = NULL; + + if(ctx->basic_features){ + d->pl = malloc((size_t) length * sizeof(float **)); + d->pl[0] = malloc((size_t) (length * length) * sizeof(float *)); + d->pl[0][0] = malloc((size_t) (length * length * 2) * sizeof(float)); + for(i=0; i<length; i++){ + d->pl[i] = d->pl[0] + (i * length); + for(j=0; j<length; j++){ + d->pl[i][j] = d->pl[0][0] + ((i * length) + j) * 2; + } + } + } + + if(ctx->first_features){ + /* [start][end][label][dir] */ + d->lab = malloc((size_t) length * sizeof(float ***)); + d->lab[0] = malloc((size_t) (length * length) * sizeof(float **)); + d->lab[0][0] = malloc((size_t) (length * length * types * sizeof(float *))); + d->lab[0][0][0] = malloc((size_t) (length * length * types * 2 * sizeof(float))); + for(i=0; i<length; i++){ + d->lab[i] = d->lab[0] + (i * length); + for(j=0; j<length; j++){ + 
d->lab[i][j] = d->lab[0][0] + ((i * length) + j) * types; + for(k=0; k<types; k++){ + d->lab[i][j][k] = d->lab[0][0][0] + (((i * length) + j) * types + k) * 2; + } + } + } + } + + if(ctx->sibling_features){ + /* [gov][dep][sib][dir][label] */ + d->sib = malloc((size_t) length * sizeof(float ****)); + d->sib[0] = malloc((size_t) (length * length) * sizeof(float ***)); + d->sib[0][0] = malloc((size_t) (length * length * length) * sizeof(float **)); + d->sib[0][0][0] = malloc((size_t) (length * length * length * 2) * sizeof(float *)); + d->sib[0][0][0][0] = malloc((size_t) (length * length * length * 2 * types) * sizeof(float)); + for(i=0; i<length; i++){ + d->sib[i] = d->sib[0] + (i * length); + for(j=0; j<length; j++){ + /* for(j=i+1; j<length; j++){ */ /* MM: start at i+1 because siblings are used in open only and open cannot have span 0 */ + d->sib[i][j] = d->sib[0][0] + (i * length + j) * length; + for(k=0; k<length; k++){ + /* for(k=i; k<=j; k++){ */ /* MM: sib in [i..j] */ + d->sib[i][j][k] = d->sib[0][0][0] + ((i * length + j) * length + k) * 2; + for(l=0; l<2; l++){ + d->sib[i][j][k][l] = d->sib[0][0][0][0] + (((i * length + j) * length + k) * 2 + l) * types; + } + } + } + } + } + + if(ctx->grandchildren_features){ + /* [gov][dep][gra][dir][label] */ + d->gra = malloc((size_t) length * sizeof(float ****)); + d->gra[0] = malloc((size_t) (length * length) * sizeof(float ***)); + d->gra[0][0] = malloc((size_t) (length * length * length) * sizeof(float **)); + d->gra[0][0][0] = malloc((size_t) (length * length * length * 2) * sizeof(float *)); + d->gra[0][0][0][0] = malloc((size_t) (length * length * length * 2 * types) * sizeof(float)); + for(i=0; i<length; i++){ + d->gra[i] = d->gra[0] + (i * length); + for(j=0; j<length; j++){ + /* for(j=i; j<length; j++){ */ /* MM: start at i because grandchildren are used in (open and) closed and closed can have span 0 */ + d->gra[i][j] = d->gra[0][0] + (i * length + j) * length; + for(k=0; k<length; k++){ + d->gra[i][j][k] 
= d->gra[0][0][0] + ((i * length + j) * length + k) * 2; + for(l=0; l<2; l++){ + d->gra[i][j][k][l] = d->gra[0][0][0][0] + (((i * length + j) * length + k) * 2 + l) * types; + } + } + } + } + } + + ctx->feature_table = d; +} + +/*-------------------------------------------------------------------------------------------*/ +/*-------------------------------------------------------------------------------------------*/ diff --git a/maca_graph_parser/maca_graph_parser_feature_table.h b/maca_graph_parser/maca_graph_parser_feature_table.h new file mode 100644 index 0000000000000000000000000000000000000000..e39b1cbb7e2f1edbc8de0bc0ae3173fb6ef600dd --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_table.h @@ -0,0 +1,44 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_FEATURE_TABLE__ +#define __MACA_GRAPH_PARSER_FEATURE_TABLE__ + +#include"maca_graph_parser_sentence.h" +#include"maca_graph_parser.h" +#include"maca_graph_parser_feature_vector.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +void maca_graph_parser_feature_table_allocator(maca_graph_parser_ctx *ctx); +void maca_graph_parser_feature_table_fill(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *d); +void maca_graph_parser_feature_table_free(maca_graph_parser_ctx *ctx); + +feat_vector *maca_graph_parser_basic_score(int gov, int dep, maca_graph_parser_ctx *ctx, feat_vector *fv_basic, float *score); +feat_vector *maca_graph_parser_first_score(int gov, int dep, int label, maca_graph_parser_ctx *ctx, feat_vector *fv_first, float *score); +feat_vector *maca_graph_parser_grandchildren_score(int gov, int dep, int gdep, int label, maca_graph_parser_ctx *ctx, feat_vector *fv, float *score); +feat_vector *maca_graph_parser_sibling_score(int gov, int dep, int sib, int label, maca_graph_parser_ctx *ctx, feat_vector *fv, float *score); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/maca_graph_parser/maca_graph_parser_feature_vector.c b/maca_graph_parser/maca_graph_parser_feature_vector.c new file mode 100644 index 0000000000000000000000000000000000000000..e3ebee40d052dbdcaa92dabfa0d8b802f44a8d46 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_vector.c @@ -0,0 +1,101 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. 
+ + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#include"maca_graph_parser.h" +#include"maca_graph_parser_features.h" +#include"maca_graph_parser_feature_vector.h" + + +feat_vector *allocate_feat_vector(int size) +{ + feat_vector *v = malloc(sizeof(feat_vector)); + v->array = malloc((size_t)size * sizeof(feature_t)); + v->size = size; + v->elt_nb = 0; + return v; +} + +void free_feat_vector(feat_vector *v) +{ + if(v){ + free(v->array); + free(v); + } +} + + +void print_feat_vector(maca_graph_parser_ctx *ctx, feat_vector *fv){ + /** + * Print a feature vector on stderr (for debug). 
+ */ + + if(fv == NULL) + return; + + int i; + for(i=0; i < fv->elt_nb; i++){ + maca_graph_parser_print_feature(stderr, ctx, fv->array[i]); + fprintf(stderr, "\n"); + } +} + + +feat_matrix *allocate_feat_matrix(int lines, int columns) +{ + int i; + feat_matrix *m = malloc(sizeof(feat_matrix)); + m->array = malloc((size_t)lines * sizeof(feat_vector*)); + for(i=0; i < lines; i++) + m->array[i] = allocate_feat_vector(columns); + m->size = lines; + m->vector_nb = 0; + return m; +} + +void free_feat_matrix(feat_matrix *m) +{ + if(m){ + int i; + for(i=0; i < m->size; i++){ + free_feat_vector(m->array[i]); + } + free(m->array); + free(m); + } +} + + +void print_feat_matrix(maca_graph_parser_ctx *ctx, feat_matrix *fm){ + /** + * Print a feature matrix on stderr (for debug). + */ + + if(fm == NULL) + return; + + int i; + for(i=0; i < fm->vector_nb; i++){ + feat_vector *fv = fm->array[i]; + int j; + for(j=0; j < fv->elt_nb; j++){ + maca_graph_parser_print_feature(stderr, ctx, fv->array[j]); + fprintf(stderr, "\n"); + } + } +} diff --git a/maca_graph_parser/maca_graph_parser_feature_vector.h b/maca_graph_parser/maca_graph_parser_feature_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..ce2bf31a1c611c0c2ca0c1464006be3e98010d7d --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_feature_vector.h @@ -0,0 +1,65 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_FEATURE_VECTOR__ +#define __MACA_GRAPH_PARSER_FEATURE_VECTOR__ + +#include "maca_common.h" +#include "maca_constants.h" +//#include "maca_tags.h" +#include "maca_msg.h" +#include "maca_graph_parser.h" + +/* feature vector */ +typedef struct { + feature_t *array; + int size; + int elt_nb; +}feat_vector; + + +/* feature matrix */ +typedef struct { + feat_vector **array; + int size; + int vector_nb; +}feat_matrix; + + +#ifdef __cplusplus +extern "C"{ +#endif + +feat_vector *allocate_feat_vector(int size); +void free_feat_vector(feat_vector *v); +void print_feat_vector(maca_graph_parser_ctx *ctx, feat_vector *fv); + + + +feat_matrix *allocate_feat_matrix(int lines, int columns); +void free_feat_matrix(feat_matrix *m); +void print_feat_matrix(maca_graph_parser_ctx *ctx, feat_matrix *fm); + +#ifdef __cplusplus +} +#endif + + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_features.c b/maca_graph_parser/maca_graph_parser_features.c new file mode 100644 index 0000000000000000000000000000000000000000..9b04fae44fb5ff69a5355785dbb4aa3894fca393 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_features.c @@ -0,0 +1,1775 @@ +/*************************************************************************** + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. 
+ + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +**************************************************************************/ + +#include "maca_graph_parser_features.h" + +#include <math.h> +#include <stdint.h> + +#define maca_graph_parser_extract_bits(f,begin,end) (((f) << (63 - (end))) >> (63 + (begin) - (end))) + +/*maca_graph_parser_extract_bits(feature_t f, int begin, int end) +{ + return ((f << (63 - end)) >> (63 + begin - end)); + }*/ + +int maca_graph_parser_get_feature_type(feature_t feat, maca_graph_parser_templ_library *tl) +{ + return (int) maca_graph_parser_extract_bits(feat, tl->type_start, tl->type_end); +} + +templ *maca_graph_parser_get_templ(feature_t feat, maca_graph_parser_templ_library *tl) +{ + return tl->type2templ[maca_graph_parser_get_feature_type(feat, tl)]; +} + +int maca_graph_parser_get_feature_label(feature_t feat, maca_graph_parser_templ_library *tl) +{ + int type = maca_graph_parser_get_feature_type(feat, tl); + return (int) maca_graph_parser_extract_bits(feat, tl->type2templ[type]->label_start, tl->type2templ[type]->label_end); +} + +int maca_graph_parser_get_feature_direction(feature_t feat, maca_graph_parser_templ_library *tl) +{ + int type = maca_graph_parser_get_feature_type(feat, tl); + return (int) maca_graph_parser_extract_bits(feat, tl->type2templ[type]->direction_start, tl->type2templ[type]->direction_end); +} + +feature_t 
maca_graph_parser_get_feature_hash_key(feature_t feat, maca_graph_parser_templ_library *tl) +{ + int type = maca_graph_parser_get_feature_type(feat, tl); + return maca_graph_parser_extract_bits(feat, tl->type2templ[type]->hash_key_start, tl->type2templ[type]->hash_key_end); +} + +void maca_graph_parser_decompose_feature(feature_t feat, int *direction, int *label, feature_t *hash_key, maca_graph_parser_templ_library *tl) +{ + int type = maca_graph_parser_get_feature_type(feat, tl); + *direction = (int) maca_graph_parser_extract_bits(feat, tl->type2templ[type]->direction_start, tl->type2templ[type]->direction_end); + *label = (int) maca_graph_parser_extract_bits(feat, tl->type2templ[type]->label_start, tl->type2templ[type]->label_end); + *hash_key = maca_graph_parser_extract_bits(feat, tl->type2templ[type]->hash_key_start, tl->type2templ[type]->hash_key_end); +} + +int maca_graph_parser_compute_bits_number(int v) +{ + if (v == 0) return 0; + return (int)(log(v) / log(2)) + 1; +} + +void maca_graph_parser_print_feature_bin(FILE *f, feature_t feat) +{ + int i; + for(i=63; i >=0; i--) + fprintf(f, "%d", (int)((feat & ((feature_t)1 << i))>>i)); +} + +void maca_graph_parser_decode_feature(feature_t f, templ *t) +{ + int field; + for(field=0; field < t->field_nb; field++) + t->value[field] = maca_graph_parser_extract_bits(f, t->start[field], t->end[field]); +} + +void maca_graph_parser_print_templ_field(FILE *f, templ *t, int field, maca_graph_parser_ctx *ctx) +{ + char pos_str[128]; + char fct_str[128]; + char word_str[128]; + char subcat_str[128]; + char type = t->type[field]; + int value = t->value[field]; + + if(type == 's'){ + fprintf(f, "\t%d", value); + } + if(type == 't'){ + fprintf(f, "\t%d", value); + } + if(type == 'd'){ + if(value == ra) + fprintf(f, "\t->"); + else + fprintf(f, "\t<-"); + } + if(type == 'f'){ + if(maca_alphabet_get_symbol(ctx->labels_alphabet, value, fct_str, sizeof(fct_str))) + fprintf(f, "\t%s", fct_str); + else + fprintf(f, "\t(null)"); + } 
+ if(type == 'p'){ + if(maca_alphabet_get_symbol(ctx->pos_alphabet, value, pos_str, sizeof(pos_str))) + fprintf(f, "\t%s", pos_str); + else + fprintf(f, "\t(null)"); + } + if(type == 'w'){ + if (maca_alphabet_get_symbol(ctx->words_alphabet, value, word_str, sizeof(word_str))) + fprintf(f, "\t%s", word_str); + else + fprintf(f, "\t(null)"); + } + + if(type == 'l'){ + if (maca_alphabet_get_symbol(ctx->synt_feats_alphabet, value, subcat_str, sizeof(subcat_str))) + fprintf(f, "\t%s", subcat_str); + else + fprintf(f, "\t(null)"); + } + +} + +void maca_graph_parser_print_templ(FILE *f, templ *t, maca_graph_parser_ctx *ctx) +{ + int field; + + for(field=0; field < t->field_nb; field++) + if(t->length[field]) maca_graph_parser_print_templ_field(f, t, field, ctx); +} + +void maca_graph_parser_print_feature(FILE *f, maca_graph_parser_ctx *ctx, feature_t feat) +{ + templ *t = maca_graph_parser_get_templ(feat, ctx->e); + maca_graph_parser_decode_feature(feat, t); + maca_graph_parser_print_templ(f, t, ctx); +} + +void maca_graph_parser_templ_free(templ *t) +{ + if(t) free(t); +} + +templ *maca_graph_parser_templ_allocator(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8, int v9, + char t0, char t1, char t2, char t3, char t4, char t5, char t6, char t7, char t8, char t9, + int label_position, int direction_position, int hash_key_start, int hash_key_end) +{ + int start = 0; + int end; + int l; + templ *t = malloc(sizeof(templ)); + if(t == NULL){ + fprintf(stderr, "memory allocation error\n"); + exit(1); + } + t->field_nb = 10; + + start = 0; + t->length[0] = maca_graph_parser_compute_bits_number(v0); + end = start + t->length[0] - 1; + t->start[0] = start; + t->end[0] = end; + t->type[0] = t0; + + start = end + 1; + t->length[1] = maca_graph_parser_compute_bits_number(v1); + end = start + t->length[1] - 1; + t->start[1] = start; + t->end[1] = end; + t->type[1] = t1; + + start = end + 1; + t->length[2] = maca_graph_parser_compute_bits_number(v2); + end = start 
+ t->length[2] - 1; + t->start[2] = start; + t->end[2] = end; + t->type[2] = t2; + + start = end + 1; + t->length[3] = maca_graph_parser_compute_bits_number(v3); + end = start + t->length[3] - 1; + t->start[3] = start; + t->end[3] = end; + t->type[3] = t3; + + start = end + 1; + t->length[4] = maca_graph_parser_compute_bits_number(v4); + end = start + t->length[4] - 1; + t->start[4] = start; + t->end[4] = end; + t->type[4] = t4; + + start = end + 1; + t->length[5] = maca_graph_parser_compute_bits_number(v5); + end = start + t->length[5] - 1; + t->start[5] = start; + t->end[5] = end; + t->type[5] = t5; + + start = end + 1; + t->length[6] = maca_graph_parser_compute_bits_number(v6); + end = start + t->length[6] - 1; + t->start[6] = start; + t->end[6] = end; + t->type[6] = t6; + + start = end + 1; + t->length[7] = maca_graph_parser_compute_bits_number(v7); + end = start + t->length[7] - 1; + t->start[7] = start; + t->end[7] = end; + t->type[7] = t7; + + start = end + 1; + t->length[8] = maca_graph_parser_compute_bits_number(v8); + end = start + t->length[8] - 1; + t->start[8] = start; + t->end[8] = end; + t->type[8] = t8; + + start = end + 1; + t->length[9] = maca_graph_parser_compute_bits_number(v9); + end = start + t->length[9] - 1; + t->start[9] = start; + t->end[9] = end; + t->type[9] = t9; + + t->label_start = t->start[label_position]; + t->label_end = t->end[label_position]; + + t->direction_start = t->start[direction_position]; + t->direction_end = t->end[direction_position]; + + t->hash_key_start = t->start[hash_key_start]; + t->hash_key_end = t->end[hash_key_end]; + + return t; +} + +void maca_graph_parser_templ_library_free(maca_graph_parser_templ_library *tl) +{ + if(tl){ + maca_graph_parser_templ_free(tl->tfdp); + maca_graph_parser_templ_free(tl->tfdw); + maca_graph_parser_templ_free(tl->tfdwp); + maca_graph_parser_templ_free(tl->tfdpp); + maca_graph_parser_templ_free(tl->tfdww); + maca_graph_parser_templ_free(tl->tfdwpp); + 
maca_graph_parser_templ_free(tl->tfdwwp); + maca_graph_parser_templ_free(tl->tfdppp); + maca_graph_parser_templ_free(tl->tfdpppp); + maca_graph_parser_templ_free(tl->tfdwpwp); + maca_graph_parser_templ_free(tl->tdppp); + maca_graph_parser_templ_free(tl->tfddpp); + maca_graph_parser_templ_free(tl->tfddppp); + maca_graph_parser_templ_free(tl->tfddww); + maca_graph_parser_templ_free(tl->tfddwp); + maca_graph_parser_templ_free(tl->tfdsppp); + maca_graph_parser_templ_free(tl->tfdspp); + maca_graph_parser_templ_free(tl->tfdsww); + maca_graph_parser_templ_free(tl->tfdswp); + maca_graph_parser_templ_free(tl->tflpp); + maca_graph_parser_templ_free(tl->tflppp); + maca_graph_parser_templ_free(tl->tflpwp); + free(tl); + } +} + +maca_graph_parser_templ_library *maca_graph_parser_templ_library_allocator(maca_graph_parser_ctx *ctx){ + /** + * Allocate a templ library. + * + * FIXME: write a proper maca_graph_parser_templ_library_free() and use it ! + */ + + maca_graph_parser_templ_library *e = malloc(sizeof(maca_graph_parser_templ_library)); + if(e == NULL){ + fprintf(stderr, "memory allocation error\n"); + exit(1); + } + + e->s_pos = ctx->pos_nb; + /* e->s_word = maca_alphabet_size(ctx->alphabet); */ + e->s_word = ctx->words_nb; + e->s_rel = ctx->labels_nb; + e->s_synt_feat = ctx->synt_feats_nb; + + e->s_type = 255; + e->s_dir = 2; + e->s_dist = 10; + e->s_feat = 0; + e->s_child = 0; + + e->type_start = 0; + e->type_end = maca_graph_parser_compute_bits_number(e->s_type) - 1; + + e->tfdp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_pos, 0 , 0, 0, 0, 0, 0, 't', 'f', 'd', 'p', '0', '0', '0', '0', '0', '0', 1, 2, 3, 3); + e->tfdw = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_word, 0 , 0, 0, 0, 0, 0, 't', 'f', 'd', 'w', '0', '0', '0', '0', '0', '0', 1, 2, 3, 3); + e->tfdwp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_word, e->s_pos, 0, 0, 0, 0, 0, 't', 'f', 'd', 'w', 'p', '0', '0', '0', '0', '0', 1, 2, 3, 
4); + e->tfdpp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_pos, e->s_pos, 0, 0, 0, 0, 0, 't', 'f', 'd', 'p', 'p', '0', '0', '0', '0', '0', 1, 2, 3, 4); + e->tfdww = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_word, e->s_word, 0, 0, 0, 0, 0, 't', 'f', 'd', 'w', 'w', '0', '0', '0', '0', '0', 1, 2, 3, 4); + e->tfdwpp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_word, e->s_pos, e->s_pos, 0, 0, 0, 0, 't', 'f', 'd', 'w', 'p', 'p', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfdwwp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_word, e->s_word, e->s_pos, 0, 0, 0, 0, 't', 'f', 'd', 'w', 'w', 'p', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfdppp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_pos, e->s_pos, e->s_pos, 0, 0, 0, 0, 't', 'f', 'd', 'p', 'p', 'p', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfdpppp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_pos, e->s_pos, e->s_pos, e->s_pos, 0, 0, 0, 't', 'f', 'd', 'p', 'p', 'p', 'p', '0', '0', '0', 1, 2, 3, 6); + e->tfdwpwp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_word, e->s_pos, e->s_word, e->s_pos, 0, 0, 0, 't', 'f', 'd', 'w', 'p', 'w', 'p', '0', '0', '0', 1, 2, 3, 6); + e->tfddpp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dir, e->s_pos, e->s_pos, 0, 0, 0, 0, 't', 'f', 'd', 'd', 'p', 'p', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfddppp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dir, e->s_pos, e->s_pos, e->s_pos, 0, 0, 0, 't', 'f', 'd', 'd', 'p', 'p', 'p', '0', '0', '0', 1, 2, 3, 6); + e->tfddww = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dir, e->s_word, e->s_word, 0, 0, 0, 0, 't', 'f', 'd', 'd', 'w', 'w', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfddwp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dir, e->s_word, e->s_pos, 0, 0, 0, 0, 't', 'f', 'd', 
'd', 'w', 'p', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfdsppp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dist, e->s_pos, e->s_pos, e->s_pos, 0, 0, 0, 't', 'f', 'd', 's', 'p', 'p', 'p', '0', '0', '0', 1, 2, 3, 6); + e->tfdspp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dist, e->s_pos, e->s_pos, 0, 0, 0, 0, 't', 'f', 'd', 's', 'p', 'p', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfdsww = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dist, e->s_word, e->s_word, 0, 0, 0, 0, 't', 'f', 'd', 's', 'w', 'w', '0', '0', '0', '0', 1, 2, 3, 5); + e->tfdswp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_dir, e->s_dist, e->s_word, e->s_pos, 0, 0, 0, 0, 't', 'f', 'd', 's', 'w', 'p', '0', '0', '0', '0', 1, 2, 3, 5); + + e->tdppp = maca_graph_parser_templ_allocator(e->s_type, e->s_dir, e->s_pos, e->s_pos, e->s_pos, 0, 0, 0, 0, 0, 't', 'd', 'p', 'p', 'p', '0', '0', '0', '0', '0', -1, 1, 2, 4); + + /* subcat features */ + e->tflpp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_synt_feat, e->s_pos, e->s_pos, 0, 0, 0, 0, 0, 't', 'f', 'l', 'p', 'p', '0', '0', '0', '0', '0', 1, -1, 2, 4); + e->tflppp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_synt_feat, e->s_pos, e->s_pos, e->s_pos, 0, 0, 0, 0, 't', 'f', 'l', 'p', 'p', 'p', '0', '0', '0', '0', 1, -1, 2, 5); + e->tflpwp = maca_graph_parser_templ_allocator(e->s_type, e->s_rel, e->s_synt_feat, e->s_pos, e->s_word, e->s_pos, 0, 0, 0, 0, 't', 'f', 'l', 'p', 'w', 'p', '0', '0', '0', '0', 1, -1, 2, 5); + + + /* basic features */ + e->type2templ[_f39] = e->tdppp; + + /* first order features with form */ + e->type2templ[_f1] = e->tfdwp; /* (hits, X, VBZ) -SBJ-> (X, X, X) */ + e->type2templ[_f2] = e->tfdw; /* (hits, X, X) -SBJ-> (X, X, X) */ + e->type2templ[_f3] = e->tfdp; /* (X, X, VBZ) -SBJ-> (X, X, X) */ + e->type2templ[_f4] = e->tfdwp; /* (X, X, X) -SBJ-> (boy, X, NN) */ + e->type2templ[_f5] = e->tfdw; /* (X, X, X) -SBJ-> 
(boy, X, X) */ + e->type2templ[_f6] = e->tfdp; /* (X, X, X) -SBJ-> (X, X, NN) */ + e->type2templ[_f7] = e->tfdwpwp; /* (hits, X, VBZ) -SBJ-> (boy, X, NN) */ + e->type2templ[_f8] = e->tfdwpp; /* (X, X, VBZ) -SBJ-> (boy, X, NN) */ + e->type2templ[_f9] = e->tfdwwp; /* (hits, X, X) -SBJ-> (boy, X, NN) */ + e->type2templ[_f10] = e->tfdwpp; /* (hits, X, VBZ) -SBJ-> (X, X, NN) */ + e->type2templ[_f11] = e->tfdwwp; /* (hits, X, VBZ) -SBJ-> (boy, X, XX) */ + e->type2templ[_f12] = e->tfdww; /* (hits, X, X) -SBJ-> (boy, X, X) */ + e->type2templ[_f13] = e->tfdpp; /* (X, X, VBZ) -SBJ-> (X, X, NN) */ + + /* first order features with lemma */ + e->type2templ[_f1l] = e->tfdwp; /* (X, hit, VBZ) -SBJ-> (X, X, X) */ + e->type2templ[_f2l] = e->tfdw; /* (X, hit, X) -SBJ-> (X, X, X) */ + e->type2templ[_f3l] = e->tfdp; /* (X, X, VBZ) -SBJ-> (X, X, X) */ + e->type2templ[_f4l] = e->tfdwp; /* (X, X, X) -SBJ-> (X, boy, NN) */ + e->type2templ[_f5l] = e->tfdw; /* (X, X, X) -SBJ-> (X, boy, XX) */ + e->type2templ[_f6l] = e->tfdp; /* (X, X, X) -SBJ-> (X, X, NN) */ + e->type2templ[_f7l] = e->tfdwpwp;/* (X, hit, VBZ) -SBJ-> (X, boy, NN) */ + e->type2templ[_f8l] = e->tfdwpp; /* (X, X, VBZ) -SBJ-> (X, boy, NN) */ + e->type2templ[_f9l] = e->tfdwwp; /* (X, hit, X) -SBJ-> (X, boy, NN) */ + e->type2templ[_f10l] = e->tfdwpp; /* (X, hit, VBZ) -SBJ-> (X, X, NN) */ + e->type2templ[_f11l] = e->tfdwwp; /* (X, hit, VBZ) -SBJ-> (X, boy, XX) */ + e->type2templ[_f12l] = e->tfdww; /* (X, hit, X) -SBJ-> (X, boy, X) */ + e->type2templ[_f13l] = e->tfdpp; /* (X, X, VBZ) -SBJ-> (X, X, NN) */ + + /* linear features */ + e->type2templ[_f14] = e->tfdppp; + e->type2templ[_f15] = e->tfdppp; + e->type2templ[_f16] = e->tfdppp; + e->type2templ[_f17] = e->tfdpppp; + e->type2templ[_f18] = e->tfdpppp; + e->type2templ[_f19] = e->tfdpppp; + e->type2templ[_f20] = e->tfdpppp; + + /* grandchildren features with pos only */ + e->type2templ[_f21] = e->tfddppp;/* (X, X, NN) -NMOD-> (X, X, TO) -X-> (X, X, NN) */ + e->type2templ[_f22] = 
e->tfddpp; /* (X, X, NN) -NMOD-> (X, X, X ) -X-> (X, X, NN) */ + e->type2templ[_f23] = e->tfddpp; /* (X, X, X ) -NMOD-> (X, X, TO) -X-> (X, X, NN) */ + + /* grandchildren features with pos and forms */ + e->type2templ[_f24] = e->tfddww; /* (key, X, X) -NMOD-> (X, X, X ) -X-> (heaven, X, X) */ + e->type2templ[_f25] = e->tfddww; /* (X, X, X) -NMOD-> (to, X, X ) -X-> (heaven, X, X) */ + e->type2templ[_f26] = e->tfddwp; /* (X, X, NN) -NMOD-> (X, X, X ) -X-> (heaven, X, X) */ + e->type2templ[_f27] = e->tfddwp; /* (X, X, X) -NMOD-> (X, X, TO) -X-> (heaven, X, X) */ + e->type2templ[_f28] = e->tfddwp; /* (key, X, X) -NMOD-> (X, X, X ) -X-> (X, X, NN) */ + e->type2templ[_f29] = e->tfddwp; /* (X, X, X) -NMOD-> (to, X, X ) -X-> (X, X, NN) */ + + /* grandchildren features with pos and lemmas */ + e->type2templ[_f24l] = e->tfddww; /* (X, key, X) -NMOD-> (X, X, X) -X-> (X, heaven, X) */ + e->type2templ[_f25l] = e->tfddww; /* (X, X, X) -NMOD-> (X, to, X) -X-> (X, heaven, X) */ + e->type2templ[_f26l] = e->tfddwp; /* (X, X, NN) -NMOD-> (X, X, X) -X-> (X, heaven, X) */ + e->type2templ[_f27l] = e->tfddwp; /* (X, X, X) -NMOD-> (X, X, TO) -X-> (X, heaven, X) */ + e->type2templ[_f28l] = e->tfddwp; /* (X, key, X) -NMOD-> (X, X, X) -X-> (X, X, NN)*/ + e->type2templ[_f29l] = e->tfddwp; /* (X, X, X) -NMOD-> (X, to, X) -X-> (X, X, NN)*/ + + /* grandchildren features with neighboring pos */ + e->type2templ[_f42] = e->tfdppp; + e->type2templ[_f43] = e->tfdppp; + e->type2templ[_f44] = e->tfdppp; + e->type2templ[_f45] = e->tfdppp; + e->type2templ[_f46] = e->tfdpppp; + e->type2templ[_f47] = e->tfdpppp; + e->type2templ[_f48] = e->tfdpppp; + e->type2templ[_f49] = e->tfdpppp; + e->type2templ[_f50] = e->tfdppp; + e->type2templ[_f51] = e->tfdppp; + e->type2templ[_f52] = e->tfdppp; + e->type2templ[_f53] = e->tfdppp; + e->type2templ[_f54] = e->tfdpppp; + e->type2templ[_f55] = e->tfdpppp; + e->type2templ[_f56] = e->tfdpppp; + e->type2templ[_f57] = e->tfdpppp; + + /* sibling features with pos only */ + 
e->type2templ[_f30] = e->tfdsppp; /* (X, X, NNP) <- SBJ - (X, X, VB) - X -> (X, X, NN) */ + e->type2templ[_f31] = e->tfdspp; /* (X, X, X ) <- SBJ - (X, X, VB) - X -> (X, X, NN) */ + e->type2templ[_f32] = e->tfdspp; /* (X, X, NNP) <- SBJ - (X, X, X ) - X -> (X, X, NN) */ + + /* sibling features with pos and forms */ + e->type2templ[_f33] = e->tfdsww; /* (X, X, X ) <- SBJ - (ate, X, X ) - X -> (apple, X, X) */ + e->type2templ[_f34] = e->tfdsww; /* (John, X, X ) <- SBJ - (X, X, X ) - X -> (apple, X, X) */ + e->type2templ[_f35] = e->tfdswp; /* (X, X, X ) <- SBJ - (X, X, VB) - X -> (apple, X, X) */ + e->type2templ[_f36] = e->tfdswp; /* (X, X, NNP) <- SBJ - (X, X, X ) - X -> (apple, X, X) */ + e->type2templ[_f37] = e->tfdswp; /* (X, X, X ) <- SBJ - (ate, X, X ) - X -> (X, X, P) */ + e->type2templ[_f38] = e->tfdswp; /* (John, X, X ) <- SBJ - (X, X, X ) - X -> (X, X, P) */ + + /* sibling features with pos and lemmas */ + e->type2templ[_f33l] = e->tfdsww; /* (X, X, X ) <- SBJ - (X, eat, X ) - X -> (X, apple, X ) */ + e->type2templ[_f34l] = e->tfdsww; /* (X, John, X ) <- SBJ - (X, X, X ) - X -> (X, apple, X ) */ + e->type2templ[_f35l] = e->tfdswp; /* (X, X, X ) <- SBJ - (X, X, VB) - X -> (X, apple, X ) */ + e->type2templ[_f36l] = e->tfdswp; /* (X, X, NNP) <- SBJ - (X, X, X ) - X -> (X, apple, X ) */ + e->type2templ[_f37l] = e->tfdswp; /* (X, X, X ) <- SBJ - (X, eat, X ) - X -> (X, X, NN) */ + e->type2templ[_f38l] = e->tfdswp; /* (X, John, X ) <- SBJ - (X, X, X ) - X -> (X, X, NN) */ + + /* sibling features with neighboring pos */ + e->type2templ[_f58] = e->tfdppp; /* (X, X, X ) <-L- (X, govP, X ) -X-> (X, sblP, sblPp1) */ + e->type2templ[_f59] = e->tfdppp; /* (X, X, X ) <-L- (X, govP, X ) -X-> (sblPm1, sblP, X ) */ + e->type2templ[_f60] = e->tfdppp; /* (X, X, X ) <-L- (X, govP, govPp1) -X-> (X, sblP, X ) */ + e->type2templ[_f61] = e->tfdppp; /* (X, X, X ) <-L- (govPm1, govP, X ) -X-> (X, sblP, X ) */ + e->type2templ[_f62] = e->tfdpppp; /* (X, X, X ) <-L- (govPm1, govP, X ) 
-X-> (X, sblP, sblPp1) */ + e->type2templ[_f63] = e->tfdpppp; /* (X, X, X ) <-L- (govPm1, govP, X ) -X-> (sblPm1, sblP, X ) */ + e->type2templ[_f64] = e->tfdpppp; /* (X, X, X ) <-L- (X, govP, govPp1) -X-> (X, sblP, sblPp1) */ + e->type2templ[_f65] = e->tfdpppp; /* (X, X, X ) <-L- (X, govP, govPp1) -X-> (sblPm1, sblP, X ) */ + e->type2templ[_f66] = e->tfdppp; /* (X, depP, X ) <-L- (X, X, X ) -X-> (X, sblP, sblPp1) */ + e->type2templ[_f67] = e->tfdppp; /* (X, depP, X ) <-L- (X, X, X ) -X-> (sblPm1, sblP, X ) */ + e->type2templ[_f68] = e->tfdppp; /* (X, depP, depPp1) <-L- (X, X, X ) -X-> (X, sblP, X ) */ + e->type2templ[_f69] = e->tfdppp; /* (depPm1, depP, X ) <-L- (X, X, X ) -X-> (X, sblP, X ) */ + e->type2templ[_f70] = e->tfdpppp; /* (depPm1, depP, X ) <-L- (X, X, X ) -X-> (X, sblP, sblPp1) */ + e->type2templ[_f71] = e->tfdpppp; /* (depPm1, depP, X ) <-L- (X, X, X ) -X-> (sblPm1, sblP, X ) */ + e->type2templ[_f72] = e->tfdpppp; /* (X, depP, depPp1) <-L- (X, X, X ) -X-> (X, sblP, sblPp1) */ + e->type2templ[_f73] = e->tfdpppp; /* (X, depP, depPp1) <-L- (X, X, X ) -X-> (sblPm1, sblP, X ) */ + + /* subcat templates */ + + e->type2templ[_f77] = e->tflpp; /* (X, X, VBZ, SCAT) -SBJ-> (X, X, N) */ + e->type2templ[_f78] = e->tflppp; /* (X, X, VBZ, SCAT) -AOBJ-> (X, X, PRE) -COMP-> (X, X, N) */ + e->type2templ[_f79] = e->tflpwp; /* (X, X, VBZ, SCAT) -AOBJ-> (X, X, PRE) -COMP-> (X, X, N) */ + + return e; +} + +feature_t maca_graph_parser_encode_feature_2(templ *d) { + feature_t l = (feature_t) d->value[0]; + int shift = d->length[0]; + + /* if (d->value[0] < 0 || d->value[1] < 0) return -1; */ + l |= (feature_t)d->value[1] << shift; + shift +=d->length[1]; + /* d->shift=shift; */ + + return l; +} + +feature_t maca_graph_parser_encode_feature_3(templ *d) { + feature_t l = (feature_t) d->value[0]; + int shift = d->length[0]; + + /* if (d->value[0] < 0 || d->value[1] < 0 || d->value[2] < 0) return -1; */ + + l |= (feature_t)d->value[1] << shift; + shift +=d->length[1]; + l |= 
(feature_t)d->value[2] << shift; + /* d->shift=shift + d->length[2]; */ + + return l; +} + + +feature_t maca_graph_parser_encode_feature_4(templ *d) { + feature_t l = (feature_t) d->value[0]; + int shift =d->length[0]; + + /* if (d->value[0] < 0 || d->value[1] < 0 || d->value[2] < 0 || d->value[3] < 0) return -1; */ + + l |= (feature_t)d->value[1] << shift; + shift += d->length[1]; + l |= (feature_t)d->value[2] << shift; + shift += d->length[2]; + l |= (feature_t)d->value[3] << shift; + /* d->shift = shift + d->length[3]; */ + + return l; +} + + +feature_t maca_graph_parser_encode_feature_5(templ *d) { + feature_t l = (feature_t) d->value[0]; + int shift =d->length[0]; + + /* if (d->value[0] < 0 || d->value[1] < 0 || d->value[2] < 0 || d->value[3] < 0 || d->value[4] < 0) return -1; */ + + l |= (feature_t)d->value[1] << shift; + shift +=d->length[1]; + l |= (feature_t)d->value[2] << shift; + shift +=d->length[2]; + l |= (feature_t)d->value[3] << shift; + shift +=d->length[3]; + l |= (feature_t)d->value[4] << shift; + /* d->shift = shift + d->length[4]; */ + + return l; +} + +feature_t maca_graph_parser_encode_feature_6(templ *d) { + feature_t l = (feature_t) d->value[0]; + int shift =d->length[0]; + + /* if (d->value[0] < 0 || d->value[1] < 0 || d->value[2] < 0 || d->value[3] < 0 || d->value[4] < 0 || d->value[5] < 0) return -1; */ + + l |= (feature_t)d->value[1] << shift; + shift +=d->length[1]; + l |= (feature_t)d->value[2] << shift; + shift +=d->length[2]; + l |= (feature_t)d->value[3] << shift; + shift +=d->length[3]; + l |= (feature_t)d->value[4] << shift; + shift +=d->length[4]; + l |= (feature_t)d->value[5] << shift; + /* d->shift =shift+d->length[5]; */ + + return l; +} + +feature_t maca_graph_parser_encode_feature_7(templ *d) { + feature_t l = (feature_t) d->value[0]; + int shift = d->length[0]; + + /* if (d->value[0] < 0 || d->value[1] < 0 || d->value[2] < 0 || d->value[3] < 0 || d->value[4] < 0 || d->value[5] < 0 || d->value[6] < 0) return -1; */ + + l |= 
(feature_t)d->value[1] << shift; + shift += d->length[1]; + l |= (feature_t)d->value[2] << shift; + shift += d->length[2]; + l |= (feature_t)d->value[3] << shift; + shift += d->length[3]; + l |= (feature_t)d->value[4] << shift; + shift += d->length[4]; + l |= (feature_t)d->value[5] << shift; + shift += d->length[5]; + l |= (feature_t)d->value[6] << shift; + /* d->shift = shift + d->length[6]; */ + + return l; +} + + +feature_t maca_graph_parser_encode_feature_8(templ *d) { + feature_t l = (feature_t) d->value[0]; + int shift =d->length[0]; + + /* if (d->value[0] < 0 || d->value[1] < 0 || d->value[2] < 0 || d->value[3] < 0 || d->value[4] < 0 || d->value[5] < 0 || d->value[6] < 0 || d->value[7] < 0) return -1; */ + + l |= (feature_t)d->value[1] << shift; + shift +=d->length[1]; + l |= (feature_t)d->value[2] << shift; + shift +=d->length[2]; + l |= (feature_t)d->value[3] << shift; + shift +=d->length[3]; + l |= (feature_t)d->value[4] << shift; + shift +=d->length[4]; + l |= (feature_t)d->value[5] << shift; + shift +=d->length[5]; + l |= (feature_t)d->value[6] << shift; + shift +=d->length[6]; + l |= (feature_t)d->value[7] << shift; + /* d->shift =shift+d->length[7]; */ + + return l; +} + + +feat_vector *basic(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, feat_vector *fv){ + /** + * Compute the vector of basic features for a dependency. + * + * Fills and returns fv if not NULL, a new feat_vector otherwise. + */ + + + if(fv == NULL){ + fv = allocate_feat_vector(BasicFeatNb); + } else { + /* reset fv */ + fv->elt_nb = 0; + } + + int dir = (gov < dep) ? ra : la; + int end = (gov >= dep ? gov : dep); + int begin = (gov >= dep ? 
dep : gov) + 1; + maca_graph_parser_templ_library *e = ctx->e; + + e->tdppp->value[0] = _f39; + e->tdppp->value[1] = dir; + e->tdppp->value[2] = s->pos[gov]; + e->tdppp->value[3] = s->pos[dep]; + + int i; + for (i = begin; i < end; i++) { + e->tdppp->value[4] = s->pos[i]; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tdppp); + } + return fv; +} + +feat_vector *first(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, int label, feat_vector *fv){ + /** + * Compute the vector of first order features for a dependency. + * + * Fills and returns fv if not NULL, a new feat_vector otherwise. + */ + + if(fv == NULL){ + fv = allocate_feat_vector(FirstOrderFeatNb); + } else { + /* reset fv */ + fv->elt_nb = 0; + } + + maca_graph_parser_templ_library *e = ctx->e; + + int dir = (gov < dep) ? ra : la; + /* gov and dep */ + int govF = s->words[gov]; + int depF = s->words[dep]; + int govL = s->lemmas[gov]; + int depL = s->lemmas[dep]; + int govP = s->pos[gov]; + int depP = s->pos[dep]; + /* gov+-1, dep+-1 */ + int govm1P = (gov == 0) ? ctx->pos_start : s->pos[gov-1]; + int depm1P = (dep == 0) ? ctx->pos_start : s->pos[dep-1]; + int govp1P = (gov == (s->l-1)) ? ctx->pos_end : s->pos[gov+1]; + int depp1P = (dep == (s->l-1)) ? 
ctx->pos_end : s->pos[dep+1]; + + int subcat_feats_nb = s->synt_feats_nb[gov]; + int *subcat_feats_array = s->synt_feats_array[gov]; + int i,j; + + /* fprintf(stderr, "extract first order features : gov : (%d,%s,%s) dep :(%d,%s,%s)\n", gov, */ + + /* (hits, hit, VBZ) -SBJ-> (boy, boy, NN) */ + if(ctx->use_full_forms){ + /* (hits, X, VBZ) -SBJ-> (X, X, X) */ + e->tfdwp->value[0] = _f1; + e->tfdwp->value[1] = label; + e->tfdwp->value[2] = dir; + e->tfdwp->value[3] = govF; + e->tfdwp->value[4] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tfdwp); + + /* MATE: 2 */ + /* (hits, X, X) -SBJ-> (X, X, X) */ + e->tfdw->value[0] = _f2; + e->tfdw->value[1] = label; + e->tfdw->value[2] = dir; + e->tfdw->value[3] = govF; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdw); + + /* MATE: 7 */ + /* (X, X, VBZ) -SBJ-> (X, X, X) */ + e->tfdp->value[0] = _f3; + e->tfdp->value[1] = label; + e->tfdp->value[2] = dir; + e->tfdp->value[3] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdp); + + /* (X, X, X) -SBJ-> (boy, X, NN) */ + e->tfdwp->value[0] = _f4; + // e->tfdwp->value[1] = label; + // e->tfdwp->value[2] = dir; + e->tfdwp->value[3] = depF; + e->tfdwp->value[4] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tfdwp); + + /* different from the 2010 paper ; guess it is a typo in the paper */ + /* MATE: 4 */ + /* (X, X, X) -SBJ-> (boy, X, X) */ + e->tfdw->value[0] = _f5; + // e->tfdw->value[1] = label; + // e->tfdw->value[2] = dir; + e->tfdw->value[3] = depF; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdw); + + /* MATE: 6 */ + /* (X, X, X) -SBJ-> (X, X, NN) */ + e->tfdp->value[0] = _f6; + // e->tfdp->value[1] = label; + // e->tfdp->value[2] = dir; + e->tfdp->value[3] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdp); + + /* (hits, X, VBZ) -SBJ-> (boy, X, NN) */ + e->tfdwpwp->value[0] = _f7; + e->tfdwpwp->value[1] = 
label; + e->tfdwpwp->value[2] = dir; + e->tfdwpwp->value[3] = govF; + e->tfdwpwp->value[4] = govP; + e->tfdwpwp->value[5] = depF; + e->tfdwpwp->value[6] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdwpwp); + + /* (X, X, VBZ) -SBJ-> (boy, X, NN) */ + e->tfdwpp->value[0] = _f8; + e->tfdwpp->value[1] = label; + e->tfdwpp->value[2] = dir; + e->tfdwpp->value[3] = depF; + e->tfdwpp->value[4] = depP; + e->tfdwpp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwpp); + + /* (hits, X, X) -SBJ-> (boy, X, NN) */ + e->tfdwwp->value[0] = _f9; + e->tfdwwp->value[1] = label; + e->tfdwwp->value[2] = dir; + e->tfdwwp->value[3] = govF; + e->tfdwwp->value[4] = depF; + e->tfdwwp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwwp); + + /* conform with paper */ + /* (hits, X, VBZ) -SBJ-> (boy, X, X) */ + e->tfdwpp->value[0] = _f10; + // e->tfdwpp->value[1] = label; + // e->tfdwpp->value[2] = dir; + e->tfdwpp->value[3] = govF; + e->tfdwpp->value[4] = govP; + e->tfdwpp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwpp); + + /* (hits, X, VBZ) -SBJ-> (X, X, NN) */ + e->tfdwwp->value[0] = _f11; + // e->tfdwwp->value[1] = label; + // e->tfdwwp->value[2] = dir; + e->tfdwwp->value[3] = govF; + e->tfdwwp->value[4] = depF; + e->tfdwwp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwwp); + + /* MATE: 5 */ + /* (hits, X, X) -SBJ-> (boy, X, X) */ + e->tfdww->value[0] = _f12; + e->tfdww->value[1] = label; + e->tfdww->value[2] = dir; + e->tfdww->value[3] = govF; + e->tfdww->value[4] = depF; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tfdww); + + /* MATE: 8 */ + /* (X, X, VBZ) -SBJ-> (X, X, NN) */ + e->tfdpp->value[0] = _f13; + e->tfdpp->value[1] = label; + e->tfdpp->value[2] = dir; + e->tfdpp->value[3] = govP; + e->tfdpp->value[4] = depP; + fv->array[(fv->elt_nb)++] = 
maca_graph_parser_encode_feature_5(e->tfdpp); + } + + // lemmas + if(ctx->use_lemmas){ + /* _f1l = _f77 */ + /* (X, hit, VBZ) -SBJ-> (X, X, X) */ + e->tfdwp->value[0] = _f1l; + // e->tfdwp->value[1] = label; + // e->tfdwp->value[2] = dir; + e->tfdwp->value[3] = govL; + e->tfdwp->value[4] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tfdwp); + + /* _f2l = _f78 */ + /* (X, hit, X) -SBJ-> (X, X, X) */ + e->tfdw->value[0] = _f2l; + e->tfdw->value[1] = label; + e->tfdw->value[2] = dir; + e->tfdw->value[3] = govL; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdw); + + /* _f3l = _f79 */ + /* (X, X, VBZ) -SBJ-> (X, X, X) */ + e->tfdp->value[0] = _f3l; + e->tfdp->value[1] = label; + e->tfdp->value[2] = dir; + e->tfdp->value[3] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdp); + + /* _f4l = _f80 */ + /* (X, X, X) -SBJ-> (X, boy, NN) */ + e->tfdwp->value[0] = _f4l; + // e->tfdwp->value[1] = label; + // e->tfdwp->value[2] = dir; + e->tfdwp->value[3] = depL; + e->tfdwp->value[4] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tfdwp); + + /* _f5l = _f81 */ + /* (X, X,X) -SBJ-> (X, boy, XX) */ + e->tfdw->value[0] = _f5l; + // e->tfdw->value[1] = label; + // e->tfdw->value[2] = dir; + e->tfdw->value[3] = depL; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdw); + + /* _f6l = _f82 */ + /* (X, X, X) -SBJ-> (X, X, NN) */ + e->tfdp->value[0] = _f6l; + // e->tfdp->value[1] = label; + // e->tfdp->value[2] = dir; + e->tfdp->value[3] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_4(e->tfdp); + + /* _f7l = _f83 */ + /* (X, hit, VBZ) -SBJ-> (X, boy, NN) */ + e->tfdwpwp->value[0] = _f7l; + e->tfdwpwp->value[1] = label; + e->tfdwpwp->value[2] = dir; + e->tfdwpwp->value[3] = govL; + e->tfdwpwp->value[4] = govP; + e->tfdwpwp->value[5] = depL; + e->tfdwpwp->value[6] = depP; + fv->array[(fv->elt_nb)++] = 
maca_graph_parser_encode_feature_7(e->tfdwpwp); + + /* _f8l = _f84 */ + /* (X, X, VBZ) -SBJ-> (X, boy, NN) */ + e->tfdwpp->value[0] = _f8l; + e->tfdwpp->value[1] = label; + e->tfdwpp->value[2] = dir; + e->tfdwpp->value[3] = depL; + e->tfdwpp->value[4] = depP; + e->tfdwpp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwpp); + + /* _f9l = _f85 */ + /* (X, hit, X) -SBJ-> (X, boy, NN) */ + e->tfdwwp->value[0] = _f9l; + e->tfdwwp->value[1] = label; + e->tfdwwp->value[2] = dir; + e->tfdwwp->value[3] = govL; + e->tfdwwp->value[4] = depL; + e->tfdwwp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwwp); + + /* _f10l = _f86 */ + /* (X, hit, VBZ) -SBJ-> (X, X, NN) */ + e->tfdwpp->value[0] = _f10l; + // e->tfdwpp->value[1] = label; + // e->tfdwpp->value[2] = dir; + e->tfdwpp->value[3] = govL; + e->tfdwpp->value[4] = govP; + e->tfdwpp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwpp); + + /* _f11l = _f87 */ + /* (X, hit, VBZ) -SBJ-> (X, boy, XX) */ + e->tfdwwp->value[0] = _f11l; + // e->tfdwwp->value[1] = label; + // e->tfdwwp->value[2] = dir; + e->tfdwwp->value[3] = govL; + e->tfdwwp->value[4] = depL; + e->tfdwwp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdwwp); + + /* _f12l = _f88 */ + /* (X, hit, X) -SBJ-> (X, boy, X) */ + e->tfdww->value[0] = _f12l; + e->tfdww->value[1] = label; + e->tfdww->value[2] = dir; + e->tfdww->value[3] = govL; + e->tfdww->value[4] = depL; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tfdww); + + /* _f13l = _f89 */ + /* (X, X, VBZ) -SBJ-> (X, X, NN) */ + e->tfdpp->value[0] = _f13l; + e->tfdpp->value[1] = label; + e->tfdpp->value[2] = dir; + e->tfdpp->value[3] = govP; + e->tfdpp->value[4] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tfdpp); + } + + /* MATE: special values for cases like (govm1 == dep) */ + /* linear features */ + 
if(1){ + e->tfdppp->value[0] = _f14; + e->tfdppp->value[1] = label; + e->tfdppp->value[2] = dir; + e->tfdppp->value[3] = govP; + e->tfdppp->value[4] = govp1P; + e->tfdppp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + e->tfdppp->value[0] = _f15; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + // e->tfdppp->value[3] = govP; + e->tfdppp->value[4] = depm1P; + // e->tfdppp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + e->tfdppp->value[0] = _f16; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + // e->tfdppp->value[3] = govP; + e->tfdppp->value[4] = depP; + e->tfdppp->value[5] = depp1P; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* MATE: 10 */ + e->tfdpppp->value[0] = _f17; + e->tfdpppp->value[1] = label; + e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = govP; + e->tfdpppp->value[4] = govp1P; + e->tfdpppp->value[5] = depm1P; + e->tfdpppp->value[6] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* MATE: 11 */ + e->tfdpppp->value[0] = _f18; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = govm1P; + e->tfdpppp->value[4] = govP; + // e->tfdpppp->value[5] = depm1P; + // e->tfdpppp->value[6] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* MATE: 9 */ + e->tfdpppp->value[0] = _f19; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = govP; + e->tfdpppp->value[4] = govp1P; + e->tfdpppp->value[5] = depP; + e->tfdpppp->value[6] = depp1P; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* MATE: 12 */ + e->tfdpppp->value[0] = _f20; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = govm1P; + e->tfdpppp->value[4] = govP; + // e->tfdpppp->value[5] = depP; + 
// e->tfdpppp->value[6] = depp1P; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + } + + /* subcat features */ + if(ctx->subcat_features){ + for(i=0; i < subcat_feats_nb; i++){ + e->tflpp->value[0] = _f77; + e->tflpp->value[1] = label; + e->tflpp->value[2] = subcat_feats_array[i]; + e->tflpp->value[3] = govP; + e->tflpp->value[4] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_5(e->tflpp); + } + } + + + + + + /* _f41: morphological features (fields FEAT and PFEAT of the CONLL format) */ + /* if (feats == null) + return; + + if (_f41 > 0) { + short[] featsP = feats[gov], featsD = feats[dep]; + dlf->value[0] = _f41; + dlf->value[1] = label; + dlf->value[2] = govP; + dlf->value[3] = depP; + extractFeat(f, dir, featsP, featsD); + } + */ + + return fv; +} + + +feat_vector *grandchildren(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, int gdep, int label, feat_vector *fv){ + /** + * + */ + + if(fv == NULL){ + fv = allocate_feat_vector(GrandchildrenFeatNb); + } else { + /* reset */ + fv->elt_nb = 0; + } + + maca_graph_parser_templ_library *e = ctx->e; + + int dir = (gov < dep) ? ra : la; + /* gov and dep */ + int govF = s->words[gov]; + int depF = s->words[dep]; + int govL = s->lemmas[gov]; + int depL = s->lemmas[dep]; + int govP = s->pos[gov]; + int depP = s->pos[dep]; + /* gov and dep +- 1 */ + int govPm1 = (gov == 0) ? ctx->pos_start : s->pos[gov - 1]; // parent-pos-minus1 + int depPm1 = (dep == 0) ? ctx->pos_start : s->pos[dep - 1]; // child-pos-minus1 + int govPp1 = (gov == s->l - 1) ? ctx->pos_end : s->pos[gov + 1]; + int depPp1 = (dep == s->l - 1) ? ctx->pos_end : s->pos[dep + 1]; + /* gdep == -1 in case the grandchild does not exist */ + int dir_gdep = (dep < gdep) ? 
ra : la; + int gdepP, gdepF, gdepL, gdepPm1, gdepPp1; + + int subcat_feats_nb = s->synt_feats_nb[gov]; + int *subcat_feats_array = s->synt_feats_array[gov]; + int i,j; + + + if(gdep == -1){ + gdepP = ctx->pos_start; + gdepF = ctx->w_start; + gdepL = ctx->w_start; + gdepPm1 = ctx->pos_start; + gdepPp1 = ctx->pos_end; + } else { + gdepP = s->pos[gdep]; + gdepF = s->words[gdep]; + gdepL = s->lemmas[gdep]; + gdepPm1 = (gdep == 0) ? ctx->pos_start : s->pos[gdep - 1]; + gdepPp1 = (gdep == s->l - 1) ? ctx->pos_end : s->pos[gdep + 1]; + } + + /* (key, key, NN) -NMOD-> (to, to, TO) -PMOD-> (heaven, heaven, NN) */ + + if(1){ + /* (X, X, NN) -NMOD-> (X, X, TO) -X-> (X, X, NN) */ + e->tfddppp->value[0] = _f21; + e->tfddppp->value[1] = label; + e->tfddppp->value[2] = dir; + e->tfddppp->value[3] = dir_gdep; + e->tfddppp->value[4] = govP; + e->tfddppp->value[5] = depP; + e->tfddppp->value[6] = gdepP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfddppp); + + /* (X, X, NN) -NMOD-> (X, X, X) -X-> (X, X, NN) */ + e->tfddpp->value[0] = _f22; + e->tfddpp->value[1] = label; + e->tfddpp->value[2] = dir; + e->tfddpp->value[3] = dir_gdep; + e->tfddpp->value[4] = govP; + e->tfddpp->value[5] = gdepP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddpp); + + /* (X, X, X) -NMOD-> (X, X, TO) -X-> (X, X, NN) */ + e->tfddpp->value[0] = _f23; + // e->tfddpp->value[1] = label; + // e->tfddpp->value[2] = dir; + // e->tfddpp->value[3] = dir_gdep; + e->tfddpp->value[4] = depP; + e->tfddpp->value[5] = gdepP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddpp); + } + + if(ctx->use_full_forms){ + /* (key, X, X) -NMOD-> (X, X, X) -X-> (heaven, X, X) */ + e->tfddww->value[0] = _f24; + e->tfddww->value[1] = label; + e->tfddww->value[2] = dir; + e->tfddww->value[3] = dir_gdep; + e->tfddww->value[4] = govF; + e->tfddww->value[5] = gdepF; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddww); + + /* (X, X, X) -NMOD-> 
(to, X, X) -X-> (heaven, X, X) */ + e->tfddww->value[0] = _f25; + // e->tfddww->value[1] = label; + // e->tfddww->value[2] = dir; + // e->tfddww->value[3] = dir_gdep; + e->tfddww->value[4] = depF; + // e->tfddww->value[5] = gdepF; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddww); + + /* (X, X, NN) -NMOD-> (X, X, X) -X-> (heaven, X, X) */ + e->tfddwp->value[0] = _f26; + e->tfddwp->value[1] = label; + e->tfddwp->value[2] = dir; + e->tfddwp->value[3] = dir_gdep; + e->tfddwp->value[4] = gdepF; + e->tfddwp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + + /* (X, X, X) -NMOD-> (X, X, TO) -X-> (heaven, X, X) */ + e->tfddwp->value[0] = _f27; + // e->tfddwp->value[1] = label; + // e->tfddwp->value[2] = dir; + // e->tfddwp->value[3] = dir_gdep; + // e->tfddwp->value[4] = gdepF; + e->tfddwp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + + /* (key, X, X) -NMOD-> (X, X, X) -X-> (X, X, NN) */ + e->tfddwp->value[0] = _f28; + // e->tfddwp->value[1] = label; + // e->tfddwp->value[2] = dir; + // e->tfddwp->value[3] = dir_gdep; + e->tfddwp->value[4] = govF; + e->tfddwp->value[5] = gdepP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + + /* (X, X, X) -NMOD-> (to, X, X) -X-> (X, X, NN) */ + e->tfddwp->value[0] = _f29; + // e->tfddwp->value[1] = label; + // e->tfddwp->value[2] = dir; + // e->tfddwp->value[3] = dir_gdep; + e->tfddwp->value[4] = depF; + // e->tfddwp->value[5] = gdepP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + } + + if(ctx->use_lemmas){ + /* _f23l = _f92 */ + /* feature in paper not in macaon */ + + /* _f24l = _f91 */ + /* (X, key, X) -NMOD-> (X, X, X) -X-> (X, heaven, X) */ + e->tfddww->value[0] = _f24l; + // e->tfddww->value[1] = label; + // e->tfddww->value[2] = dir; + // e->tfddww->value[3] = dir_gdep; + e->tfddww->value[4] = govL; + e->tfddww->value[5] = gdepL; + 
fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddww); + + /* _f25l = ??? */ + /* feature in macaon not in paper */ + /* (X, X, X) -NMOD-> (X, to, X) -X-> (X, heaven, X) */ + e->tfddww->value[0] = _f25l; + // e->tfddww->value[1] = label; + // e->tfddww->value[2] = dir; + // e->tfddww->value[3] = dir_gdep; + e->tfddww->value[4] = depL; + e->tfddww->value[5] = gdepL; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddww); + + /* _f26l = _f93 */ + /* (X, X, NN) -NMOD-> (X, X, X) -X-> (X, heaven, X) */ + e->tfddwp->value[0] = _f26l; + // e->tfddwp->value[1] = label; + // e->tfddwp->value[2] = dir; + // e->tfddwp->value[3] = dir_gdep; + e->tfddwp->value[4] = gdepL; + e->tfddwp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + + /* _f27l = _f94 */ + /* (X, X, X) -NMOD-> (X, X, TO) -X-> (X, heaven, X) */ + e->tfddwp->value[0] = _f27l; + // e->tfddwp->value[1] = label; + // e->tfddwp->value[2] = dir; + // e->tfddwp->value[3] = dir_gdep; + // e->tfddwp->value[4] = gdepL; + e->tfddwp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + + /* _f28l = _f95 */ + /* (X, key, X) -NMOD-> (X, X, X) -X-> (X, X, NN) */ + e->tfddwp->value[0] = _f28l; + // e->tfddwp->value[1] = label; + // e->tfddwp->value[2] = dir; + // e->tfddwp->value[3] = dir_gdep; + e->tfddwp->value[4] = govL; + e->tfddwp->value[5] = gdepP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + + /* _f29l = _f96 */ + /* (X, X, X) -NMOD-> (X, to, X) -X-> (X, X, NN) */ + e->tfddwp->value[0] = _f29l; + // e->tfddwp->value[1] = label; + // e->tfddwp->value[2] = dir; + // e->tfddwp->value[3] = dir_gdep; + e->tfddwp->value[4] = depL; + // e->tfddwp->value[5] = gdepP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfddwp); + } + + // linear features + /* (X, X, X) -> (X, depP, X) -> (X, gdepP, gdepPp1) */ + e->tfdppp->value[0] = _f42; + 
e->tfdppp->value[1] = label; + e->tfdppp->value[2] = dir; + e->tfdppp->value[3] = gdepP; + e->tfdppp->value[4] = gdepPp1; + e->tfdppp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (X, X, X) -> (X, depP, X) -> (gdepPm1, gdepP, X) */ + e->tfdppp->value[0] = _f43; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + // e->tfdppp->value[3] = gdepP; + e->tfdppp->value[4] = gdepPm1; + // e->tfdppp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (X, X, X) -> (X, depP, depPp1) -> (X, gdepP, X) */ + e->tfdppp->value[0] = _f44; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + // e->tfdppp->value[3] = gdepP; + e->tfdppp->value[4] = depPp1; + // e->tfdppp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (X, X, X) -> (depPm1, depP, X) -> (X, gdepP, X) */ + e->tfdppp->value[0] = _f45; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + // e->tfdppp->value[3] = gdepP; + e->tfdppp->value[4] = depPm1; + // e->tfdppp->value[5] = depP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (X, X, X) -> (depPm1, depP, X) -> (X, gdepP, gdepPp1) */ + e->tfdpppp->value[0] = _f46; + e->tfdpppp->value[1] = label; + e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = gdepP; + e->tfdpppp->value[4] = gdepPp1; + e->tfdpppp->value[5] = depP; + e->tfdpppp->value[6] = depPm1; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* (X, X, X) -> (depPm1, depP, X) -> (gdepPm1, gdepP, X) */ + e->tfdpppp->value[0] = _f47; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = gdepPm1; + e->tfdpppp->value[4] = gdepP; + // e->tfdpppp->value[5] = depP; + // e->tfdpppp->value[6] = depPm1; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* (X, X, X) -> (X, depP, 
depPp1) -> (X, gdepP, gdepPp1) */ + e->tfdpppp->value[0] = _f48; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = gdepPp1; + // e->tfdpppp->value[4] = gdepP; + // e->tfdpppp->value[5] = depP; + e->tfdpppp->value[6] = depPp1; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* (X, X, X) -> (X, depP, depPp1) -> (gdepPm1, gdepP, X) */ + e->tfdpppp->value[0] = _f49; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = gdepPm1; + // e->tfdpppp->value[4] = gdepP; + // e->tfdpppp->value[5] = depP; + // e->tfdpppp->value[6] = depPp1; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* (X, govP, X) -> (X, X, X) -> (X, gdepP, gdepPp1) */ + e->tfdppp->value[0] = _f50; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + e->tfdppp->value[3] = gdepPp1; + e->tfdppp->value[4] = gdepP; + e->tfdppp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (X, govP, X) -> (X, X, X) -> (gdepPm1, gdepP, X) */ + e->tfdppp->value[0] = _f51; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + e->tfdppp->value[3] = gdepPm1; + // e->tfdppp->value[4] = gdepP; + // e->tfdppp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (X, govP, govPp1) -> (X,X,X) -> (X, gdepP, X) */ + e->tfdppp->value[0] = _f52; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + e->tfdppp->value[3] = govPp1; + // e->tfdppp->value[4] = gdepP; + // e->tfdppp->value[5] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (govPm1, govP, X) -> (X, X, X) -> (X, gdepP, X) */ + e->tfdppp->value[0] = _f53; + // e->tfdppp->value[1] = label; + // e->tfdppp->value[2] = dir; + e->tfdppp->value[3] = govPm1; + // e->tfdppp->value[4] = gdepP; + // e->tfdppp->value[5] = govP; + fv->array[(fv->elt_nb)++] = 
maca_graph_parser_encode_feature_6(e->tfdppp); + + /* (govPm1, govP, X) -> (X, X, X) -> (X, gdepP, gdepPp1) */ + e->tfdpppp->value[0] = _f54; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = govPm1; + e->tfdpppp->value[4] = gdepPp1; + e->tfdpppp->value[5] = gdepP; + e->tfdpppp->value[6] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + // I think that I introduced a bug: + // e->dl1->value[0]= _f55; e->dl1->value[2]=gdepPm1; e->dl1->value[3]=gdepP;//e->dl1->value[4]=depPm1;e->dl1->value[5]=cldP; + + /* (govPm1, govP, X) -> (X, X, X) -> (gdepPm1, gdepP, X) */ + e->tfdpppp->value[0] = _f55; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + // e->tfdpppp->value[3] = govPm1; + e->tfdpppp->value[4] = gdepPm1; + // e->tfdpppp->value[5] = gdepP; + // e->tfdpppp->value[6] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + /* (X, govP, govPp1) -> (X, X, X) -> (X, gdepP, gdepPp1) */ + e->tfdpppp->value[0] = _f56; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + e->tfdpppp->value[3] = govPp1; + e->tfdpppp->value[4] = gdepPp1; + // e->tfdpppp->value[5] = gdepP; + // e->tfdpppp->value[6] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + // I think that I introduced a bug: + // e->dl1->value[0]= _f57; e->dl1->value[2]=gdepPm1; e->dl1->value[3]=gdepP; e->dl1->value[4]=govP; + // //e->dl1->value[5]=depPp1; + + /* (X, govP, govPp1) -> (X, X, X) -> (gdepPm1, gdepP, X) */ + e->tfdpppp->value[0] = _f57; + // e->tfdpppp->value[1] = label; + // e->tfdpppp->value[2] = dir; + // e->tfdpppp->value[3] = govPp1; + e->tfdpppp->value[4] = gdepPm1; + // e->tfdpppp->value[5] = gdepP; + // e->tfdpppp->value[6] = govP; + fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp); + + + /* subcat features */ + if(ctx->subcat_features){ + for(i=0; i < subcat_feats_nb; i++){ + 
feat_vector *sibling(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, int sblng, int label, feat_vector *fv){
  /**
   * Fill fv with the encoded sibling features of the arc gov -> dep
   * (labelled `label`) and of the sibling sblng.
   *
   * s:     sentence; its words[], lemmas[], pos[] arrays and length l are read.
   * ctx:   parser context; supplies the template library (ctx->e), the
   *        boundary symbols (pos_start, pos_end, w_start) and the
   *        use_full_forms / use_lemmas switches.
   * gov:   index of the governor in s.
   * dep:   index of the dependent in s.
   * sblng: index of the sibling in s, or -1 when dep has no sibling.
   * label: label of the arc, stored in slot 1 of every template.
   * fv:    vector to fill; when NULL a vector of SiblingFeatNb slots is
   *        allocated, otherwise fv->elt_nb is reset to 0 and the vector is
   *        overwritten.
   *
   * Returns fv (the freshly allocated vector when fv was NULL).
   *
   * Between 19 and 31 features are appended: 3 POS features, 6 more when
   * ctx->use_full_forms is set, 6 more when ctx->use_lemmas is set, and
   * 16 linear features. No bound check is done on fv->array here --
   * NOTE(review): SiblingFeatNb is presumably >= 31; confirm.
   *
   * The templates in ctx->e are shared scratch buffers: a slot keeps the
   * value written by an earlier feature until it is overwritten, and some
   * features below rely on that, so the order of the statements matters.
   */

  if(fv == NULL){
    fv = allocate_feat_vector(SiblingFeatNb);
  } else {
    /* reset */
    fv->elt_nb = 0;
  }

  maca_graph_parser_templ_library *e = ctx->e;

  /* direction of the arc: ra when the governor precedes the dependent */
  int dir = (gov < dep) ? ra : la;
  /* gov and dep */
  int govF = s->words[gov];
  int govL = s->lemmas[gov];
  int govP = s->pos[gov];
  int depF = s->words[dep];
  int depL = s->lemmas[dep];
  int depP = s->pos[dep];
  /* gov and dep +- 1; the boundary symbols stand in at the sentence edges */
  int govPm1 = (gov == 0) ? ctx->pos_start : s->pos[gov - 1]; // parent-pos-minus1
  int depPm1 = (dep == 0) ? ctx->pos_start : s->pos[dep - 1]; // child-pos-minus1
  int govPp1 = (gov == s->l - 1) ? ctx->pos_end : s->pos[gov + 1];
  int depPp1 = (dep == s->l - 1) ? ctx->pos_end : s->pos[dep + 1];
  /* sblng == -1 in case dep has no sibling */
  int sblP, sblF, sblL, sblPm1, sblPp1;
  if(sblng == -1){
    /* no sibling: every sibling field is replaced by a boundary symbol
       (the lemma uses w_start, like the form) */
    sblP = ctx->pos_start;
    sblF = ctx->w_start;
    sblL = ctx->w_start;
    sblPm1 = ctx->pos_start;
    sblPp1 = ctx->pos_end;
  } else {
    sblP = s->pos[sblng];
    sblF = s->words[sblng];
    sblL = s->lemmas[sblng];
    sblPm1 = (sblng == 0) ? ctx->pos_start : s->pos[sblng - 1];
    sblPp1 = (sblng == s->l - 1) ? ctx->pos_end : s->pos[sblng + 1];
  }
  /* distance between gov and dep, bucketed:
     0-1 -> di0, 2 -> d1, 3 -> d2, 4 -> d3, 5 -> d4, 6-10 -> d5, >10 -> d10 */
  int real_dist = abs(gov - dep);
  int dist;
  if (real_dist > 10)
    dist = d10;
  else if (real_dist > 5)
    dist = d5;
  else if (real_dist == 5)
    dist = d4;
  else if (real_dist == 4)
    dist = d3;
  else if (real_dist == 3)
    dist = d2;
  else if (real_dist == 2)
    dist = d1;
  else
    dist = di0;

  /* common values to all features: slot 1 = label, slot 2 = direction */
  e->tfdsppp->value[1] = label;
  e->tfdspp->value[1] = label;
  e->tfdsww->value[1] = label;
  e->tfdswp->value[1] = label;
  e->tfdppp->value[1] = label;
  e->tfdpppp->value[1] = label;

  e->tfdsppp->value[2] = dir;
  e->tfdspp->value[2] = dir;
  e->tfdsww->value[2] = dir;
  e->tfdswp->value[2] = dir;
  e->tfdppp->value[2] = dir;
  e->tfdpppp->value[2] = dir;

  /* slot 3 = distance bucket for the tfds* templates ... */
  e->tfdsppp->value[3] = dist;
  e->tfdspp->value[3] = dist;
  e->tfdsww->value[3] = dist;
  e->tfdswp->value[3] = dist;

  /* ... and the sibling POS for the linear templates */
  e->tfdppp->value[3] = sblP;
  e->tfdpppp->value[3] = sblP;


  /* dep gov sbl */
  /* (John, John, NNP) <- SBJ - (ate, eat, VB) - OBJ -> (apple, apple, NN) */

  /* (X, X, NNP) <- SBJ - (X, X, VB) - X -> (X, X, NN) */
  e->tfdsppp->value[0] = _f30;
  e->tfdsppp->value[4] = govP;
  e->tfdsppp->value[5] = depP;
  e->tfdsppp->value[6] = sblP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdsppp);

  /* (X, X, X) <- SBJ - (X, X, VB) - X -> (X, X, NN) */
  e->tfdspp->value[0] = _f31;
  e->tfdspp->value[4] = govP;
  e->tfdspp->value[5] = sblP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdspp);

  /* (X, X, NNP) <- SBJ - (X, X, X) - X -> (X, X, NN) */
  e->tfdspp->value[0] = _f32;
  e->tfdspp->value[4] = depP;
  e->tfdspp->value[5] = sblP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdspp);

  /* word-form features, only when the model uses full forms */
  if(ctx->use_full_forms){
    /* (X, X, X) <- SBJ - (ate, X, X) - X -> (apple, X, X) */
    e->tfdsww->value[0] = _f33;
    e->tfdsww->value[4] = govF;
    e->tfdsww->value[5] = sblF;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdsww);

    /* (John, X, X) <- SBJ - (X, X, X) - X -> (apple, X, X) */
    e->tfdsww->value[0] = _f34;
    e->tfdsww->value[4] = depF;
    e->tfdsww->value[5] = sblF;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdsww);

    /* (X, X, X) <- SBJ - (X, X, VB) - X -> (apple, X, X) */
    e->tfdswp->value[0] = _f35;
    e->tfdswp->value[4] = sblF;
    e->tfdswp->value[5] = govP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);

    /* (X, X, NNP) <- SBJ - (X, X, X) - X -> (apple, X, X) */
    e->tfdswp->value[0] = _f36;
    e->tfdswp->value[4] = sblF;
    e->tfdswp->value[5] = depP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);

    /* (X, X, X) <- SBJ - (ate, X, X) - X -> (X, X, P) */
    e->tfdswp->value[0] = _f37;
    e->tfdswp->value[4] = govF;
    e->tfdswp->value[5] = sblP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);

    /* (John, X, X) <- SBJ - (X, X, X) - X -> (X, X, P) */
    e->tfdswp->value[0] = _f38;
    e->tfdswp->value[4] = depF;
    e->tfdswp->value[5] = sblP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);
  }

  // sibling word only could be tried
  // lemmas: same six templates as the word-form features, with lemmas
  // in place of forms
  if(ctx->use_lemmas){
    /* f[97..102] */

    /* (X, X, X) <- SBJ - (X, eat, X) - X -> (X, apple, X) */
    e->tfdsww->value[0] = _f33l;
    e->tfdsww->value[4] = govL;
    e->tfdsww->value[5] = sblL;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdsww);

    /* (X, John, X) <- SBJ - (X, X, X) - X -> (X, apple, X) */
    e->tfdsww->value[0] = _f34l;
    e->tfdsww->value[4] = depL;
    e->tfdsww->value[5] = sblL;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdsww);

    /* (X, X, X) <- SBJ - (X, X, VB) - X -> (X, apple, X) */
    e->tfdswp->value[0] = _f35l;
    e->tfdswp->value[4] = sblL;
    e->tfdswp->value[5] = govP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);

    /* (X, X, NNP) <- SBJ - (X, X, X) - X -> (X, apple, X) */
    e->tfdswp->value[0] = _f36l;
    e->tfdswp->value[4] = sblL;
    e->tfdswp->value[5] = depP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);

    /* (X, X, X) <- SBJ - (X, eat, X) - X -> (X, X, NN) */
    e->tfdswp->value[0] = _f37l;
    e->tfdswp->value[4] = govL;
    e->tfdswp->value[5] = sblP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);

    /* (X, John, X) <- SBJ - (X, X, X) - X -> (X, X, NN) */
    e->tfdswp->value[0] = _f38l;
    e->tfdswp->value[4] = depL;
    e->tfdswp->value[5] = sblP;
    fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdswp);
  }

  /* linear features */
  /* (depPm1, depP, depPp1) <-L- (govPm1, govP, govPp1) -X-> (sblPm1, sblP, sblPp1) */
  /* slot 3 of tfdppp / tfdpppp already holds sblP (set above) */

  /* (X, X, X) <-L- (X, govP, X) -X-> (X, sblP, sblPp1) */
  e->tfdppp->value[0] = _f58;
  e->tfdppp->value[4] = sblPp1;
  e->tfdppp->value[5] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* _f59 to _f61 reuse slot 5 (govP) as written for _f58; the assignments
     are left commented out to show which value is still in place */

  /* (X, X, X) <-L- (X, govP, X) -X-> (sblPm1, sblP, X) */
  e->tfdppp->value[0] = _f59;
  e->tfdppp->value[4] = sblPm1;
  // e->tfdppp->value[5] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* (X, X, X) <-L- (X, govP, govPp1) -X-> (X, sblP, X) */
  e->tfdppp->value[0] = _f60;
  e->tfdppp->value[4] = govPp1;
  // e->tfdppp->value[5] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* (X, X, X) <-L- (govPm1, govP, X) -X-> (X, sblP, X) */
  e->tfdppp->value[0] = _f61;
  e->tfdppp->value[4] = govPm1;
  // e->tfdppp->value[5] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* different from paper: no govP in paper */
  /* (X, X, X) <-L- (govPm1, govP, X) -X-> (X, sblP, sblPp1) */
  e->tfdpppp->value[0] = _f62;
  e->tfdpppp->value[4] = sblPp1;
  e->tfdpppp->value[5] = govPm1;
  e->tfdpppp->value[6] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* different from paper: no govP in paper */
  /* (X, X, X) <-L- (govPm1, govP, X) -X-> (sblPm1, sblP, X) */
  e->tfdpppp->value[0] = _f63;
  e->tfdpppp->value[4] = sblPm1;
  e->tfdpppp->value[5] = govPm1;
  e->tfdpppp->value[6] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* different from paper: no govPp1 in paper */
  /* (X, X, X) <-L- (X, govP, govPp1) -X-> (X, sblP, sblPp1) */
  e->tfdpppp->value[0] = _f64;
  e->tfdpppp->value[4] = sblPp1;
  e->tfdpppp->value[5] = govPp1;
  e->tfdpppp->value[6] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* both in paper and here */
  /* (X, X, X) <-L- (X, govP, govPp1) -X-> (sblPm1, sblP, X) */
  e->tfdpppp->value[0] = _f65;
  e->tfdpppp->value[4] = sblPm1;
  e->tfdpppp->value[5] = govPp1;
  e->tfdpppp->value[6] = govP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* (X, depP, X) <-L- (X, X, X) -X-> (X, sblP, sblPp1) */
  e->tfdppp->value[0] = _f66;
  e->tfdppp->value[4] = sblPp1;
  e->tfdppp->value[5] = depP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* (X, depP, X) <-L- (X, X, X) -X-> (sblPm1, sblP, X) */
  e->tfdppp->value[0] = _f67;
  e->tfdppp->value[4] = sblPm1;
  e->tfdppp->value[5] = depP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* different from paper: no label in paper */
  /* (X, depP, depPp1) <-L- (X, X, X) -X-> (X, sblP, X) */
  e->tfdppp->value[0] = _f68;
  e->tfdppp->value[4] = depPp1;
  e->tfdppp->value[5] = depP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* different from paper: no label in paper */
  /* (depPm1, depP, X) <-L- (X, X, X) -X-> (X, sblP, X) */
  e->tfdppp->value[0] = _f69;
  e->tfdppp->value[4] = depPm1;
  e->tfdppp->value[5] = depP;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_6(e->tfdppp);

  /* different from paper: no label in paper */
  /* (depPm1, depP, X) <-L- (X, X, X) -X-> (X, sblP, sblPp1) */
  e->tfdpppp->value[0] = _f70;
  e->tfdpppp->value[4] = sblPp1;
  e->tfdpppp->value[5] = depP;
  e->tfdpppp->value[6] = depPm1;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* different from paper: no label in paper */
  /* (depPm1, depP, X) <-L- (X, X, X) -X-> (sblPm1, sblP, X) */
  e->tfdpppp->value[0] = _f71;
  e->tfdpppp->value[4] = sblPm1;
  e->tfdpppp->value[5] = depP;
  e->tfdpppp->value[6] = depPm1;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* different from paper: no label in paper */
  /* (X, depP, depPp1) <-L- (X, X, X) -X-> (X, sblP, sblPp1) */
  e->tfdpppp->value[0] = _f72;
  e->tfdpppp->value[4] = sblPp1;
  e->tfdpppp->value[5] = depP;
  e->tfdpppp->value[6] = depPp1;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* different from paper: no label in paper */
  /* (X, depP, depPp1) <-L- (X, X, X) -X-> (sblPm1, sblP, X) */
  e->tfdpppp->value[0] = _f73;
  e->tfdpppp->value[4] = sblPm1;
  e->tfdpppp->value[5] = depP;
  e->tfdpppp->value[6] = depPp1;
  fv->array[(fv->elt_nb)++] = maca_graph_parser_encode_feature_7(e->tfdpppp);

  /* _f75, _f76: morphological features (fields FEAT and PFEAT of the CONLL format) */
  /* Not ported: the block below is inactive (Java-style) code kept for reference. */
  /*
    if (feats == null)
    return;

    feature_t l;

    short[] featsP = feats[dep];
    short[] featsSbl = sblng != -1 ? feats[sblng] : null;

    dlf.v0 = _f75;
    dlf.v1 = label;
    dlf.v2 = sblP;
    dlf.v3 = depP;
    extractFeat(f, dir, featsP, featsSbl);

    featsP = feats[gov];
    featsSbl = sblng != -1 ? feats[sblng] : null;

    dlf.v0 = _f76;
    dlf.v1 = label;
    dlf.v2 = govP;
    dlf.v3 = sblP;
    if (featsP != null && featsSbl != null) {
    for (short i1 = 0; i1 < featsP.length; i1++) {
    for (short i2 = 0; i2 < featsSbl.length; i2++) {
    dlf.v4 = featsP[i1];
    dlf.v5 = featsSbl[i2];
    l = maca_graph_parser_encode_feature_6(dlf);
    l = dlf.maca_graph_parser_encode_feature_s(e->s_dir, gov < sblng ? 1 : 2, l);
    fv->array[(fv->elt_nb)++] = l2i(l);
    }
    }
    } else if (featsP == null && featsSbl != null) {

    for (short i2 = 0; i2 < featsSbl.length; i2++) {
    dlf.v4 = nofeat;
    dlf.v5 = featsSbl[i2];
    l = maca_graph_parser_encode_feature_6(dlf);
    l = dlf.maca_graph_parser_encode_feature_s(e->s_dir, dir, l);
    fv->array[(fv->elt_nb)++] = l2i(l);
    }

    } else if (featsP != null && featsSbl == null) {

    for (short i1 = 0; i1 < featsP.length; i1++) {
    dlf.v4 = featsP[i1];
    dlf.v5 = nofeat;
    l = maca_graph_parser_encode_feature_6(dlf);
    l = dlf.maca_graph_parser_encode_feature_s(e->s_dir, dir, l);
    fv->array[(fv->elt_nb)++] = l2i(l);
    }
    }*/

  return fv;

}
+ + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_FEATURES__ +#define __MACA_GRAPH_PARSER_FEATURES__ + +#include"maca_graph_parser.h" +#include"maca_graph_parser_sentence.h" +#include "maca_graph_parser_feature_vector.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +feat_vector *first(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, int label, feat_vector *fv); +feat_vector *basic(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, feat_vector *fv); +feat_vector *grandchildren(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, int gdep, int label, feat_vector *fv); +feat_vector *sibling(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, int sblng, int label, feat_vector *fv); +feat_vector *subcat(maca_graph_parser_sentence *s, maca_graph_parser_ctx *ctx, int gov, int dep, int label, feat_vector *fv); + +void maca_graph_parser_print_feature_bin(FILE *f, feature_t feat); +void maca_graph_parser_print_feature(FILE *f, maca_graph_parser_ctx *ctx, feature_t feat); + +int maca_graph_parser_get_feature_type(feature_t feat, maca_graph_parser_templ_library *tl); +templ *maca_graph_parser_get_templ(feature_t feat, maca_graph_parser_templ_library *tl); +int 
maca_graph_parser_get_feature_label(feature_t feat, maca_graph_parser_templ_library *tl); +int maca_graph_parser_get_feature_direction(feature_t feat, maca_graph_parser_templ_library *tl); +feature_t maca_graph_parser_get_feature_hash_key(feature_t feat, maca_graph_parser_templ_library *tl); +void maca_graph_parser_decompose_feature(feature_t feat, int *direction, int *label, feature_t *hash_key, maca_graph_parser_templ_library *tl); + +maca_graph_parser_templ_library *maca_graph_parser_templ_library_allocator(maca_graph_parser_ctx *ctx); +void maca_graph_parser_templ_library_free(maca_graph_parser_templ_library *tl); +#ifdef __cplusplus +} +#endif + + +#endif + + diff --git a/maca_graph_parser/maca_graph_parser_hash.c b/maca_graph_parser/maca_graph_parser_hash.c new file mode 100644 index 0000000000000000000000000000000000000000..264f01ef131b5fa29d79d37f68fc1a2b2f4f197e --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_hash.c @@ -0,0 +1,379 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + Frederic Bechet <frederic.bechet@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. 
/**
 * Return the smallest prime number strictly greater than maxi.
 *
 * Any maxi below 2 (including zero and negative values) yields 2.
 *
 * Candidates above maxi are tested one by one by trial division, so the
 * cost depends on the distance to the next prime and on sqrt(maxi), not
 * on maxi itself.
 *
 * The previous implementation enumerated every integer from 2 and, when
 * maxi was itself prime, restarted from maxi+1; that restart skipped a
 * prime whenever maxi+1 was prime too (inputs 1 and 2 both returned 5),
 * and maxi == 0 returned 1, which is not prime.
 *
 * If no prime greater than maxi fits in an int, an error is printed on
 * stderr and the program exits with status 1.
 */
int prem_premier(int maxi)
{
  /* INT_MAX, computed locally so that no extra header is needed */
  const int int_max = (int)(~0u >> 1);
  int candidate;
  int divisor;
  int is_prime;

  if (maxi < 2) return 2;

  candidate = maxi;
  while (candidate < int_max)
    {
      candidate++;
      is_prime = 1;
      /* divisor <= candidate / divisor is divisor*divisor <= candidate
         without the risk of overflowing the product */
      for (divisor = 2; divisor <= candidate / divisor; divisor++)
	{
	  if (candidate % divisor == 0)
	    {
	      is_prime = 0;
	      break;
	    }
	}
      if (is_prime) return candidate;
    }

  fprintf(stderr, "prem_premier: no prime greater than %d fits in an int\n", maxi);
  exit(1);
}
*/ + /*printf("Le premier nombre premier superieur a %d est %d\n",maxi,last);*/ + return last; +} + +/*----------------------------------------------------------------*/ +/* low level functions */ + +static inline int lookup_node(maca_graph_parser_hash *t, feature_t clef){ + /** + * Get the index where clef should be or should be put in t. + */ + int len_t = t->taille; + int indice = hash_func(clef,len_t); + + int essai; + for(essai=1; (essai<=len_t) && (!CaseVide(indice,t->table_clef)); essai++){ + if(SiEgal(clef,indice,t->table_clef)){ + /* fprintf(stderr, "%d\n", essai); */ + return indice; + } + indice = (indice + 1) % len_t; + } + /* fprintf(stderr, "%d\n", essai); */ + + return indice; +} + + +static inline int insert_node(maca_graph_parser_hash *t, int node_index, feature_t clef, float valeur, float valeur2){ + /** + * Insert clef: (valeur, valeur2) in t at node_index. + * Returns TRUE if clef did not already exist. + */ + + int already_exists = SiEgal(clef,node_index,t->table_clef); + + if(!already_exists){ + t->nbelem++; + t->table_clef[node_index]=clef; + } + + t->params[node_index]=valeur; + t->total[node_index]=valeur2; + + return !already_exists; +} + + +/*----------------------------------------------------------------*/ +/* high level functions */ + +int recherche_hash_index(maca_graph_parser_hash *t, feature_t clef){ + /** + * Returns the index of the node where clef is in t, or -1 is clef is absent. + */ + int node_index = lookup_node(t, clef); + return (SiEgal(clef, node_index, t->table_clef)) + ? node_index + : -1; +} + + +float *recherche_hash(maca_graph_parser_hash *t, feature_t clef){ + /** + * Get the value associated to key clef in table table. + * + * This function is the hotspot of the parser (as of 2014-06). 
+ */ + + /* basic version */ + /* + int node_index = lookup_node(t, clef); + if(!CaseVide(node_index,t->table_clef)){ + if(SiEgal(clef,node_index,t->table_clef)){ + return &(t->params[node_index]); + } + return NULL; + } + return NULL; + */ + + /* optimized version (manual inlining ++) */ + /* code adapted from lookup_node */ + int len_t = t->taille; + int indice = hash_func(clef,len_t); + + int essai; + for(essai=1; (essai<=len_t) && (!CaseVide(indice,t->table_clef)); essai++){ + if(SiEgal(clef,indice,t->table_clef)){ + // fprintf(stderr, "%d\n", essai); + return &(t->params[indice]); + } + indice = (indice + 1) % len_t; + } +// fprintf(stderr, "%d\n", essai); + return NULL; +} + + +int range_hash(maca_graph_parser_hash *t, feature_t clef, float valeur, float valeur2){ + /** + * Insert clef: (valeur, valeur2) in t. + * Returns -1 if t was already full, 0 if clef already existed + * and 1 if the insertion succeeded. + */ + + int node_index = lookup_node(t, clef); + + if(!CaseVide(node_index,t->table_clef)){ + if(!SiEgal(clef,node_index,t->table_clef)){ + int len_t = t->taille; + fprintf(stdout,"Hash table full (size = %d).\n", len_t); + return -1; + } + return 0; /* key already in */ + } + + return insert_node(t, node_index, clef, valeur, valeur2); +} + + +/*----------------------------------------------------------------*/ +maca_graph_parser_hash *load_table(FILE *f) +{ + maca_graph_parser_hash *t = malloc(sizeof(maca_graph_parser_hash)); + fread(&(t->taille), sizeof(int), 1, f); + fread(&(t->nbelem), sizeof(int), 1, f); + + t->table_clef = malloc((size_t)t->taille * sizeof(feature_t)); + t->params = malloc((size_t)t->taille * sizeof(float)); + t->total = NULL; + fread(t->table_clef, sizeof(feature_t), (size_t)t->taille, f); + fread(t->params, sizeof(float), (size_t)t->taille, f); + return t; +} + +/*----------------------------------------------------------------*/ + +void dump_table(maca_graph_parser_hash *t, FILE *f) +{ + fwrite(&(t->taille), sizeof(int), 1, 
f); + fwrite(&(t->nbelem), sizeof(int), 1, f); + fwrite(t->table_clef, sizeof(feature_t), (size_t)t->taille, f); + fwrite(t->params, sizeof(float), (size_t)t->taille, f); +} + + +/*----------------------------------------------------------------*/ + +maca_graph_parser_hash *creation_table(int nbelem, float coeff) +{ + int i; + maca_graph_parser_hash *t = malloc(sizeof(maca_graph_parser_hash)); + + t->nbelem = 0; + /* t->taille = prem_premier((int)((float)nbelem/coeff)); */ + t->taille = (int)((double)nbelem/coeff); // cast to double: prevent non-portable "excess precision" + + t->params = malloc(sizeof(float) * (size_t)t->taille); + t->total = malloc(sizeof(float) * (size_t)t->taille); + t->table_clef = malloc(sizeof(feature_t) * (size_t)t->taille); + + if((t->params == NULL) || (t->total == NULL) || (t->table_clef == NULL)){ + fprintf(stderr, "memory allocation error!\n"); + exit(1); + } + + for(i = 0; i < t->taille; i++) { + t->table_clef[i]=VIDE; + t->params[i]=0; + t->total[i]=0; + } + return t; +} + +void free_table(maca_graph_parser_hash *t) +{ + free(t->params); + free(t->table_clef); + + if(t->total) + free(t->total); + + free(t); +} + + +maca_graph_parser_feature_weight_table *feat_hash2feat_array(maca_graph_parser_hash *t) +{ + int i, j = 0; + + maca_graph_parser_feature_weight_table *table = malloc(sizeof(maca_graph_parser_feature_weight_table)); + table->size = 0; + + for(i = 0; i < t->taille; i++) + if((t->table_clef[i] != VIDE) && (t->params[i] != 0)) + table->size++; + + table->features = malloc(table->size * sizeof(feature_t)); + table->weights = malloc(table->size * sizeof(float)); + for(i = 0; i < t->taille; i++){ + if((t->table_clef[i] != VIDE) && (t->params[i] != 0)){ + table->features[j] = t->table_clef[i]; + table->weights[j] = t->params[i]; + j++; + } + } + + return table; +} + + +/*----------------------------------------------------------------*/ +maca_graph_parser_feature_weight_table *load_feature_weight_table(FILE *f) +{ + 
maca_graph_parser_feature_weight_table *t = malloc(sizeof(maca_graph_parser_feature_weight_table)); + /* size */ + fread(&(t->size), sizeof(uint32_t), 1, f); + /* features */ + t->features = malloc(t->size * sizeof(feature_t)); + fread(t->features, sizeof(feature_t), t->size, f); + /* weights */ + t->weights = malloc(t->size * sizeof(float)); + fread(t->weights, sizeof(float), t->size, f); + + return t; +} + + +void feature_weight_table_free(maca_graph_parser_feature_weight_table *t) +{ + free(t->features); + free(t->weights); + free(t); +} + + +void dump_feature_weight_table(maca_graph_parser_feature_weight_table *t, FILE *f) +{ + fwrite(&(t->size), sizeof(uint32_t), 1, f); + fwrite(t->features, sizeof(feature_t), t->size, f); + fwrite(t->weights, sizeof(float), t->size, f); +} + +/*----------------------------------------------------------------*/ + + +float *recherche_dicho(void *table, feature_t f) +{ + maca_graph_parser_feature_weight_table *t = table; + int first = 0; /* Indice du premier élément du sous-tableau analysé */ + int last = (int)t->size - 1; /* Indice du dernier élément du sous-tableau analysé */ + int middle; /* Indice de l'élément du milieu du sous-tableau analysé */ + /* Tant qu'on a pas trouve l'élément recherché ou que le sous-tableau */ + /* contient plus de 1 élément */ + while(first <= last) + { + /* Calcul de la position de l'élément du milieu */ + middle=(first+last)/2; + /* Si l'élément du milieu est l'élément recherché */ + + if(t->features[middle] == f){ + return &(t->weights[middle]); + } + else + { + /* Si la valeur recherchée est plus petite */ + /* que la valeur du l'élément du milieu */ + /* Alors on regarde le sous-tableau de gauche */ + if(t->features[middle] > f) + last = middle -1; + /* sinon on regarde le sous-tableau de droite*/ + else first = middle +1; + } + } + return NULL; +} + + +maca_graph_parser_hash *feat_array2feat_hash(maca_graph_parser_feature_weight_table *feat_array, float hash_fill_rate){ + + /* 
printf("building hash table\n"); */ + maca_graph_parser_hash *feat_ht = creation_table(feat_array->size, hash_fill_rate); + size_t i; + for(i=0; i < feat_array->size; i++){ + range_hash(feat_ht, feat_array->features[i], feat_array->weights[i], 0); + } + /* printf("building hash table : done (%u elements)\n", i); */ + return feat_ht; +} + + + diff --git a/maca_graph_parser/maca_graph_parser_hash.h b/maca_graph_parser/maca_graph_parser_hash.h new file mode 100644 index 0000000000000000000000000000000000000000..2ca12324bd78e6ac12d97ab101e383a73c25b491 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_hash.h @@ -0,0 +1,65 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + Frederic Bechet <frederic.bechet@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + + +#ifndef __MACA_GRAPH_PARSER_HASH__ +#define __MACA_GRAPH_PARSER_HASH__ + +#include "maca_graph_parser.h" + +#define VIDE 0 + + +#ifdef __cplusplus +extern "C"{ +#endif + + +/* hash table */ +int hash_func(feature_t clef, int taille); +/* high level functions */ +int recherche_hash_index(maca_graph_parser_hash *t, feature_t clef); +float *recherche_hash(maca_graph_parser_hash *t, feature_t clef); +int range_hash(maca_graph_parser_hash *t, feature_t clef, float valeur, float valeur2); + + +/* persistency */ +maca_graph_parser_hash *load_table(FILE *f); +void dump_table(maca_graph_parser_hash *t, FILE *f); +/* init and free */ +maca_graph_parser_hash *creation_table(int nbelem, float coeff); +void free_table(maca_graph_parser_hash *t); +/* feature_weight_table */ +maca_graph_parser_feature_weight_table *load_feature_weight_table(FILE *f); +void dump_feature_weight_table(maca_graph_parser_feature_weight_table *t, FILE *f); +void feature_weight_table_free(maca_graph_parser_feature_weight_table *t); +float *recherche_dicho(void *t, feature_t f); +/* conversion */ +maca_graph_parser_feature_weight_table *feat_hash2feat_array(maca_graph_parser_hash *t); +maca_graph_parser_hash *feat_array2feat_hash(maca_graph_parser_feature_weight_table *feat_array, float hash_fill_rate); + +/* dead */ +maca_graph_parser_feature_weight_table *maca_graph_parser_hash_compress(maca_graph_parser_hash *t); +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_heapq.c b/maca_graph_parser/maca_graph_parser_heapq.c new file mode 100644 index 0000000000000000000000000000000000000000..e46435fa8b4dbeb9bac8b881473a297594b771a2 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_heapq.c @@ -0,0 +1,210 @@ +/******************************************************************************* + Copyright (C) 2013 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Mathieu Morey 
<mathieu.morey@lif.univ-mrs.fr> + Frederic Bechet <frederic.bechet@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> + +#include "maca_graph_parser_heapq.h" + + +void heap_swap(heap *h, const int a, const int b){ + // swap values + float tmp_value = h->values[a]; + h->values[a] = h->values[b]; + h->values[b] = tmp_value; + // swap objects + void *tmp_object = h->objects[a]; + h->objects[a] = h->objects[b]; + h->objects[b] = tmp_object; +} + +/** + * n-max + */ +/* nmax: private functions */ +void heap_heapify_min(heap *h, int index){ + const int left = index << 1; + const int right = left + 1; + int largest = index; + if(left < h->num && h->values[left] < h->values[index]) largest = left; + if(right < h->num && h->values[right] < h->values[largest]) largest = right; + if(largest != index){ + heap_swap(h, index, largest); + heap_heapify_min(h, largest); + } +} + +void heap_build_min(heap *h){ + int index = (h->num - 1) >> 1; + while(index > 0){ + heap_heapify_min(h, index); + index >>= 1; + } + heap_heapify_min(h, index); +} + +void heap_extract_min(heap *h){ + h->values[0] = h->values[h->num - 1]; + h->objects[0] = h->objects[h->num -1]; + h->num--; + heap_heapify_min(h, 0); +} 
+ +/* nmax: public functions */ +void heap_sort_nmax(heap *h){ + const int saved_num = h->num; + int index = h->num - 1; + while(index > 0){ + heap_swap(h, 0, index); + h->num--; + heap_heapify_min(h, 0); + index--; + } + h->num = saved_num; +} + +void heap_insert_nmax(heap *h, float value, void *object){ + if(h->num >= h->max_size){ + if(value < h->values[0]) return; + heap_extract_min(h); + } + int index = h->num; + h->values[index] = value; + h->objects[index] = object; + h->num += 1; + int parent = index >> 1; + while(index > 0 && h->values[parent] > h->values[index]){ + heap_swap(h, index, parent); + index = parent; + parent >>= 1; + } +} + +/** + * n-min + */ +/* nmin: private functions */ +void heap_heapify_max(heap *h, int index){ + const int left = index << 1; + const int right = left + 1; + int largest = index; + if(left < h->num && h->values[left] > h->values[index]) largest = left; + if(right < h->num && h->values[right] > h->values[largest]) largest = right; + if(largest != index) { + heap_swap(h, index, largest); + heap_heapify_max(h, largest); + } +} + +void heap_build_max(heap *h){ + int index = (h->num - 1) >> 1; + while(index > 0){ + heap_heapify_max(h, index); + index >>= 1; + } + heap_heapify_max(h, index); +} + +void heap_extract_max(heap *h){ + h->values[0] = h->values[h->num - 1]; + h->objects[0] = h->objects[h->num - 1]; + h->num--; + heap_heapify_max(h, 0); +} + +/* nmin: public functions */ +void heap_sort_nmin(heap *h){ + const int saved_num = h->num; + int index = h->num - 1; + while(index > 0){ + heap_swap(h, 0, index); + h->num--; + heap_heapify_max(h, 0); + index--; + } + h->num = saved_num; +} + +void heap_insert_nmin(heap *h, float value, void *object){ + if(h->num >= h->max_size){ + if(value > h->values[0]) return; + heap_extract_max(h); + } + int index = h->num; + h->values[index] = value; + h->objects[index] = object; + h->num += 1; + int parent = index >> 1; + while(index > 0 && h->values[parent] < h->values[index]){ + 
heap_swap(h, index, parent); + index = parent; + parent >>= 1; + } +} + +/** + * Common functions to nmin and nmax. + */ +heap *heap_create(const int max_size, const float dft_value){ + int i; + heap *h = malloc(sizeof(heap)); + h->num = 0; + h->max_size = max_size; + h->dft_value = dft_value; + h->values = malloc(max_size * sizeof(float)); + h->objects = malloc(max_size * sizeof(void *)); + /* init */ + for (i=0; i<max_size; i++){ + h->values[i] = dft_value; + h->objects[i] = NULL; + } + return h; +} + +void heap_destroy(heap* h){ + int i; + /* reset */ + for (i=0; i<h->max_size; i++){ + h->values[i] = 0; + h->objects[i] = NULL; + } + h->num = 0; + h->max_size = 0; + h->dft_value = 0; + free(h->values); + free(h->objects); + free(h); +} + +void *heap_get(heap *h, const int index){ + return h->objects[index]; +} + +float heap_get_value(heap *h, const int index){ + return h->values[index]; +} + +int heap_size(heap *h){ + return h->num; +} + +void heap_clear(heap *h){ + h->num = 0; +} diff --git a/maca_graph_parser/maca_graph_parser_heapq.h b/maca_graph_parser/maca_graph_parser_heapq.h new file mode 100644 index 0000000000000000000000000000000000000000..daf7f3e252df9d84045b32a729a2479d73399913 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_heapq.h @@ -0,0 +1,52 @@ +typedef struct { + int num; + int max_size; + float dft_value; + float *values; + void **objects; +} heap; + +#ifdef __cplusplus +extern "C"{ +#endif + +void heap_swap(heap *h, const int a, const int b); + +// n-max +void heap_heapify_min(heap *h, int index); + +void heap_build_min(heap *h); + +void heap_extract_min(heap *h); + +void heap_sort_nmax(heap *h); + +void heap_insert_nmax(heap *h, float value, void *object); + +// n-min +void heap_heapify_max(heap *h, int index); + +void heap_build_max(heap *h); + +void heap_extract_max(heap *h); + +void heap_sort_nmin(heap *h); + +void heap_insert_nmin(heap *h, float value, void *object); + +// common +heap *heap_create(const int max_size, const float 
dft_value); + +void heap_destroy(heap* h); + +void *heap_get(heap *h, const int index); + +float heap_get_value(heap *h, const int index); + +int heap_size(heap *h); + +void heap_clear(heap *h); + + #ifdef __cplusplus +} +#endif diff --git a/maca_graph_parser/maca_graph_parser_hyperdecoder.c b/maca_graph_parser/maca_graph_parser_hyperdecoder.c new file mode 100644 index 0000000000000000000000000000000000000000..50b98ec4c2f720aeaac8514c1e9f8f522a7a6150 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_hyperdecoder.c @@ -0,0 +1,1223 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#include<stdlib.h> +#include<stdio.h> +#include<time.h> + +#include"maca_constants.h" +#include"maca_msg.h" +//#include"maca_lex.h" + +#include"maca_graph_parser_features.h" +#include"maca_graph_parser_feature_table.h" +#include"maca_graph_parser_hash.h" +#include "maca_graph_parser_model.h" +#include"maca_graph_parser.h" +/* kbest */ +#include "maca_graph_parser_hyperdecoder.h" +/* end kbest */ +#include "maca_graph_parser_dep_count_table.h" + +/*-------------------------------------------------------------------------------------------*/ +/*-------------------------------------------------------------------------------------------*/ + +/** + * Global state + */ + +/* vertices */ +/* CLOSED: start, end, direction of the attachment (left or right) */ +/* + Vertex *CLOSED + [MACA_MAX_LENGTH_SENTENCE] + [MACA_MAX_LENGTH_SENTENCE] + [2] + ; +*/ +/* OPEN: start, end, direction of the attachment (left or right), label */ +/* + Vertex *OPEN + [MACA_MAX_LENGTH_SENTENCE] + [MACA_MAX_LENGTH_SENTENCE] + [2] + [NB_LABELS] + ; +*/ + +/* k-best derivations with backpointers */ +/* + vec_Dbp *CDERIV + [MACA_MAX_LENGTH_SENTENCE] + [MACA_MAX_LENGTH_SENTENCE] + [2] + ; +*/ + +/* + vec_Dbp *ODERIV + [MACA_MAX_LENGTH_SENTENCE] + [MACA_MAX_LENGTH_SENTENCE] + [2] + [NB_LABELS] + ; +*/ + +/** + * Util funcs + */ + +cand_id *alloc_cand_id(int bs_i, int j[2]){ + /** + * Allocate a candidate identifier + */ + cand_id *res = malloc(sizeof(cand_id)); + if(res == NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + res->bs_i = bs_i; + res->j[0] = j[0]; + res->j[1] = j[1]; + return res; +} + + +void free_cand_id(cand_id *c){ + /** + * Free a candidate ID + */ + if(!c) + return; + free(c); +} + + +vec_Dbp *alloc_vec_Dbp(int capacity){ + /** + * Allocate a vector of derivations with backpointers + */ + + vec_Dbp *res = malloc(sizeof(vec_Dbp)); + if(res == NULL){ + fprintf(stderr, "Mem 
prob\n"); + exit(1); + } + + res->num = 0; + res->capacity = capacity; + res->elts = malloc(capacity * sizeof(DerivBP *)); + if(res->elts == NULL){ + fprintf(stderr, "Mem prob\n"); + exit(1); + } + + return res; +} + +void free_vec_Dbp(vec_Dbp *vd){ + /** + * Free a vector of derivations with backpointers + */ + if(!vd) + return; + + vd->num = 0; + vd->capacity = 0; + free(vd->elts); + free(vd); +} + +void vec_Dbp_append(vec_Dbp *vd, DerivBP *dbp){ + /** + * Append to a vector of derivations with backpointers + */ + if(vd->num >= vd->capacity){ + fprintf(stderr, "Cannot append: Vector full\n"); + return; + } + vd->elts[vd->num] = dbp; + vd->num += 1; +} + +void reset_derivBP(DerivBP *dbp){ + /** + * Reset a derivation with backpointers. + */ + dbp->weight = MINF; + dbp->e = NULL; + dbp->j[0] = 0; + dbp->j[1] = 0; +} + +/* end utils */ + + +/** + * Decoder-specific helpers + */ + +vec_Dbp *get_vec_Dbp(maca_graph_parser_ctx *ctx, Vertex *v){ + /** + * Get the vector of k-best derivations (with backpointers) of v. + */ + + VertexSignature *vs = v->vsign; + int start; + int end; + int dir; + int label; + vec_Dbp *res; + + if(vs->type == TYP_OPEN){ + start = vs->open.start; + end = vs->open.end; + dir = vs->open.dir; + label = vs->open.label; + res = ctx->ODERIV[start][end][dir][label]; + } else if(vs->type == TYP_CLOSED){ + start = vs->closed.start; + end = vs->closed.end; + dir = vs->closed.dir; + res = ctx->CDERIV[start][end][dir]; + } else { + fprintf(stderr, "not implemented yet\n"); + exit(1); + } + + return res; +} + + +float eval_weight(maca_graph_parser_ctx *ctx, Hyperarc *e, int j[2], + maca_graph_parser_feature_table *feat_table){ + /** + * Evaluate the weight of a potential derivation with backpointers. 
+ */ + float w_D0; + float w_D1; + VertexSignature *vs = e->head->vsign; + SgOpen h; + float res; + + /* the weight of a derivation is a function of the weights of + * the subderivations + */ + + /* weight of the j[i]-th derivation for T_i ; 0 for closed of span 0 */ + w_D0 = (e->tail[0]) ? get_vec_Dbp(ctx, e->tail[0])->elts[j[0]]->weight : 0 ; + w_D1 = (e->tail[1]) ? get_vec_Dbp(ctx, e->tail[1])->elts[j[1]]->weight : 0 ; + + if(vs->type == TYP_OPEN){ + /* w_open = w(subd0) + w(subd1) + w(head) */ + h = vs->open; + /* here, w(head) = w_basic_feats(head) + w_first_order_feats(head) */ + res = (w_D0 + w_D1 + + feat_table->pl[h.start][h.end][h.dir] + + feat_table->lab[h.start][h.end][h.label][h.dir]); + } else { + /* w_closed = w(subd0) + w(subd1) */ + res = w_D0 + w_D1; + } + + return res; +} + + +/*-------------------------------------------------------------------------------------------*/ +void maca_graph_parser_hyperdecoder_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s){ + /** + * Initialize the decoder and build the corresponding hypergraph. 
+ * + */ + int span, start, end, dir, label, gov, dep; + /* ctx */ + int labels_nb = ctx->labels_nb; + int k = ctx->k; + /* sentence */ + int sentence_length = s->l; /* ctx->max_sent_length; */ + /* indices */ + int i; + int k_i; + VertexSignature *vs; + int m; + int j[2] = {0, 0}; + /* min dep count */ + int dep_count; + /* test: default edge label */ + /* int dft_label = maca_tags_get_code(ctx->cfg, "morpho", "fct", "__JOKER__"); */ + int dft_label = maca_alphabet_get_code(ctx->labels_alphabet, "__JOKER__"); + + /* allocation of CLOSEDK, OPENK */ + ctx->CLOSEDK = malloc(sizeof(Vertex***) * sentence_length); + ctx->OPENK = malloc(sizeof(Vertex****) * sentence_length); + if(sentence_length > 0) { + ctx->CLOSEDK[0] = malloc(sizeof(Vertex**) * sentence_length * sentence_length); + ctx->CLOSEDK[0][0] = malloc(sizeof(Vertex*) * sentence_length * sentence_length * 2); + for(start = 0; start < sentence_length; start++) { + ctx->CLOSEDK[start] = ctx->CLOSEDK[0] + start * sentence_length; + for(end = 0; end < sentence_length; end++) { + ctx->CLOSEDK[start][end] = ctx->CLOSEDK[0][0] + ((start * sentence_length) + end) * 2; + } + } + ctx->OPENK[0] = malloc(sizeof(Vertex***) * sentence_length * sentence_length); + ctx->OPENK[0][0] = malloc(sizeof(Vertex**) * sentence_length * sentence_length * 2); + ctx->OPENK[0][0][0] = malloc(sizeof(Vertex*) * sentence_length * sentence_length * 2 * ctx->labels_nb); + for(start = 0; start < sentence_length; start++) { + ctx->OPENK[start] = ctx->OPENK[0] + start * sentence_length; + for(end = 0; end < sentence_length; end++) { + ctx->OPENK[start][end] = ctx->OPENK[0][0] + ((start * sentence_length) + end) * 2; + for(dir = 0; dir < 2; dir++) { + ctx->OPENK[start][end][dir] = ctx->OPENK[0][0][0] + ((((start * sentence_length) + end) * 2) + dir) * ctx->labels_nb; + } + } + } + } + /* ensure CLOSEDK and OPENK are clean */ + for(start=0; start < sentence_length; start++){ + for(end=0; end < sentence_length; end++){ + for(dir=0; dir<2; dir++){ + 
/* closed */ + ctx->CLOSEDK[start][end][dir] = NULL; + /* open */ + for(label=0; label<labels_nb; label++){ + ctx->OPENK[start][end][dir][label] = NULL; + } + } + } + } + + /* allocation of CDERIV, ODERIV */ + ctx->CDERIV = malloc(sizeof(vec_Dbp***) * sentence_length); + ctx->ODERIV = malloc(sizeof(vec_Dbp****) * sentence_length); + if(sentence_length > 0) { + ctx->CDERIV[0] = malloc(sizeof(vec_Dbp**) * sentence_length * sentence_length); + ctx->CDERIV[0][0] = malloc(sizeof(vec_Dbp*) * sentence_length * sentence_length * 2); + for(start = 0; start < sentence_length; start++) { + ctx->CDERIV[start] = ctx->CDERIV[0] + start * sentence_length; + for(end = 0; end < sentence_length; end++) { + ctx->CDERIV[start][end] = ctx->CDERIV[0][0] + ((start * sentence_length) + end) * 2; + } + } + ctx->ODERIV[0] = malloc(sizeof(vec_Dbp***) * sentence_length * sentence_length); + ctx->ODERIV[0][0] = malloc(sizeof(vec_Dbp**) * sentence_length * sentence_length * 2); + ctx->ODERIV[0][0][0] = malloc(sizeof(vec_Dbp*) * sentence_length * sentence_length * 2 * ctx->labels_nb); + for(start = 0; start < sentence_length; start++) { + ctx->ODERIV[start] = ctx->ODERIV[0] + start * sentence_length; + for(end = 0; end < sentence_length; end++) { + ctx->ODERIV[start][end] = ctx->ODERIV[0][0] + ((start * sentence_length) + end) * 2; + for(dir = 0; dir < 2; dir++) { + ctx->ODERIV[start][end][dir] = ctx->ODERIV[0][0][0] + ((((start * sentence_length) + end) * 2) + dir) * ctx->labels_nb; + } + } + } + } + /* ensure CLOSEDK and OPENK are clean */ + for(start=0; start < sentence_length; start++){ + for(end=0; end < sentence_length; end++){ + for(dir=0; dir<2; dir++){ + /* closed */ + ctx->CDERIV[start][end][dir] = NULL; + /* open */ + for(label=0; label<labels_nb; label++){ + ctx->ODERIV[start][end][dir][label] = NULL; + } + } + } + } + + + /* allocation of the hypergraph */ + /* items in topological order */ + for(span=1; span < sentence_length; span++){ + /* fprintf(stderr, "span: %i\n", span); */ + 
+ /* low-level management of backstars */ + /* open */ + int bs_size_open = span; /* maximal size of backstar */ + vec_Vertex *tails_open = alloc_vec_Vertex(2*bs_size_open); + /* closed */ + int bs_size_closed = span * labels_nb; /* maximal size of backstar */ + vec_Vertex *tails_closed = alloc_vec_Vertex(2*bs_size_closed); + /* end low-level */ + int length_class; + int dep_count; + for(start=0; start+span < sentence_length; start++){ + end = start + span; + length_class = maca_graph_parser_dep_count_table_compute_length_class(start,end); + + /* create open items */ + for(dir=0; dir<2; dir++){ + gov = (dir == la) ? end : start; + dep = (dir == la) ? start : end; + + for(label=0; label<labels_nb; label++){ + /* min dep count filter */ + /* TODO: try (unlabelled) length dictionary filter */ + dep_count = ctx->dep_count_table[s->pos[gov]][s->pos[dep]][label][length_class][dir]; + if((dep_count >= ctx->min_dep_count) || + (label == dft_label)){ + /* reset backstar */ + tails_open->num = 0; /* tail vertices */ + i = 0; /* actual size of backstar */ + for(m=start; m<end; m++){ + if(((ctx->CLOSEDK[start][m][ra] != NULL) || (m == start)) && + ((ctx->CLOSEDK[m+1][end][la] != NULL) || (m+1 == end))){ + vec_Vertex_append(tails_open, ctx->CLOSEDK[start][m][ra]); /* start m */ + vec_Vertex_append(tails_open, ctx->CLOSEDK[m+1][end][la]); /* m+1 end */ + i++; + } + } + if(tails_open->num > 0){ /* non-empty backstar */ + Vertex *v = alloc_vertex(); + /* vertex signature */ + vs = alloc_vertexSignature(); + init_hyperopen(vs, start, end, label, dir); + init_vertex(v, tails_open, i, vs); + /* fprintf(stderr, "ctx->OPENK[%d][%d][%d][%d]\n", start, end, label, dir); */ + ctx->OPENK[start][end][dir][label] = v; + /* derivations */ + ctx->ODERIV[start][end][dir][label] = alloc_vec_Dbp(k); + for(k_i=0; k_i<k; k_i++){ + vec_Dbp_append(ctx->ODERIV[start][end][dir][label], alloc_derivBP(MINF, NULL, j)); + } + } + } + } /* end for label */ + } /* end for dir */ + + /* create closed items */ 
+ /* ra */ + /* reset backstar */ + tails_closed->num = 0; /* tail vertices */ + i = 0; /* actual size of backstar */ + for(m=start+1; m<=end; m++){ + for(label=0; label<labels_nb; label++){ + if((ctx->OPENK[start][m][ra][label] != NULL) && + ((ctx->CLOSEDK[m][end][ra] != NULL) || (m == end))){ + vec_Vertex_append(tails_closed, ctx->OPENK[start][m][ra][label]); + vec_Vertex_append(tails_closed, ctx->CLOSEDK[m][end][ra]); + i++; + } + } + } + if(tails_closed->num > 0){ /* backstar is non empty */ + Vertex *v = alloc_vertex(); + /* vertex signature */ + vs = alloc_vertexSignature(); + init_hyperclosed(vs, start, end, ra); + init_vertex(v, tails_closed, i, vs); + ctx->CLOSEDK[start][end][ra] = v; + /* derivations */ + ctx->CDERIV[start][end][ra] = alloc_vec_Dbp(k); + for(k_i=0; k_i<k; k_i++){ + vec_Dbp_append(ctx->CDERIV[start][end][ra], alloc_derivBP(MINF, NULL, j)); + } + } + + + /* la */ + tails_closed->num = 0; /* tail vertices */ + i = 0; /* actual size of backstar */ + for(m=start; m<end; m++){ + for(label=0; label<labels_nb; label++){ + if((ctx->OPENK[m][end][la][label] != NULL) && + ((ctx->CLOSEDK[start][m][la] != NULL) || (m == start))){ + vec_Vertex_append(tails_closed, ctx->OPENK[m][end][la][label]); + vec_Vertex_append(tails_closed, ctx->CLOSEDK[start][m][la]); + i++; + } + } + } + if(tails_closed->num > 0){ + Vertex *v = alloc_vertex(); + /* vertex */ + vs = alloc_vertexSignature(); + init_hyperclosed(vs, start, end, la); + init_vertex(v, tails_closed, i, vs); + ctx->CLOSEDK[start][end][la] = v; + /* derivations */ + ctx->CDERIV[start][end][la] = alloc_vec_Dbp(k); + for(k_i=0; k_i<k; k_i++){ + vec_Dbp_append(ctx->CDERIV[start][end][la], alloc_derivBP(MINF, NULL, j)); + } + } + } /* end for start */ + free_vec_Vertex(tails_open); + free_vec_Vertex(tails_closed); + } +} + + +void maca_graph_parser_hyperdecoder_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s){ + /** + * Cleanup the decoder and remove the corresponding hypergraph. 
+ * + */ + /* global parameters */ + int labels_nb = ctx->labels_nb; + int k = ctx->k; + /* sentence */ + int sentence_length = s->l; /* ctx->max_sent_length; */ + /* indices */ + int span; + int start; + int end; + int label; /*label*/ + int k_i; + + /* items in reverse topological order */ + for(span=sentence_length-1; span > 0; span--){ + for(start=0; start+span < sentence_length; start++){ + end = start + span; + + /* free closed items */ + /* ra */ + /* derivations */ + if(ctx->CDERIV[start][end][ra] != NULL){ + for(k_i=0; k_i<k; k_i++){ + if(ctx->CDERIV[start][end][ra]->elts[k_i]){ + free_derivBP(ctx->CDERIV[start][end][ra]->elts[k_i]); + ctx->CDERIV[start][end][ra]->elts[k_i] = NULL; + } + } + free_vec_Dbp(ctx->CDERIV[start][end][ra]); + ctx->CDERIV[start][end][ra] = NULL; + } + /* vertex */ + if(ctx->CLOSEDK[start][end][ra] != NULL){ + free_vertex(ctx->CLOSEDK[start][end][ra]); + ctx->CLOSEDK[start][end][ra] = NULL; + } + + /* la */ + /* derivations */ + if(ctx->CDERIV[start][end][la] != NULL){ + for(k_i=0; k_i<k; k_i++){ + if(ctx->CDERIV[start][end][la]->elts[k_i]){ + free_derivBP(ctx->CDERIV[start][end][la]->elts[k_i]); + ctx->CDERIV[start][end][la]->elts[k_i] = NULL; + } + } + free_vec_Dbp(ctx->CDERIV[start][end][la]); + ctx->CDERIV[start][end][la] = NULL; + } + /* vertex */ + if(ctx->CLOSEDK[start][end][la] != NULL){ + free_vertex(ctx->CLOSEDK[start][end][la]); + ctx->CLOSEDK[start][end][la] = NULL; + } + + /* free open items */ + for(label=0; label<labels_nb; label++){ + /* ra */ + /* derivations */ + if(ctx->ODERIV[start][end][ra][label] != NULL){ + for(k_i=0; k_i<k; k_i++){ + if(ctx->ODERIV[start][end][ra][label]->elts[k_i]){ + free_derivBP(ctx->ODERIV[start][end][ra][label]->elts[k_i]); + ctx->ODERIV[start][end][ra][label]->elts[k_i] = NULL; + } + } + free_vec_Dbp(ctx->ODERIV[start][end][ra][label]); + ctx->ODERIV[start][end][ra][label] = NULL; + } + if(ctx->OPENK[start][end][ra][label] != NULL){ + free_vertex(ctx->OPENK[start][end][ra][label]); + 
ctx->OPENK[start][end][ra][label] = NULL; + } + + /* la */ + /* derivations */ + if(ctx->ODERIV[start][end][la][label] != NULL){ + for(k_i=0; k_i<k; k_i++){ + if(ctx->ODERIV[start][end][la][label]->elts[k_i]){ + free_derivBP(ctx->ODERIV[start][end][la][label]->elts[k_i]); + ctx->ODERIV[start][end][la][label]->elts[k_i] = NULL; + } + } + free_vec_Dbp(ctx->ODERIV[start][end][la][label]); + ctx->ODERIV[start][end][la][label] = NULL; + } + free_vertex(ctx->OPENK[start][end][la][label]); + ctx->OPENK[start][end][la][label] = NULL; + } + } + } + + /* cleanup indexing arrays */ + if(sentence_length > 0){ + /* free CLOSEDK, OPENK */ + free(ctx->OPENK[0][0][0]); + free(ctx->OPENK[0][0]); + free(ctx->OPENK[0]); + free(ctx->OPENK); + free(ctx->CLOSEDK[0][0]); + free(ctx->CLOSEDK[0]); + free(ctx->CLOSEDK); + /* free CDERIV, ODERIV */ + free(ctx->ODERIV[0][0][0]); + free(ctx->ODERIV[0][0]); + free(ctx->ODERIV[0]); + free(ctx->ODERIV); + free(ctx->CDERIV[0][0]); + free(ctx->CDERIV[0]); + free(ctx->CDERIV); + } + +} + + +/*--------------------------------------------------------------------------------------------*/ +void get_candidates(maca_graph_parser_ctx *ctx, Vertex *v, int k, + heap *cand, cand_id ****seen_cand_ids, + maca_graph_parser_feature_table *feat_table){ + /** + * Get the top k of 1-best derivations for v along each hyperarc. 
+ * + * Parameters + * ---------- + * cand: heap storing the result + */ + + if(v==NULL) + return; + + /* DEBUG */ + /* + printf("candidates for "); + print_vertexSignature(v->vsign); + printf(":\n"); + */ + /* DEBUG */ + + /* 11: temp = set(<e,\bold{1}> for e \in BS(v)) */ + /* 12: cand[v] = top_k_elements(temp) */ /* (optional) prune away useless candidates */ + /* 13: heapify(cand[v]) */ + int j[2] = {0, 0}; /* 1-best vector in 0-based notation */ + + int i; + for(i=0; i<v->bs_size; i++){ + cand_id *c = seen_cand_ids[i][j[0]][j[1]] = alloc_cand_id(i, j); + float w = eval_weight(ctx, v->bs[i], j, feat_table); + + /* DEBUG */ + /* + printf("\t"); + if(v->bs[i]->tail[0] != NULL) + print_vertexSignature(v->bs[i]->tail[0]->vsign); + if((v->bs[i]->tail[0] != NULL) && + (v->bs[i]->tail[1] != NULL)) + printf(" + "); + if(v->bs[i]->tail[1] != NULL) + print_vertexSignature(v->bs[i]->tail[1]->vsign); + printf(" : %f\n", w); + */ + /* DEBUG */ + + heap_insert_nmax(cand, w, c); + } +} + + +void append_next(maca_graph_parser_ctx *ctx, heap *cand, DerivBP *p, + Vertex *v, cand_id ****seen_cand_ids, + maca_graph_parser_feature_table *feat_table){ + /** + * + * Parameters + * ---------- + * cand: candidate set of derivations implemented as a priority queue + * + * p: array cell in which the next best derivation will be stored + * (should be: vector of i-best derivations, i < k, to which the + * next best derivation will be appended) + * + * seen_cand_ids: array of already seen cand ids, i.e. 
neighbours in the cube + * + */ + + cand_id *c; + float w; + Hyperarc *e; + int j[2]; + int i; + int ip; + int jp[2]; + vec_Dbp *vDbp_Ti; + cand_id *cp; + float wp; + + /* 9: <e,j> <- extract_min(cand) */ + heap_sort_nmax(cand); + c = heap_get(cand, 0); + w = heap_get_value(cand, 0); + heap_extract_min(cand); + /* c: (bs_i, j0, j1) */ + e = v->bs[c->bs_i]; + j[0] = c->j[0]; + j[1] = c->j[1]; + + /* 10: append <e,j> to p */ + p->weight = w; + p->e = e; + p->j[0] = j[0]; + p->j[1] = j[1]; + + /* 11: for i=1; i<|e|; i++ do */ /* add the |e| neighbours */ + /* 12: j' = j + b^i */ + /* 13: if (j'_i <= |^D(T_i(e))| and <e,j'> \notin cand then */ + /* 14: insert(cand, <e,j'>) */ /* add to heap */ + + /* 11 */ + for(i=0; i<2; i++){ + /* 12 */ + for(ip=0; ip<2; ip++){ + jp[ip] = (i == ip) ? j[i]+1 : j[i]; + } + /* 13 */ + if(e->tail[i] == NULL){ + /* NULLs don't have 2nd-bests */ + continue; + } + vDbp_Ti = get_vec_Dbp(ctx, e->tail[i]); + /* in the current implementation, the first test only guarantees + we are not off limits (1) */ + if(jp[i] < vDbp_Ti->num){ + /* if the candidate has not already been used */ + if(seen_cand_ids[c->bs_i][jp[0]][jp[1]] == NULL){ + cp = seen_cand_ids[c->bs_i][jp[0]][jp[1]] = alloc_cand_id(c->bs_i, jp); + + /* (1): here is the real checking of j'_i <= |^D(T_i(e))| */ + if (vDbp_Ti->elts[jp[i]]->e != NULL){ + /* 14 */ + wp = eval_weight(ctx, e, jp, feat_table); + heap_insert_nmax(cand, wp, cp); /* add to heap */ + } + } + } + } + +} + + +void find_kbest(maca_graph_parser_ctx *ctx, Vertex *v, int k, maca_graph_parser_feature_table *feat_table){ + /** + * Find the k best derivations for v. 
+ */ + + int i; + int j[2] = {-1, -1}; + int ja; + int jb; + heap *cand; + cand_id ****seen_cand_ids; /* [bs_i][j0][j1] */ + vec_Dbp *vDbp_v; + + if(v == NULL) + return; + + vDbp_v = get_vec_Dbp(ctx, v); + /* alloc and init the array of already seen cand_ids */ + seen_cand_ids = malloc(sizeof(cand_id ***) * (v->bs_size)); + if(seen_cand_ids == NULL){ + fprintf(stderr, "Mem alloc error\n"); + exit(1); + } + if(v->bs_size > 0){ + seen_cand_ids[0] = malloc(sizeof(cand_id **) * (v->bs_size) * k); + if(seen_cand_ids[0] == NULL){ + fprintf(stderr, "Mem alloc error\n"); + exit(1); + } + seen_cand_ids[0][0] = malloc(sizeof(cand_id *) * (v->bs_size) * k * k); + if(seen_cand_ids[0][0] == NULL){ + fprintf(stderr, "Mem alloc error\n"); + exit(1); + } + for(i=0; i<v->bs_size; i++){ + seen_cand_ids[i] = seen_cand_ids[0] + i * k; + for(ja=0; ja<k; ja++){ + seen_cand_ids[i][ja] = seen_cand_ids[0][0] + (i * k + ja) * k; + for(jb=0; jb<k; jb++){ + seen_cand_ids[i][ja][jb] = NULL; + } + } + } + } + + /* 6: get_candidates(v, k) */ /* initialize the heap */ + cand = heap_create(k, MINF); + get_candidates(ctx, v, k, cand, seen_cand_ids, feat_table); + + /* 7: while | \bold{^D}(v) | < k and | cand[v] | > 0 do + 8: append_next(cand[v], \bold{^D}(v)) */ + for(i=0; i<k; i++){ + if(heap_size(cand) <= 0) + break; + append_next(ctx, cand, vDbp_v->elts[i], v, seen_cand_ids, feat_table); + } + + /* cleanup */ + if(seen_cand_ids){ + for(i=0; i<v->bs_size; i++){ + if(seen_cand_ids[i]){ + for(ja=0; ja<k; ja++){ + if(seen_cand_ids[i][ja]){ + for(jb=0; jb<k; jb++){ + if(seen_cand_ids[i][ja][jb]){ + free_cand_id(seen_cand_ids[i][ja][jb]); + seen_cand_ids[i][ja][jb] = NULL; + } + } + } + } + } + } + free(seen_cand_ids[0][0]); + free(seen_cand_ids[0]); + free(seen_cand_ids); + seen_cand_ids = NULL; + } + heap_destroy(cand); + + /* FIXME: move this info msg where it belongs */ + /* + if (ctx->verbose_flag > 4) + fprintf(stderr, "\tctx->OPENK[%d][%d][%d][%d](%f) = ctx->CLOSEDK[%d][%d][1], 
ctx->CLOSEDK[%d][%d][0]\n", + start, end, dir, label, (score_max + feat_table->pl[start][end][dir] + w), + start, m_argmax, m_argmax+1, end); + */ + +} + + +void find_all_kbest(int k, + maca_graph_parser_ctx *ctx, + maca_graph_parser_sentence *s, + maca_graph_parser_feature_table *feat_table){ + /** + * Find the k best analyses for a sentence. + */ + + int sentence_length = s->l; + int labels_nb = ctx->labels_nb; + Vertex *head; + + /* sanitary check */ + if (k > ctx->k){ + fprintf(stderr, "ERR: k > ctx->k\n"); + } + + /* 2: for v \in V in topological order do : + 3: find_kbest(v,k) + */ + int span; + for(span = 1; span < sentence_length; span++){ + int start; + for(start = 0; start+span < sentence_length; start++){ + int end = start + span; + if(ctx->verbose_flag > 4) fprintf(stderr, "start = %d end = %d\n",start,end); + + int dir; + for(dir=0; dir<2; dir++){ + /* OPEN table */ + int label; + for(label=0; label<labels_nb; label++){ + head = ctx->OPENK[start][end][dir][label]; + find_kbest(ctx, head, k, feat_table); + } + + /* CLOSED table */ + head = ctx->CLOSEDK[start][end][dir]; + find_kbest(ctx, head, k, feat_table); + + } /* end for dir */ + } /* end for start */ + } /* end for start */ +} + + +void maca_graph_parser_set_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, int ki){ + /** + * Annotate s with its ki-th parse. + * + * FIXME: this is a dirty temporary hack. + */ + int i; + + /* backport ki-th best from kb to s proper */ + for(i=1; i<s->l; i++){ + s->gov[i] = s->kb->gov[i][ki]; + s->label[i] = s->kb->label[i][ki]; + } + s->score = s->kb->score[ki]; +} + + +void maca_graph_parser_kbest_output(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s){ + /** + * Output the kbest solutions found. 
+ */ + + int ki; + DerivBP *bestSpan = NULL; + vec_Dbp *vd; + + /* ensure there is a kb to store the k-best parses */ + if(s->kb == NULL){ + s->kb = maca_graph_parser_allocate_sentence_kbest(ctx); + } + /* reset kb */ + maca_graph_parser_reset_sentence_kbest(s->kb); + + /* store the k-best parses from the forest into kb */ + vd = ctx->CDERIV[0][s->l-1][ra]; + if(vd){ + for(ki=0; (ki < vd->num) && (ki < ctx->k); ki++){ + bestSpan = vd->elts[ki]; + if(bestSpan->e != NULL){ + /* in the current implementation, e == NULL + iff there is no ki-th best solution */ + create_closedBP(ctx, bestSpan, s, ki); + s->kb->score[ki] = bestSpan->weight; + } + } + } + + /* backport 1-st best from kb to s proper */ + maca_graph_parser_set_parse(ctx, s, 0); +} + + +feature_counter_array *extract_features_from_kbest_parse_fca(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, feature_counter_array *a, int ki){ + /** + * Extract features from the ki-th best parse of s. + * + * Adapted from maca_graph_parser_feature_counter_array.c:extract_features_from_parse_fca(). + * + * FIXME: find a way to share code with the original function. + * FIXME: find a way to directly use the features used for decoding, + * which are currently computed and forgotten in feature_table. 
+ */ + + if(s->kb == NULL){ + return a; /* FIXME: raise an error */ + } + + if(a == NULL){ + a = allocate_feature_counter_array(ctx, (s->l - 1)); + } else { /* reset a */ + free_feature_counter_array(a); + a = allocate_feature_counter_array(ctx, (s->l - 1)); + } + + feature_counter *c; + feat_vector *v; + int dep, gdep, sbl; + int g; + int i; + + if(ctx->basic_features){ + c = a->basic_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + v = basic(s, ctx, s->kb->gov[dep][ki], dep, v); + feature_counter_update_vector(c, v); + } + free_feat_vector(v); + v = NULL; + } + + if(ctx->first_features){ + c = a->first_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + v = first(s, ctx, s->kb->gov[dep][ki], dep, s->kb->label[dep][ki], v); + feature_counter_update_vector(c, v); + } + free_feat_vector(v); + v = NULL; + } + + if(ctx->sibling_features){ + c = a->sibling_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + // MM + sbl = -1; + g = s->kb->gov[dep][ki]; // governor + /* wanted sibling: child of g in [g..dep] that is closest to dep */ + if (g < dep) { /* ra */ + for(i=dep-1; i > g; i--){ + if(g == s->kb->gov[i][ki]){ // && (dep != i) + sbl = i; + break; + } + } + } else { /* la */ + for(i=dep+1; i < g; i++){ + if(g == s->kb->gov[i][ki]){ // (dep != i) && + sbl = i; + break; + } + } + } + /* sbl == -1 if no sibling */ + v = sibling(s, ctx, s->kb->gov[dep][ki], dep, sbl, s->kb->label[dep][ki], v); + feature_counter_update_vector(c, v); + } + free_feat_vector(v); + v = NULL; + } + + if(ctx->grandchildren_features){ + c = a->grandchildren_feature_counter; + v = NULL; + for(dep=1; dep < s->l; dep++){ + // MM + g = s->kb->gov[dep][ki]; + if (g < dep){ /* ra */ + /* cmi: inside [g;dep] */ + gdep = -1; + for(i=dep-1; i > g; i--){ + if(s->kb->gov[i][ki] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->kb->label[dep][ki], v); + feature_counter_update_vector(c, v); + /* cmo: outside [g;dep] */ + gdep = -1; + 
for(i=dep+1; i<s->l; i++){ + if(s->kb->gov[i][ki] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->kb->label[dep][ki], v); + feature_counter_update_vector(c, v); + } else { /* la */ + /* cmi: inside [dep;g] */ + gdep = -1; + for(i=dep+1; i < g; i++){ + if(s->kb->gov[i][ki] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->kb->label[dep][ki], v); + feature_counter_update_vector(c, v); + /* cmo: outside [dep;g] */ + gdep = -1; + for(i=dep-1; i>0; i--){ + if(s->kb->gov[i][ki] == dep){ + gdep = i; + } + } + v = grandchildren(s, ctx, g, dep, gdep, s->kb->label[dep][ki], v); + feature_counter_update_vector(c, v); + } + } + free_feat_vector(v); + v = NULL; + } + + return a; +} + + +int maca_graph_parser_rescore_kbest(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s){ + /** + * Rescore the k-best parses of s with higher-order factors. + * + * Currently uses second-order factors. + * + * FIXME: Currently returns the index of the best parse. 
+ */ + + vec_Dbp *derivs; + int ki; + feature_counter_array *hyp_feature_counter_array; + int nb_cands; + float score_cand; + float score_max; + int index_max; + + /* init */ + nb_cands = 0; + score_max = MINF; + /* activate 2nd order features */ + int gra_feats = ctx->grandchildren_features; + int sib_feats = ctx->sibling_features; + ctx->grandchildren_features = 1; + ctx->sibling_features = 1; + /* end activate */ + hyp_feature_counter_array = allocate_feature_counter_array(ctx, (s->l - 1)); + + /* */ + derivs = ctx->CDERIV[0][s->l-1][ra]; + if(derivs){ + for(ki=0; (ki < derivs->num) && (ki < ctx->k); ki++){ + if(derivs->elts[ki]->e != NULL){ + /* rescore */ + hyp_feature_counter_array = extract_features_from_kbest_parse_fca(ctx, s, hyp_feature_counter_array, ki); + score_cand = score_feature_counter_array(hyp_feature_counter_array, ctx->model2); + /* DEBUG */ + /* fprintf(stderr, "ki = %d: score = %f\n", ki, score_cand); */ + /* DEBUG */ + + nb_cands++; + + if(nb_cands == 1){ + index_max = ki; + score_max = score_cand; + } else { + if(score_cand > score_max){ + index_max = ki; + score_max = score_cand; + } + } + } + } /* end for ki */ + } + + /* teardown feat counter */ + free_feature_counter_array(hyp_feature_counter_array); + /* restore 2nd order features */ + ctx->grandchildren_features = gra_feats; + ctx->sibling_features = sib_feats; + + return index_max; +} + + +void maca_graph_parser_hyperdecoder_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table){ + /** + * Parse s with the hypergraph decoder. 
+ */ + + find_all_kbest(ctx->k, ctx, s, ctx->feature_table); + /* annotate the sentence with the k-best structures */ + maca_graph_parser_kbest_output(ctx, s); + /* rescore kbest with higher-order factors */ + if(ctx->model2){ + int index_max = maca_graph_parser_rescore_kbest(ctx, s); + maca_graph_parser_set_parse(ctx, s, index_max); + } +} + +/*-------------------------------------------------------------------------------------------*/ + +void create_closedBP(maca_graph_parser_ctx *ctx, DerivBP *d, maca_graph_parser_sentence *s, int ki){ + /** + * Annotate sentence with the ki-th best parse. + */ + + /* + if(c->dir == la){ + printf("create_closed [%d;(%d);<%d>] from closed [%d;<%d>] and open [%d;<%d>]\n", + c->start, c->breakpoint, c->end, + (c->d)? c->d->start : -1, (c->d)? c->d->end : -1, + (c->u)? c->u->start : -1, (c->u)? c->u->end : -1); + } else { + printf("create_closed [<%d>;(%d);%d] from open [<%d>;%d] and closed [<%d>;%d]\n", + c->start, c->breakpoint, c->end, + (c->u)? c->u->start : -1, (c->u)? c->u->end : -1, + (c->d)? c->d->start : -1, (c->d)? c->d->end : -1); + } + */ + int i; + Vertex *Ti_e; + DerivBP *DbpTi_e; + + if(d->e == NULL){ + fprintf(stderr, "hyperdecoder.c:closedBP(): weird but ignored at the moment\n"); + return; + } + /* recursive calls along each component */ + /* should be: one create_closedBP(), one create_openBP() */ + for(i=0; i<2; i++){ + Ti_e = d->e->tail[i]; + if(Ti_e){ + DbpTi_e = get_vec_Dbp(ctx, Ti_e)->elts[d->j[i]]; + if(Ti_e->vsign->type == TYP_CLOSED){ + create_closedBP(ctx, DbpTi_e, s, ki); + } else if(Ti_e->vsign->type == TYP_OPEN){ + create_openBP(ctx, DbpTi_e, s, ki); + } else { + fprintf(stderr, "Not implemented yet\n"); + } + } + } +} + +/*-------------------------------------------------------------------------------------------*/ + +void create_openBP(maca_graph_parser_ctx *ctx, DerivBP *d, maca_graph_parser_sentence *s, int ki){ + /** + * Annotate sentence with the ki-th best parse. 
+ */ + + /* + if (o->dir == la){ + printf("create_open [%d;<%d>] from left closed [<%d>;%d] and right closed [%d;<%d>]\n", + o->start, o->end, + (o->left)? o->left->start : -1, (o->left)? o->left->end : -1, + (o->right)? o->right->start : -1, (o->right)? o->right->end : -1); + } else { + printf("create_open [<%d>;%d] from left closed [<%d;%d>] and right closed [<%d;%d>]\n", + o->start, o->end, + (o->left)? o->left->start : -1, (o->left)? o->left->end : -1, + (o->right)? o->right->start : -1, (o->right)? o->right->end : -1); + } + */ + SgOpen o; + int gov; + int dep; + int i; + Vertex *Ti_e; + DerivBP *DbpTi_e; + + + if(d->e == NULL){ + fprintf(stderr, "hyperdecoder.c:openBP(): weird but ignored at the moment\n"); + return; + } + /* update sentence */ + o = d->e->head->vsign->open; + gov = (o.dir == la) ? o.end : o.start; + dep = (o.dir == la) ? o.start : o.end; + s->kb->gov[dep][ki] = gov; + s->kb->label[dep][ki] = o.label; + + /* recursive calls along each component */ + /* should be: two create_closedBP() */ + for(i=0; i<2; i++){ + Ti_e = d->e->tail[i]; + if(Ti_e){ + DbpTi_e = get_vec_Dbp(ctx, Ti_e)->elts[d->j[i]]; + if(Ti_e->vsign->type == TYP_CLOSED){ + create_closedBP(ctx, DbpTi_e, s, ki); + } else if(Ti_e->vsign->type == TYP_OPEN){ + create_openBP(ctx, DbpTi_e, s, ki); + } else { + fprintf(stderr, "Not implemented yet\n"); + } + } + } +} diff --git a/maca_graph_parser/maca_graph_parser_hyperdecoder.h b/maca_graph_parser/maca_graph_parser_hyperdecoder.h new file mode 100644 index 0000000000000000000000000000000000000000..6c12295ca9d446804dac0029f30f8d74221b9a2f --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_hyperdecoder.h @@ -0,0 +1,59 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. 
+ + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_DECODER_K__ +#define __MACA_GRAPH_PARSER_DECODER_K__ + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_graph_parser.h" +#include "maca_graph_parser_sentence.h" + +/* kbest */ +#include "maca_graph_parser_heapq.h" +/* #include "maca_graph_parser_hypergraph.h" */ /* included in maca_graph_parser.h as a dirty hack */ + + +typedef struct { + int bs_i; /* rank of edge in the head vertex' backstar */ + int j[2]; /* */ +} cand_id; + +/* was: definition of vec_Dbp, moved to maca_graph_parser.h + as a dirty hack */ + +#ifdef __cplusplus +extern "C"{ +#endif + + +void maca_graph_parser_hyperdecoder_init(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s); +void maca_graph_parser_hyperdecoder_cleanup(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s); +void maca_graph_parser_hyperdecoder_parse(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, maca_graph_parser_feature_table *feat_table); + +void create_openBP(maca_graph_parser_ctx *ctx, DerivBP *d, maca_graph_parser_sentence *s, int ki); +void create_closedBP(maca_graph_parser_ctx *ctx, DerivBP *d, maca_graph_parser_sentence *s, int ki); + +#ifdef __cplusplus +} +#endif + + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_hypergraph.c 
b/maca_graph_parser/maca_graph_parser_hypergraph.c new file mode 100644 index 0000000000000000000000000000000000000000..a8694f9ecc76685adf8e515b73420f6a3fa0dc9c --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_hypergraph.c @@ -0,0 +1,323 @@ +/** + * Hypergraph. + */ + +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> + +#include "maca_graph_parser_hypergraph.h" + + +/** + * Vertex signature + */ + +VertexSignature *alloc_vertexSignature(){ + /** + * Allocate a vertex signature. + */ + + VertexSignature *v = malloc(sizeof(VertexSignature)); + if(v == NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + + return v; +} + +void free_vertexSignature(VertexSignature *v){ + /** + * + */ + if(!v) + return; + + free(v); +} + + +void print_vertexSignature(VertexSignature *v){ + /** + * Print a vertex signature to stdout. + */ + if(v->type == TYP_OPEN){ + printf("OPEN[%d][%d][%d][%d]", v->open.start, v->open.end, v->open.dir, v->open.label); + } else if (v->type == TYP_CLOSED){ + printf("CLOSED[%d][%d][%d]", v->closed.start, v->closed.end, v->closed.dir); + } else if (v->type == TYP_CLOSED2){ + printf("CLOSED2[%d][%d][%d][%d]", v->closed2.start, v->closed2.end, v->closed2.dir, v->closed2.breakpoint); + } else { + printf("Not implemented yet\n"); + } +} + + +void init_hyperopen(VertexSignature *vs, int start, int end, int label, int dir){ + /** + * + */ + + if(vs == NULL){ + fprintf(stderr, "Cannot set NULL vertex signature\n"); + exit(1); + } + vs->open.type = TYP_OPEN; + vs->open.start = start; + vs->open.end = end; + vs->open.dir = dir; + vs->open.label = label; +} + + +void init_hyperclosed(VertexSignature *vs, int start, int end, int dir){ + /** + * + */ + + if(vs == NULL){ + fprintf(stderr, "Cannot set NULL vertex signature\n"); + exit(1); + } + vs->closed.type = TYP_CLOSED; + vs->closed.start = start; + vs->closed.end = end; + vs->closed.dir = dir; +} + + +void init_hyperclosed2(VertexSignature *vs, int start, int end, int 
breakpoint, int dir){ + /** + * + */ + + if(vs == NULL){ + fprintf(stderr, "Cannot set NULL vertex signature\n"); + exit(1); + } + vs->closed2.type = TYP_CLOSED2; + vs->closed2.start = start; + vs->closed2.end = end; + vs->closed2.dir = dir; + vs->closed2.breakpoint = breakpoint; +} + + +/** + * Vertex + */ + +Vertex *alloc_vertex(){ + /** + * Allocate a vertex whose tail tails has size bs_size. + */ + + Vertex *v = malloc(sizeof(Vertex)); + if(v == NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + /* vertex signature */ + v->vsign = NULL; + /* backstar */ + v->bs = NULL; + v->bs_size = 0; + + return v; +} + + +void init_vertex(Vertex *v, vec_Vertex *tails, size_t bs_size, VertexSignature *vs){ + /** + * Init a vertex with a signature and a backstar. + * TODO: s/vec_Vertex *tails, size_t bs_size/vec_Hyperarc *backstar + */ + + /* vertex signature */ + v->vsign = vs; + /* backstar */ + v->bs = malloc(bs_size * sizeof(Hyperarc *)); + Vertex *t[2]; + int i; + for(i=0; i<bs_size; i++){ + t[0] = tails->elts[2*i]; + t[1] = tails->elts[2*i+1]; + v->bs[i] = alloc_hyperarc(t, v); + } + v->bs_size = bs_size; +} + + +void free_vertex(Vertex *v){ + /** + * + */ + int i; + + if(!v) + return; + + free_vertexSignature(v->vsign); + v->vsign = NULL; + + for(i=0; i<v->bs_size; i++){ + free_hyperarc(v->bs[i]); + v->bs[i] = NULL; + } + free(v->bs); + v->bs = NULL; + + free(v); +} + + +/** + * Hyperarc stuff + */ + +Hyperarc *alloc_hyperarc(Vertex *tail[2], Vertex *head){ + /** + * Allocate an hyperarc. 
+ * + * Parameters + * ---------- + * + */ + Hyperarc *e = malloc(sizeof(Hyperarc)); + if(e == NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + e->tail[0] = tail[0]; + e->tail[1] = tail[1]; + e->head = head; + return e; +} + +void free_hyperarc(Hyperarc *e){ + /** + * + */ + if (!e) + return; + + /* free e and NULLify its content */ + e->head = NULL; + e->tail[0] = NULL; + e->tail[1] = NULL; + free(e); +} + +/** + * Derivation stuff + */ + +Derivation *alloc_derivation(float weight, Hyperarc *e, Derivation *subd[2]){ + /** + * + */ + Derivation *d = malloc(sizeof(Derivation)); + if(d==NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + d->weight = weight; + d->e = e; + d->subd[0] = subd[0]; + d->subd[1] = subd[1]; + return d; +} + +void free_derivation(Derivation *d){ + /** + * + */ + if (!d) + return; + + d->e = NULL; + free(d); +} + +DerivBP *alloc_derivBP(float weight, Hyperarc *e, int j[2]){ + /** + * Allocate a derivation with backpointers. + * + * Parameters + * ---------- + * + */ + DerivBP *d = malloc(sizeof(DerivBP)); + if(d == NULL){ + fprintf(stderr, "memory allocation problem\n"); + exit(1); + } + d->weight = weight; + d->e = e; + d->j[0] = j[0]; + d->j[1] = j[1]; + return d; +} + +void free_derivBP(DerivBP *d){ + /** + * + */ + if (!d) + return; + + d->e = NULL; + free(d); +} + + +/** + * Vector of vertices + */ + +vec_Vertex *alloc_vec_Vertex(int capacity){ + /** + * Allocate a vector of vertices. 
+ */ + vec_Vertex *res = malloc(sizeof(vec_Vertex)); + if(res == NULL){ + fprintf(stderr, "Mem prob\n"); + exit(1); + } + + res->num = 0; + res->capacity = capacity; + res->elts = malloc(capacity * sizeof(Vertex *)); + if(res->elts == NULL){ + fprintf(stderr, "Mem prob\n"); + exit(1); + } + + return res; +} + + +void free_vec_Vertex(vec_Vertex *vv){ + /** + * Free a vector of vertices + */ + if(!vv) + return; + + free(vv->elts); + free(vv); +} + + +void vec_Vertex_append(vec_Vertex *vv, Vertex *v){ + /** + * Append to a vector of vertices + */ + if(vv->num >= vv->capacity){ + fprintf(stderr, "Cannot append: Vector full\n"); + return; + } + vv->elts[vv->num] = v; + vv->num += 1; +} diff --git a/maca_graph_parser/maca_graph_parser_hypergraph.h b/maca_graph_parser/maca_graph_parser_hypergraph.h new file mode 100644 index 0000000000000000000000000000000000000000..1ba689f07ed3bc917389692b45d6df20ee41ba24 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_hypergraph.h @@ -0,0 +1,117 @@ +#ifndef __HYPERGRAPH__ +#define __HYPERGRAPH__ + +#include <stdlib.h> +/** + * Hypergraph + */ +#define TYP_OPEN 0 +#define TYP_CLOSED 1 +#define TYP_CLOSED2 2 + +typedef struct { + int type; /* must not be changed */ + int start; + int end; + int dir; + int label; +} SgOpen; + +typedef struct { + int type; /* must not be changed */ + int start; + int end; + int dir; +} SgClosed; + +typedef struct { + int type; /* must not be changed */ + int start; + int end; + int dir; + int breakpoint; +} SgClosed2; + +/* http://stackoverflow.com/a/18577481 */ +/* http://www.sbin.org/doc/Xlib/chapt_21_app_E.html */ +typedef union _VertexSignature { + int type; /* must not be changed */ + SgOpen open; + SgClosed closed; + SgClosed2 closed2; +} VertexSignature; + +typedef struct _Hyperarc Hyperarc; + +typedef struct { + VertexSignature *vsign; /* vertex signature */ + Hyperarc **bs; /* backstar */ + size_t bs_size; /* backstar size */ +} Vertex; + +struct _Hyperarc { + Vertex *tail[2]; /* tail: 
vector of vertices; here: fixed arity = 2 */ + Vertex *head; /* head */ + /* weight function from R^{|T(e)|} to R */ + int ds_i; /* derivation index in the backstar of e's head vertex */ /* test */ +}; + +/* derivation */ +typedef struct Derivation { + /* vertex v; */ /* == e.head */ + float weight; /* score */ + Hyperarc *e; + struct Derivation *subd[2]; /* vector (here array) of subderivations, one for each vertex in tail */ +} Derivation; + +/* derivation with backpointers */ +typedef struct { + /* vertex v; */ /* == e.head */ + float weight; /* score */ + Hyperarc *e; + int j[2]; /* vector (here array) of subderivation indices, one for each vertex in tail */ +} DerivBP; + +typedef struct { + int num; /* current size */ + int capacity; /* max size */ + Vertex **elts; /* array of elements */ +} vec_Vertex; /* vector of vertices */ + + +#ifdef __cplusplus +extern "C"{ +#endif + + +void get_derivBPs(Vertex *v, DerivBP **result); + +VertexSignature *alloc_vertexSignature(); +void free_vertexSignature(VertexSignature *v); +void print_vertexSignature(VertexSignature *v); +void init_hyperopen(VertexSignature *vs, int start, int end, int label, int dir); +void init_hyperclosed(VertexSignature *vs, int start, int end, int dir); +void init_hyperclosed2(VertexSignature *vs, int start, int end, int breakpoint, int dir); + +Vertex *alloc_vertex(); +void init_vertex(Vertex *v, vec_Vertex *tails, size_t bs_size, VertexSignature *vs); +void free_vertex(Vertex *v); + +Hyperarc *alloc_hyperarc(Vertex *tail[2], Vertex *head); +void free_hyperarc(Hyperarc *e); + +Derivation *alloc_derivation(float weight, Hyperarc *e, Derivation *subd[2]); +void free_derivation(Derivation *d); + +DerivBP *alloc_derivBP(float weight, Hyperarc *e, int j[2]); +void free_derivBP(DerivBP *d); + +vec_Vertex *alloc_vec_Vertex(int capacity); +void free_vec_Vertex(vec_Vertex *vv); +void vec_Vertex_append(vec_Vertex *vv, Vertex *v); + +#ifdef __cplusplus +} +#endif + +#endif diff --git 
a/maca_graph_parser/maca_graph_parser_main.c b/maca_graph_parser/maca_graph_parser_main.c new file mode 100644 index 0000000000000000000000000000000000000000..ea3c6379fda08eb893d756b9fcbc2534c585ba0b --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_main.c @@ -0,0 +1,414 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + Jeremy Auguste <jeremy.auguste@etu.univ-amu.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
+**************************************************************************/ + +#include<getopt.h> +#include<unistd.h> +#include<stdio.h> +#include<string.h> + +#include "maca_common.h" +#include "maca_constants.h" +/* #include <maca_lex.h> */ +#include "maca_graph_parser.h" +#include "maca_graph_parser_model.h" +#include "maca_graph_parser_sentence.h" +#include "maca_graph_parser_decoder.h" +#include "maca_graph_parser_metrics.h" +#include "maca_graph_parser_train.h" +/* #include "maca_graph_parser_conll2007_format.h" */ +#include "maca_graph_parser_features.h" +#include "maca_graph_parser_feature_table.h" + + +void maca_graph_parser_decode_main(maca_graph_parser_ctx * ctx); +void maca_graph_parser_train_main(maca_graph_parser_ctx * ctx); + + +int main(int argc, char **argv) +{ + maca_graph_parser_ctx * ctx; + + /* no argument on command line */ + if(argc == 2 && !strcmp(argv[1], "-h")) + { + //maca_graph_parser_PrintHelpMessage(argv[0]); + exit(0); + } + + /* sinon lecture des options dans la ligne de commande */ + ctx = maca_graph_parser_LoadCTX(argc,argv); + + /* maca_graph_parser_init(ctx); */ + + /* maca_lex_load_cfg(ctx->cfg); */ + + + if(ctx->mode == DECODE_MODE) + maca_graph_parser_decode_main(ctx); + else + if(ctx->mode == TRAIN_MODE) + maca_graph_parser_train_main(ctx); + + maca_graph_parser_free_all(ctx); + return 0; +} + + + +void maca_graph_parser_decode_main(maca_graph_parser_ctx * ctx) +{ + maca_alphabet_array *alpha_array; + int i; + int sent_num; + /*maca_sentence * ms;*/ + + /* model */ + //ctx->model = maca_graph_parser_model_mmap(ctx, ctx->model_file_name); + ctx->model = maca_graph_parser_model_load(ctx, ctx->model_file_name); + /* model2 */ + if(ctx->model2_file_name != NULL){ + ctx->model2 = maca_graph_parser_model_load(ctx, ctx->model2_file_name); + } else { + ctx->model2 = NULL; + } + /* set active feature types for the decoder */ + ctx->min_dep_count = ctx->model->min_dep_count; + ctx->use_lemmas = ctx->model->use_lemmas; + 
ctx->use_full_forms = ctx->model->use_full_forms; + ctx->basic_features = ctx->model->basic_features; + ctx->first_features = ctx->model->first_features; + ctx->grandchildren_features = ctx->model->grandchildren_features; + ctx->sibling_features = ctx->model->sibling_features; + ctx->subcat_features = ctx->model->subcat_features; + + if(ctx->sibling_features || ctx->grandchildren_features) ctx->order = 2; + + /* alphabets */ + /* load alphabets */ + + alpha_array = maca_alphabet_array_new_from_file(ctx->alphabet_file_name); + if (alpha_array == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "couldn't open the alphabet file!\n"); + exit(1); + } + ctx->words_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_WORDS); + ctx->words_nb = (ctx->words_alphabet != NULL) ? maca_alphabet_size(ctx->words_alphabet) : 0; + ctx->labels_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_LABELS); + ctx->labels_nb = (ctx->labels_alphabet != NULL) ? maca_alphabet_size(ctx->labels_alphabet) : 0; + ctx->pos_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_POS); + ctx->pos_nb = (ctx->pos_alphabet != NULL) ? maca_alphabet_size(ctx->pos_alphabet) : 0; + ctx->morpho_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_MORPHO); + ctx->morpho_nb = (ctx->morpho_alphabet != NULL) ? maca_alphabet_size(ctx->morpho_alphabet) : 0; + ctx->synt_feats_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_SYNT_FEATS); + ctx->synt_feats_nb = (ctx->synt_feats_alphabet != NULL) ? 
maca_alphabet_size(ctx->synt_feats_alphabet) : 0; + + /* /\* store special values in ctx and check that every necessary alphabet is loaded *\/ */ + if (ctx->use_full_forms || ctx->use_lemmas) { + if (ctx->words_alphabet == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "missing the '"MACA_ALPHABET_WORDS"' alphabet in the alphabet file\n"); + exit(1); + } + ctx->w_start = maca_alphabet_get_code(ctx->words_alphabet, "__START__"); + ctx->w_end = maca_alphabet_get_code(ctx->words_alphabet, "__END__"); + } + + if (ctx->pos_alphabet == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "missing the '"MACA_ALPHABET_POS"' alphabet in the alphabet file\n"); + exit(1); + } + ctx->pos_start = maca_alphabet_get_code(ctx->pos_alphabet, "__START__"); + ctx->pos_end = maca_alphabet_get_code(ctx->pos_alphabet, "__END__"); + + if (ctx->labels_alphabet == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "missing the '"MACA_ALPHABET_LABELS"' alphabet in the alphabet file\n"); + exit(1); + } + ctx->fct_joker = maca_alphabet_get_code(ctx->labels_alphabet, "__JOKER__"); + + if (ctx->subcat_features) { + if (ctx->synt_feats_alphabet == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "missing the '"MACA_ALPHABET_SYNT_FEATS"' alphabet in the alphabet file\n"); + exit(1); + } + } + /* end alphabets */ + + + + /* template library allocator needs: words_nb, pos_nb, labels_nb */ + ctx->e = maca_graph_parser_templ_library_allocator(ctx); + + /* load dep_count_table */ + ctx->dep_count_table = maca_graph_parser_dep_count_table_read(ctx, ctx->dep_count_table_file_name); + + + /* allocate feature table */ + if(ctx->store_in_feature_table){ + maca_graph_parser_feature_table_allocator(ctx); + } + + ctx->s = maca_graph_parser_allocate_sentence(ctx); + + if(ctx->print_ctx) maca_graph_parser_print_ctx(ctx); + + + + if(ctx->mcf_file_name){ + maca_mcf_sentence *mcf_sent; + maca_mcf_column *column; + maca_mcf *format = 
maca_mcf_new_with_alphabet_array(ctx->mcf_file_name, alpha_array); + char buffer[128]; + + /* INPUTS */ + /* full form */ + if(ctx->use_full_forms){ + ctx->mcf_form_id = maca_mcf_input(format, MACA_MCF_FORM); + if(ctx->mcf_form_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column FORM not found in the train file\n"); + exit(1); + } + /* check that the alphabet used is correct */ + column = maca_mcf_get_column_info(format, ctx->mcf_form_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_WORDS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column FORM is not using the "MACA_ALPHABET_WORDS" alphabet\n"); + exit(1); + } + } + + /* lemmas */ + if (ctx->use_lemmas){ + ctx->mcf_lemma_id = maca_mcf_input(format, MACA_MCF_LEMMA); + if (ctx->mcf_lemma_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column LEMMA not found in the train file\n"); + exit(1); + } + /* check that the alphabet used is correct */ + column = maca_mcf_get_column_info(format, ctx->mcf_lemma_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_WORDS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column LEMMA is not using the "MACA_ALPHABET_WORDS" alphabet\n"); + exit(1); + } + } + + /* postag */ + ctx->mcf_postag_id = maca_mcf_input(format, MACA_MCF_POSTAG); + if (ctx->mcf_postag_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column POSTAG not found in the train file\n"); + exit(1); + } + /* check that the alphabet used is correct */ + column = maca_mcf_get_column_info(format, ctx->mcf_postag_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_POS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column POSTAG is not using the "MACA_ALPHABET_POS" alphabet\n"); + exit(1); + } + /* lock the postag alphabet */ + maca_alphabet_lock(ctx->pos_alphabet); + + 
/* subcat */ + if (ctx->subcat_features) { + ctx->mcf_subcat_id = maca_mcf_input(format, MACA_MCF_SUBCAT); + if (ctx->mcf_subcat_id == -1) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column SUBCAT not found in the train file\n"); + exit(1); + } + /* check that the alphabet used is correct */ + column = maca_mcf_get_column_info(format, ctx->mcf_subcat_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_SYNT_FEATS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column SUBCAT is not using the "MACA_ALPHABET_SYNT_FEATS" alphabet\n"); + exit(1); + } + } + + /* OUTPUTS */ + /* deprel */ + ctx->mcf_deprel_id = maca_mcf_output(format, MACA_MCF_DEPREL, MACA_ALPHABET_LABELS); + if(ctx->mcf_deprel_id == -1){ + ctx->mcf_deprel_id = maca_mcf_input(format, MACA_MCF_DEPREL); + if (ctx->mcf_deprel_id == -1) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "undefined error, talk to a developper about this issue\n"); + exit(1); + } + /* If we are in input mode, we need to check that the alphabet is the correct one */ + column = maca_mcf_get_column_info(format, ctx->mcf_deprel_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if (strcmp(buffer, MACA_ALPHABET_LABELS) != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column DEPREL is not using the "MACA_ALPHABET_LABELS" alphabet\n"); + exit(1); + } + } + /* lock the deprel alphabet */ + maca_alphabet_lock(ctx->labels_alphabet); + + + /* head */ + ctx->mcf_head_id = maca_mcf_output(format, MACA_MCF_HEAD, "INT"); + if(ctx->mcf_head_id == -1){ + ctx->mcf_head_id = maca_mcf_input(format, MACA_MCF_HEAD); + if(ctx->mcf_head_id == -1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "undefined error, talk to a developper about this issue\n"); + exit(1); + } + /* check head is using integers */ + column = maca_mcf_get_column_info(format, ctx->mcf_head_id); + maca_mcf_column_get_type(column, buffer, sizeof(buffer)); + if 
(strcmp(buffer, "INT") != 0) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "column HEAD is not using the INT type\n"); + exit(1); + } + } + + maca_mcf_print_header(format, stdout); + + /* --- */ + + for(mcf_sent = maca_graph_parser_read_mcf_sentence(ctx, format, ctx->s), sent_num = 0; + mcf_sent && (sent_num < ctx->sent_nb); + mcf_sent = maca_graph_parser_read_mcf_sentence(ctx, format, ctx->s), sent_num++){ + + maca_graph_parser_print_verbose(ctx, 2, MACA_MESSAGE, "parsing sentence"); + maca_graph_parser_decoder_parse(ctx, ctx->s); + /* maca_graph_parser_update_sentence(ctx, ctx->s); */ /* TODO: backport modifs to maca_sentence if IO format is maca_xml */ + + /* write parsed sentence to file_out (default to stdout) */ + //maca_graph_parser_dump_conll_sentence(ctx, ctx->s, ctx->file_out); + + maca_graph_parser_sentence_fill_mcf_output(mcf_sent, ctx, ctx->s); + maca_mcf_sentence_print(mcf_sent, ctx->file_out); + + /* fprintf(stderr, "%d\n", sent_num); */ + maca_mcf_sentence_release(mcf_sent); + maca_graph_parser_sentence_clear(ctx->s); + } + if (mcf_sent != NULL) { + maca_mcf_sentence_release(mcf_sent); + mcf_sent = NULL; + } + } + + else{ + /*maca_load_data(1); + ms = maca_common_get_iterator_sentences(); + for(ms;ms;ms=maca_next_sentence(ms)){ + maca_graph_parser_ProcessSentence(ms,ctx); + }*/ + } + + /* maca_graph_parser_add_stamp(maca_common_get_xml_root_node());*/ + /* fermeture et libération memoire */ + /*maca_close(); */ +} + +void maca_graph_parser_train_main(maca_graph_parser_ctx *ctx) +{ + + /* load training corpus and populate the alphabets */ + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "loading training corpus\n"); + } + hyp_ref_vector *corpus = load_mcf_corpus(ctx); + fprintf(stderr, "Corpus size: %d\n", corpus->size); + + /* determine size of the populated alphabets */ + + ctx->words_nb = (ctx->words_alphabet) ? maca_alphabet_size(ctx->words_alphabet) : 0; + ctx->labels_nb = (ctx->labels_alphabet) ? 
maca_alphabet_size(ctx->labels_alphabet) : 0; + ctx->pos_nb = (ctx->pos_alphabet) ? maca_alphabet_size(ctx->pos_alphabet) : 0; + ctx->morpho_nb = (ctx->morpho_alphabet) ? maca_alphabet_size(ctx->morpho_alphabet) : 0; + ctx->synt_feats_nb = (ctx->synt_feats_alphabet) ? maca_alphabet_size(ctx->synt_feats_alphabet) : 0; + + /* allocate dep_count table */ + ctx->dep_count_table = maca_graph_parser_dep_count_table_allocate(ctx->pos_nb, ctx->labels_nb); + + /* preprocessing: fill dep count table */ + int sent_id = 0; + for(sent_id=0; sent_id < corpus->size; sent_id++){ + maca_graph_parser_dep_count_table_update(ctx, corpus->ref[sent_id]); + } + + /* preprocessing: relabel rare dependencies */ + sent_id = 0; + for(sent_id=0; sent_id < corpus->size; sent_id++){ + maca_graph_parser_sentence_relabel_rare_deps(ctx, corpus->ref[sent_id]); + } + + /* init new model */ + ctx->model = maca_graph_parser_model_allocate(ctx); + maca_graph_parser_model_init(ctx, ctx->model); + /* set feature types for model */ + ctx->model->min_dep_count = ctx->min_dep_count; + ctx->model->use_lemmas = ctx->use_lemmas; + ctx->model->use_full_forms = ctx->use_full_forms; + ctx->model->basic_features = ctx->basic_features; + ctx->model->first_features = ctx->first_features; + ctx->model->grandchildren_features = ctx->grandchildren_features; + ctx->model->sibling_features = ctx->sibling_features; + ctx->model->subcat_features = ctx->subcat_features; + if(ctx->sibling_features || ctx->grandchildren_features) ctx->order = 2; + + + /* templ_library_allocator cannot be created earlier as it needs the + complete alphabet, which is built during load_conll_corpus + FIXME: properly (semi-)freeze the alphabet, + cf. 
extractor_allocator() + */ + ctx->e = maca_graph_parser_templ_library_allocator(ctx); + + /* allocate feature table */ + if(ctx->store_in_feature_table){ + maca_graph_parser_feature_table_allocator(ctx); + } + + ctx->s = maca_graph_parser_allocate_sentence(ctx); + + if(ctx->print_ctx) maca_graph_parser_print_ctx(ctx); + + maca_graph_parser_train(ctx, corpus); + + /* dump model and dep count table */ + maca_graph_parser_model_dump(ctx, ctx->model, ctx->model_file_name, ctx->produce_hash_model); + maca_graph_parser_dep_count_table_print(ctx, ctx->dep_count_table_file_name); + + /* free corpus data */ + free_corpus(corpus); +} + diff --git a/maca_graph_parser/maca_graph_parser_metrics.c b/maca_graph_parser/maca_graph_parser_metrics.c new file mode 100644 index 0000000000000000000000000000000000000000..bb4377b987fe0742eb5bfb106f6d96a07e1d8048 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_metrics.c @@ -0,0 +1,118 @@ +#include "maca_graph_parser_metrics.h" + + +double maca_graph_parser_sentence_errors(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp){ + /** + * Compute the number of errors in hyp wrt ref. + */ + + double correct = 0; + int i; + double x; + + /* start at 1 because 0 is the fake root */ + for(i=1; i < ref->l; i++){ + /* coarse cost function */ + /* + if((ref->gov[i] == hyp->gov[i]) && (ref->label[i] == hyp->label[i])) + correct += 1; + */ + /* fine cost function */ + if(ref->gov[i] == hyp->gov[i]){ + correct += 0.5; + if(ref->label[i] == hyp->label[i]){ + correct += 0.5; + } + } + } + x = ((double) ref->l - 1 - correct); + return x; +} + + +double maca_graph_parser_sentence_compute_las(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp, int absolute){ + /** + * Compute the Labelled Attachment Score of hyp wrt ref. + * + * Should not be used for English, where punctuation is + * traditionnally ignored for scoring. 
+ */ + + double result; + int i; + int correct = 0; + + for(i = 1; i < ref->l; i++){ /* start at 1 because 0 is the fake root */ + if((ref->gov[i] == hyp->gov[i]) && (ref->label[i] == hyp->label[i])) + correct++; + } + + if(absolute){ + result = correct; + } else { + result = ((double) correct) / ((double) (ref->l - 1)); /* l-1 to remove the fake root */ + } + + return result; +} + + +double maca_graph_parser_sentence_compute_uas(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp, int absolute){ + /** + * Compute the Unlabelled Attachment Score of hyp wrt ref. + * + * A token counts as correct when hyp assigns it the same governor as ref; labels are not compared. + * If absolute is non-zero the raw number of correct tokens is returned, + * otherwise that number divided by (ref->l - 1), i.e. the sentence length without the fake root. + * + * Should not be used for English, where punctuation is + * traditionally ignored for scoring. + */ + + int i; + int correct = 0; + for(i = 1; i < ref->l; i++){ /* start at 1: index 0 is the fake root */ + if(ref->gov[i] == hyp->gov[i]) correct++; + /* printf("%d\t%d\t%d\t%d\n", ref->gov[i], hyp->gov[i], ref->label[i], hyp->label[i]); */ + } + if(absolute) + return correct; + else + return ((double) correct / (double) (ref->l - 1)); +} + + +double maca_graph_parser_sentence_compute_las_oracle(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp, int k, int absolute){ + /** + * Compute the oracle LAS as max_{i=0..k-1}(LAS(hyp[i])).
+ * + * FIXME: partial duplicate of compute_las + */ + + int ki; + double cur_las; + double max_las; + /* almost duplicate of compute_las */ + int i; + int correct; + + max_las = 0.0; + for(ki=0; ki<k; ki++){ + /* cur_las = maca_graph_parser_sentence_compute_las(ref, hyp, ki, absolute); */ + + /* instead of the preceding line, almost duplicate of compute_las */ + correct = 0; + for(i = 1; i < ref->l; i++){ /* start at 1 because 0 is the fake root */ + if((ref->gov[i] == hyp->kb->gov[i][ki]) && + (ref->label[i] == hyp->kb->label[i][ki])) + correct++; + /* printf("%d\t%d\t%d\t%d\n", ref->gov[i], hyp->gov[i], ref->label[i], hyp->label[i]); */ + } + if(absolute) + cur_las = correct; + else + cur_las = ((double) correct / (double) (ref->l - 1)); /* l-1 to remove the fake root */ + /* end of almost duplicate */ + + if (cur_las > max_las) + max_las = cur_las; + } + + return max_las; +} diff --git a/maca_graph_parser/maca_graph_parser_metrics.h b/maca_graph_parser/maca_graph_parser_metrics.h new file mode 100644 index 0000000000000000000000000000000000000000..e84ee18714a79961bfdf2269ca36645df66a97e8 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_metrics.h @@ -0,0 +1,15 @@ +#include "maca_graph_parser.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +double maca_graph_parser_sentence_errors(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp); +double maca_graph_parser_sentence_compute_las(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp, int absolute); +double maca_graph_parser_sentence_compute_uas(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp, int absolute); +double maca_graph_parser_sentence_compute_las_oracle(maca_graph_parser_sentence *ref, maca_graph_parser_sentence *hyp, int k, int absolute); + + #ifdef __cplusplus +} +#endif diff --git a/maca_graph_parser/maca_graph_parser_model.c b/maca_graph_parser/maca_graph_parser_model.c new file mode 100644 index 
0000000000000000000000000000000000000000..978b47107d0991acf3c076a2ff6774632d277135 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_model.c @@ -0,0 +1,272 @@ +#include "maca_graph_parser_hash.h" +/* legacy includes from maca_graph_parser_hash.c */ +/* TODO: clean up */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <math.h> + +// for mmap() +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include <errno.h> + +#include "maca_graph_parser_model.h" + + +maca_graph_parser_model *maca_graph_parser_model_allocate(maca_graph_parser_ctx *ctx){ + /** + * Allocate a model. + * + * The structure is zero-filled so that feat_array and feat_ht start out NULL: + * maca_graph_parser_model_free() tests both pointers before releasing them. + * mmap_data and model_fd are set to the "not mapped" sentinels that + * maca_graph_parser_model_munmap() checks for. + * Exits the process if the allocation fails. + */ + + maca_graph_parser_model *m = calloc(1, sizeof(maca_graph_parser_model)); + if(m == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_model_allocate: memory allocation problem\n"); + exit(1); + } + m->mmap_data = MAP_FAILED; /* nothing mapped yet */ + m->model_fd = -1; /* no file descriptor yet */ + return m; +} + + +void maca_graph_parser_model_init(maca_graph_parser_ctx *ctx, maca_graph_parser_model *model){ + /** + * Initialize model. + */ + + /* maca_graph_parser_hash */ + /* model->feat_ht = creation_table(ctx->H, ctx->hash_fill_rate); */ + model->feat_ht = creation_table(ctx->H, 1); +} + + +void maca_graph_parser_model_free(maca_graph_parser_model *model){ + if(model == NULL) + return; + + if(model->feat_array) + feature_weight_table_free(model->feat_array); + if(model->feat_ht) + free_table(model->feat_ht); + free(model); +} + + +void maca_graph_parser_model_print(maca_graph_parser_ctx *ctx){ + +} + + +/* persistence */ +maca_graph_parser_model *maca_graph_parser_model_load(maca_graph_parser_ctx *ctx, char *model_file_name){ + /** + * Load a model from a file.
+ */ + + maca_graph_parser_model *model = maca_graph_parser_model_allocate(ctx); + + /* open file */ + if(model_file_name == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "maca_graph_parser_model_load: model file name is missing\n"); + exit(1); + } + FILE *model_file = fopen(model_file_name, "rb"); + if(model_file == NULL){ + fprintf(stderr, "cannot open file %s\n", model_file_name); + exit(1); + } + + /* read header */ + fread(&(model->is_hash_model), sizeof(int), 1, model_file); + /* printf("hash model = %d\n", model->is_hash_model); */ + fread(&(model->min_dep_count), sizeof(int), 1, model_file); + /* printf("min dep count = %d\n", ctx->min_dep_count); */ + fread(&(model->use_lemmas), sizeof(int), 1, model_file); + /* printf("use lemmas = %d\n", ctx->use_lemmas); */ + fread(&(model->use_full_forms), sizeof(int), 1, model_file); + /* printf("use full forms = %d\n", ctx->use_full_forms); */ + fread(&(model->basic_features), sizeof(int), 1, model_file); + /* printf("basic features = %d\n", ctx->basic_features); */ + fread(&(model->first_features), sizeof(int), 1, model_file); + /* printf("first features = %d\n", ctx->first_features); */ + fread(&(model->grandchildren_features), sizeof(int), 1, model_file); + /* printf("grandchildren features = %d\n", ctx->grandchildren_features); */ + fread(&(model->sibling_features), sizeof(int), 1, model_file); + /* printf("sibling features = %d\n", ctx->sibling_features); */ + fread(&(model->subcat_features), sizeof(int), 1, model_file); + /* printf("sibling features = %d\n", ctx->sibling_features); */ + + /* read content */ + if(model->is_hash_model) + model->feat_ht = load_table(model_file); + else{ + model->feat_array = load_feature_weight_table(model_file); + model->feat_ht = feat_array2feat_hash(model->feat_array, ctx->hash_fill_rate); + /* ctx->hash_model = 1; */ + feature_weight_table_free(model->feat_array); + model->feat_array = NULL; + } + + fclose(model_file); + + return model; +} + + +void 
maca_graph_parser_model_dump(maca_graph_parser_ctx *ctx, maca_graph_parser_model *model, char *file_name, int produce_hash_model){ + /** + * Dump model to file_name in hash table or array format. + * + * Writes a header of nine ints (produce_hash_model followed by the eight + * feature-type fields of the model), then the hash table if + * produce_hash_model is non-zero, otherwise the feature weight array. + * Exits the process if file_name cannot be opened for writing. + */ + + FILE *model_file = fopen(file_name, "wb"); /* binary mode, matching the "rb" used by maca_graph_parser_model_load */ + if(model_file == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "cannot open file %s\n", file_name); + exit(1); /* nothing can be written through a NULL stream */ + } + + /* + size_t cfg_strlen = strlen(ctx->cfg); + fwrite(&cfg_strlen, sizeof(int), 1, model_file); + fwrite(ctx->cfg, sizeof(char), cfg_strlen + 1, model_file); + */ + fwrite(&(produce_hash_model), sizeof(int), 1, model_file); + fwrite(&(model->min_dep_count), sizeof(int), 1, model_file); + fwrite(&(model->use_lemmas), sizeof(int), 1, model_file); + fwrite(&(model->use_full_forms), sizeof(int), 1, model_file); + fwrite(&(model->basic_features), sizeof(int), 1, model_file); + fwrite(&(model->first_features), sizeof(int), 1, model_file); + fwrite(&(model->grandchildren_features), sizeof(int), 1, model_file); + fwrite(&(model->sibling_features), sizeof(int), 1, model_file); + fwrite(&(model->subcat_features), sizeof(int), 1, model_file); + + if(produce_hash_model) + dump_table(model->feat_ht, model_file); + else{ + model->feat_array = feat_hash2feat_array(model->feat_ht); + dump_feature_weight_table(model->feat_array, model_file); + } + + fclose(model_file); +} + + +/* mmap */ +maca_graph_parser_model *maca_graph_parser_model_mmap(maca_graph_parser_ctx *ctx, char *model_file_name){ + /** + * Create a memory mapped model from model_file_name.
+ */ + + maca_graph_parser_model *model = maca_graph_parser_model_allocate(ctx); + + model->model_fd = open(model_file_name, O_RDONLY); + if(model->model_fd == -1) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "could not open parser model \"%s\"\n", model_file_name); + exit(1); + } + + struct stat sb; + if (fstat(model->model_fd, &sb) == -1) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "could not fstat parser model \"%s\"\n", model_file_name); + exit(1); + } + model->mmap_length = sb.st_size; + model->mmap_data = (const char*) mmap(NULL, model->mmap_length, PROT_READ, MAP_PRIVATE, model->model_fd, 0); + if(model->mmap_data == MAP_FAILED) { + perror("mmap"); + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "mmap() failed on parser model \"%s\"\n", model_file_name); + exit(1); + } + + int field_id = 0; + /* header */ + model->is_hash_model = ((int*)model->mmap_data)[field_id++]; + model->min_dep_count = ((int*)model->mmap_data)[field_id++]; + model->use_lemmas = ((int*)model->mmap_data)[field_id++]; + model->use_full_forms = ((int*)model->mmap_data)[field_id++]; + model->basic_features = ((int*)model->mmap_data)[field_id++]; + model->first_features = ((int*)model->mmap_data)[field_id++]; + model->grandchildren_features = ((int*)model->mmap_data)[field_id++]; + model->sibling_features = ((int*)model->mmap_data)[field_id++]; + /* content */ + if(model->is_hash_model) { + //model->feat_ht = load_table(model_file); + maca_graph_parser_hash *t = NULL; + t = malloc(sizeof(maca_graph_parser_hash)); + t->taille = ((int*)model->mmap_data)[field_id++]; + t->nbelem = ((int*)model->mmap_data)[field_id++]; + t->table_clef = (feature_t*) (model->mmap_data + field_id * sizeof(int)); + t->params = (float*) (model->mmap_data + field_id * sizeof(int) + t->taille * sizeof(feature_t)); + t->total = NULL; + model->feat_ht = t; + } else { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "feature weight table not supported in mmap'ed model \"%s\"\n", 
model_file_name); + } + return model; +} + +void maca_graph_parser_model_munmap(maca_graph_parser_model *model) { + if(model->mmap_data != MAP_FAILED) munmap((void*) model->mmap_data, model->mmap_length); + if(model->model_fd != -1) close(model->model_fd); +} + + +/* scoring */ +float score_feat_vector(feat_vector *fv, maca_graph_parser_model *model){ + /** + * Score a feature vector against a model. + */ + + float sum = 0; + + if(fv == NULL) + return sum; + + int i; + for(i=0; i<fv->elt_nb; i++){ + float *w = recherche_hash(model->feat_ht, fv->array[i]); + if(w) + sum += *w; + } + + return sum; +} + + +float score_feature_counter_array(feature_counter_array *a, maca_graph_parser_model *model){ + /** + * Score feature counter array a against model. + */ + + float score = 0; + + int i; + for(i=0; i < a->size; i++){ + feature_counter *c = a->array[i]; + if(c == NULL){ + continue; + } + + int j; + for(j=0; j < c->size; j++){ + int v = c->values[j]; + if(v != 0){ + float *w = recherche_hash(model->feat_ht, c->keys[j]); + if(w != NULL){ + score += (v * (*w)); + } + } + } + } + + return score; +} diff --git a/maca_graph_parser/maca_graph_parser_model.h b/maca_graph_parser/maca_graph_parser_model.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ca36a415c5f1f32cbce148ef0783371e5843d6 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_model.h @@ -0,0 +1,21 @@ +#include "maca_graph_parser_feature_counter_array.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + + +/* basics */ +maca_graph_parser_model *maca_graph_parser_model_allocate(maca_graph_parser_ctx *ctx); +void maca_graph_parser_model_init(maca_graph_parser_ctx *ctx, maca_graph_parser_model *model); +void maca_graph_parser_model_free(maca_graph_parser_model *model); +/* persistence */ +maca_graph_parser_model *maca_graph_parser_model_load(maca_graph_parser_ctx *ctx, char *model_file_name); +void maca_graph_parser_model_dump(maca_graph_parser_ctx *ctx, maca_graph_parser_model *model, char 
*file_name, int produce_hash_model); +/* scoring */ +float score_feat_vector(feat_vector *fv, maca_graph_parser_model *model); +float score_feature_counter_array(feature_counter_array *a, maca_graph_parser_model *model); +#ifdef __cplusplus +} +#endif diff --git a/maca_graph_parser/maca_graph_parser_print_model_main.c b/maca_graph_parser/maca_graph_parser_print_model_main.c new file mode 100644 index 0000000000000000000000000000000000000000..20023da84268b1339e318bb9e395a9050b6e5e96 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_print_model_main.c @@ -0,0 +1,85 @@ +#include"maca_graph_parser_hash.h" +#include"maca_graph_parser_model.h" +#include"maca_graph_parser_features.h" +#include"maca_alphabet_wrapper.h" +#include"maca_constants.h" + +int main(int argc, char *argv[]) +{ + int i,j; + FILE *f = stdout; + maca_graph_parser_ctx * ctx; + int hval; + maca_alphabet_array *alpha_array; + maca_graph_parser_model *model = NULL; + + ctx = maca_graph_parser_LoadCTX(argc,argv); + + + /* alphabets */ + /* load alphabets */ + + alpha_array = maca_alphabet_array_new_from_file(ctx->alphabet_file_name); + ctx->words_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_WORDS); + ctx->words_nb = (ctx->words_alphabet != NULL) ? maca_alphabet_size(ctx->words_alphabet) : 0; + ctx->labels_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_LABELS); + ctx->labels_nb = (ctx->labels_alphabet != NULL) ? maca_alphabet_size(ctx->labels_alphabet) : 0; + ctx->pos_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_POS); + ctx->pos_nb = (ctx->pos_alphabet != NULL) ? maca_alphabet_size(ctx->pos_alphabet) : 0; + ctx->morpho_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_MORPHO); + ctx->morpho_nb = (ctx->morpho_alphabet != NULL) ? 
maca_alphabet_size(ctx->morpho_alphabet) : 0; + ctx->synt_feats_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_SYNT_FEATS); + ctx->synt_feats_nb = (ctx->synt_feats_alphabet != NULL) ? maca_alphabet_size(ctx->synt_feats_alphabet) : 0; + + /* template library allocator needs: words_nb, pos_nb, labels_nb */ + ctx->e = maca_graph_parser_templ_library_allocator(ctx); + + /* model */ + + model = ctx->model = maca_graph_parser_model_load(ctx, ctx->model_file_name); + + /* model2 */ + if(ctx->model2_file_name != NULL){ + ctx->model2 = maca_graph_parser_model_load(ctx, ctx->model2_file_name); + } else { + ctx->model2 = NULL; + } + /* set active feature types for the decoder */ + ctx->min_dep_count = ctx->model->min_dep_count; + ctx->use_lemmas = ctx->model->use_lemmas; + ctx->use_full_forms = ctx->model->use_full_forms; + ctx->basic_features = ctx->model->basic_features; + ctx->first_features = ctx->model->first_features; + ctx->grandchildren_features = ctx->model->grandchildren_features; + ctx->sibling_features = ctx->model->sibling_features; + ctx->subcat_features = ctx->model->subcat_features; + + /* print the header fields copied from the model, one per line */ + fprintf(f, "is hash model \t = %d\n", model->is_hash_model); + fprintf(f, "min dep count \t = %d\n", ctx->min_dep_count); + fprintf(f, "use lemmas \t = %d\n", ctx->use_lemmas); + fprintf(f, "use full forms \t = %d\n", ctx->use_full_forms); + fprintf(f, "basic features \t = %d\n", ctx->basic_features); + fprintf(f, "first order features \t = %d\n", ctx->first_features); + fprintf(f, "grandchildre features \t = %d\n", ctx->grandchildren_features); + fprintf(f, "sibling features \t = %d\n", ctx->sibling_features); + fprintf(f, "subcat feautres \t = %d\n", ctx->subcat_features); + + fprintf(f, "features hash table size \t = %d\n", model->feat_ht->taille); + fprintf(f, "nb of features \t = %d\n", model->feat_ht->nbelem); + for(i=0, j=0; i < ctx->model->feat_ht->taille; i++){ + j++; + if(ctx->model->feat_ht->table_clef[i] != VIDE){ + fprintf(f, "%d\t%f\t",
++j, ctx->model->feat_ht->params[i]); +/* hval = hash_func(ctx->model->feat_ht->table_clef[i],ctx->model->feat_ht->taille); */ +/* fprintf(f, "%f h=%d", ctx->model->feat_ht->params[i],hval); */ + maca_graph_parser_print_feature(f, ctx, ctx->model->feat_ht->table_clef[i]); + + fprintf(f, "\n"); + /* fprintf(f, "%d %lld %f\n", ++j, ctx->model->feat_ht->table_clef[i], ctx->model->feat_ht->params[i]); */ + /* fprintf(f, "%d %lld %f type = %lld\n", ++j, feat_ht->table_clef[i], feat_ht->params[i], feature_get_type(feat_ht->table_clef[i])); */ + } + } + maca_graph_parser_model_free(ctx->model); + return 0; +} diff --git a/maca_graph_parser/maca_graph_parser_resize_model_main.c b/maca_graph_parser/maca_graph_parser_resize_model_main.c new file mode 100644 index 0000000000000000000000000000000000000000..f3eb3334a1c889307b4e793b4a362e81fd3c7041 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_resize_model_main.c @@ -0,0 +1,69 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +// quick hack to resize the hash table +int main(int argc, char** argv) { + if(argc != 3) { + fprintf(stderr, "usage: %s <input-model> <output-model>\n", argv[0]); + return 1; + } + FILE* input = fopen(argv[1], "r"); + FILE* output = fopen(argv[2], "w"); + int i; + int value; + for(i = 0; i < 8; i++) { + fread(&value, sizeof(int), 1, input); + fwrite(&value, sizeof(int), 1, output); + } + int nb_elements, size; + fread(&size, sizeof(int), 1, input); + fread(&nb_elements, sizeof(int), 1, input); + fprintf(stderr, "size = %d\n", size); + fprintf(stderr, "elements = %d\n", nb_elements); + uint64_t* table = malloc(sizeof(uint64_t) * size); + fread(table, size, sizeof(uint64_t), input); + float* weights = malloc(sizeof(float) * size); + fread(weights, size, sizeof(float), input); + + int new_size = nb_elements * 3; + fwrite(&new_size, sizeof(int), 1, output); + fwrite(&nb_elements, sizeof(int), 1, output); + + uint64_t* table2 = calloc(sizeof(uint64_t), new_size); + float* weights2 = 
calloc(sizeof(float), new_size); + + int num_collisions = 0; + int max_collisions = 0; + for(i = 0; i < size; i++) { + if(table[i] != 0) { + int hash = (int)(table[i] % (uint64_t)new_size); + int offset; + for(offset = 0; offset < new_size; offset++) { + int location = (hash + offset) % new_size; + if(table2[location] == 0) { + table2[location] = table[i]; + weights2[location] = weights[i]; + break; + } + num_collisions++; + } + if(offset > max_collisions) max_collisions = offset; + } + } + fwrite(table2, sizeof(uint64_t), new_size, output); + fwrite(weights2, sizeof(float), new_size, output); + fprintf(stderr, "collisions = %f (max = %d)\n", 1.0 * num_collisions / nb_elements, max_collisions); + + fclose(input); + fclose(output); + + // MM: free memory + free(table2); + table2 = NULL; + free(weights); + weights = NULL; + free(weights2); + weights2 = NULL; + return 0; +} diff --git a/maca_graph_parser/maca_graph_parser_sentence.c b/maca_graph_parser/maca_graph_parser_sentence.c new file mode 100644 index 0000000000000000000000000000000000000000..d50cc4f1d24d89652aa3c27e62768324d1f50698 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_sentence.c @@ -0,0 +1,557 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#include <string.h> +#include"maca_graph_parser_sentence.h" +#include"maca_graph_parser.h" +#include"maca_mcf_wrapper.h" + + +maca_graph_parser_sentence *maca_graph_parser_allocate_sentence(maca_graph_parser_ctx *ctx){ + /** + * Allocate a maca_graph_parser_sentence. + */ + + maca_graph_parser_sentence *s = malloc(sizeof(maca_graph_parser_sentence)); + if(s == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_allocate_sentence: memory allocation problem\n"); + exit(1); + } + s->l = 0; + + /* dynamic allocation of fields */ + s->word_adr = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(void*)); + if(s->word_adr == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_allocate_sentence: memory allocation problem (2)\n"); + exit(1); + } + int i; + for(i=0; i<MACA_MAX_LENGTH_SENTENCE; i++){ + s->word_adr[i] = NULL; + } + s->words = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int)); + s->lemmas = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int)); + s->pos = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int)); + + s->morpho = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int *)); + for(i=0; i<MACA_MAX_LENGTH_SENTENCE; i++){ + s->morpho[i] = malloc(1 * sizeof(int)); + if(s->morpho[i] == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_allocate_sentence: memory allocation problem (3)\n"); + exit(1); + } + } + s->gov = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int)); + s->label = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int)); + + s->synt_feats_nb = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int)); + + s->synt_feats_array = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int *)); + for(i=0; i<MACA_MAX_LENGTH_SENTENCE; i++){ + s->synt_feats_array[i] = NULL; + } + + + if((s->words == NULL) || 
(s->lemmas == NULL) || (s->pos == NULL) || + (s->morpho == NULL) || (s->gov == NULL) || (s->label == NULL)){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_allocate_sentence: memory allocation problem (4)\n"); + exit(1); + } + /* s->score = 0; */ + s->kb = NULL; + + /* add fake root */ + int root_w = (ctx->use_full_forms || ctx->use_lemmas) ? maca_alphabet_get_code(ctx->words_alphabet, "__START__") : -1; + /* int root_pos = maca_tags_get_code(ctx->cfg, "morpho", "stype", "__START__"); */ + int root_pos = maca_alphabet_get_code(ctx->pos_alphabet, "__START__"); + maca_graph_parser_sentence_add_word(ctx, s, NULL, root_w, root_w, root_pos, 0, -1, 0, NULL); + /* MM: gov 0, label -1: if these default values change, make sure they get backported to the call to the same function _sentence_add_word in the preceding function, + to ensure consistency */ + + return s; +} + + +void maca_graph_parser_free_sentence(maca_graph_parser_sentence *s){ + /** + * Free a sentence + */ + int i; + + if(s == NULL) + return; + + if(s->word_adr != NULL){ + free(s->word_adr); + s->word_adr = NULL; + } + if(s->words != NULL){ + free(s->words); + s->words = NULL; + } + if(s->lemmas != NULL){ + free(s->lemmas); + s->lemmas = NULL; + } + if(s->pos != NULL){ + free(s->pos); + s->pos = NULL; + } + if(s->morpho != NULL){ + for(i=0; i < MACA_MAX_LENGTH_SENTENCE; i++){ + if(s->morpho[i] != NULL){ + free(s->morpho[i]); + s->morpho[i] = NULL; + } + } + free(s->morpho); + s->morpho = NULL; + } + if(s->gov != NULL){ + free(s->gov); + s->gov = NULL; + } + if(s->label != NULL){ + free(s->label); + s->label = NULL; + } + + if(s->synt_feats_nb != NULL){ + free(s->synt_feats_nb); + s->synt_feats_nb = NULL; + } + + + if(s->synt_feats_array != NULL){ + for(i=0; i < MACA_MAX_LENGTH_SENTENCE; i++){ + if(s->synt_feats_array[i] != NULL){ + free(s->synt_feats_array[i]); + s->synt_feats_array[i] = NULL; + } + } + free(s->synt_feats_array); + s->synt_feats_array = NULL; + } + + + + if(s->kb != 
NULL){ + maca_graph_parser_free_sentence_kbest(s->kb); + s->kb = NULL; + } + + free(s); +} + +void maca_graph_parser_sentence_clear (maca_graph_parser_sentence *s) { + int k; + + for (k = 1; k < s->l; k++) { + free(s->synt_feats_array[k]); + s->synt_feats_array[k] = NULL; + s->synt_feats_nb[k] = 0; + } + + s->l = 1; +} + +void maca_graph_parser_sentence_add_word(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s, void* adr, int word, int lemma, int pos, int gov, int label, int synt_feats_nb, int *synt_feats_array){ + /** + * Add word to sentence. + */ + int i; + + if(s->l >= MACA_MAX_LENGTH_SENTENCE){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "sentence too long, change MACA_MAX_LENGTH_SENTENCE value\n"); + exit(1); + } + s->word_adr[s->l] = adr; + s->words[s->l] = word; + s->lemmas[s->l] = lemma; + s->pos[s->l] = pos; + s->gov[s->l] = gov; + s->label[s->l] = label; + s->synt_feats_nb[s->l] = synt_feats_nb; + if(synt_feats_nb > 0){ + s->synt_feats_array[s->l] = malloc(synt_feats_nb * sizeof(int)); + for(i=0; i<synt_feats_nb; i++){ + s->synt_feats_array[s->l][i] = synt_feats_array[i]; + } + } + s->l++; +} + + +void maca_graph_parser_sentence_print_sentence(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *s){ + /** + * Print s. 
+ */ + + int i; + char word[128]; + char lemma[128]; + char pos[128]; + char label[128]; + + + for(i=1; i < s->l; i++){ + if (s->words[i] != -1) { + maca_alphabet_get_symbol(ctx->words_alphabet, s->words[i], word, sizeof(word)); + } else { + strcpy(word, "NA"); + } + if (s->lemmas[i] != -1) { + maca_alphabet_get_symbol(ctx->words_alphabet, s->lemmas[i], lemma, sizeof(lemma)); + } else { + strcpy(lemma, "NA"); + } + if (s->pos[i] != -1) { + maca_alphabet_get_symbol(ctx->pos_alphabet, s->pos[i], pos, sizeof(pos)); + } else { + strcpy(pos, "NA"); + } + if (s->label[i] != -1) { + maca_alphabet_get_symbol(ctx->labels_alphabet, s->label[i], label, sizeof(label)); + } else { + strcpy(label, "NA"); + } + printf("%d\t%s\t%s\t%s\t%d\t%s\n", + i, + word, + lemma, + pos, + s->gov[i], + label); + } + printf("\n"); +} + + +maca_graph_parser_sentence *maca_graph_parser_duplicate(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *sent, maca_graph_parser_sentence *copy){ + /** + * Duplicate sent to copy. + */ + + int i; + + if(sent == NULL) return NULL; + + if(copy == NULL){ + copy = maca_graph_parser_allocate_sentence(ctx); + } + + copy->l = 0; + + for(i=0; i < sent->l; i++){ + copy->word_adr[i] = sent->word_adr[i]; + copy->words[i] = sent->words[i]; + copy->lemmas[i] = sent->lemmas[i]; + copy->pos[i] = sent->pos[i]; + copy->gov[i] = sent->gov[i]; + copy->label[i] = sent->label[i]; + copy->l++; + } + /* copy score too (shouldn't we ?) 
*/ + copy->score = sent->score; + + return copy; +} + + +maca_graph_parser_sentence *maca_graph_parser_duplicate_sentence(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *sent, maca_graph_parser_sentence *copy){ + /** + * Duplicate sent to copy without the dependencies (gov and label) + * + * FIXME: merge into maca_graph_parser_duplicate() + */ + + int i,j; + + if(sent == NULL) return NULL; + + if(copy == NULL){ + copy = maca_graph_parser_allocate_sentence(ctx); + } + + copy->l = 0; + + for(i=0; i < sent->l; i++){ + copy->word_adr[i] = sent->word_adr[i]; + copy->words[i] = sent->words[i]; + copy->lemmas[i] = sent->lemmas[i]; + copy->pos[i] = sent->pos[i]; + copy->gov[i] = 0; /* MM: ensure consistency with calls to _sentence_add_word in the functions above: gov=0, lab=-1 by default */ + copy->label[i] = -1; + copy->synt_feats_nb[i] = sent->synt_feats_nb[i]; + if(copy->synt_feats_nb[i] > 0){ + copy->synt_feats_array[i] = malloc(copy->synt_feats_nb[i] * sizeof(int)); + for(j=0; j<copy->synt_feats_nb[i]; j++){ + copy->synt_feats_array[i][j] = sent->synt_feats_array[i][j]; + } + } + copy->l++; + } + /* reset score */ + copy->score = MINF; + + return copy; +} + + + + +/** + * kbest + */ + +maca_graph_parser_sentence_kbest *maca_graph_parser_allocate_sentence_kbest(maca_graph_parser_ctx *ctx){ + /** + * Allocate a structure to store the k-best parses of a sentence. 
+ */ + + maca_graph_parser_sentence_kbest *kb = malloc(sizeof(maca_graph_parser_sentence_kbest)); + if(kb == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_allocate_sentence_kbest: memory allocation problem\n"); + exit(1); + } + + /* fields */ + kb->k = ctx->k; + kb->gov = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int *)); + if(kb->gov == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_allocate_sentence_kbest: memory allocation problem (2)\n"); + exit(1); + } + kb->label = malloc(MACA_MAX_LENGTH_SENTENCE * sizeof(int *)); + if(kb->label == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"maca_graph_parser_allocate_sentence_kbest: memory allocation problem (3)\n"); + exit(1); + } + + int ki; + int i; + for(i=0; i<MACA_MAX_LENGTH_SENTENCE; i++){ + kb->gov[i] = malloc((kb->k) * sizeof(int)); + kb->label[i] = malloc((kb->k) * sizeof(int)); + for(ki=0; ki<kb->k; ki++){ + kb->gov[i][ki] = 0; + kb->label[i][ki] = -1; + } + } + kb->score = malloc((kb->k) * sizeof(float)); + for(ki=0; ki<kb->k; ki++){ + kb->score[ki] = MINF; + } + + return kb; +} + + +void maca_graph_parser_free_sentence_kbest(maca_graph_parser_sentence_kbest *kb){ + /** + * Free a sentence_kbest. + */ + + + + int i; + + if(kb == NULL) + return; + + for(i=0; i<MACA_MAX_LENGTH_SENTENCE; i++){ + if(kb->gov[i] != NULL){ + free(kb->gov[i]); + kb->gov[i] = NULL; + } + if(kb->label[i] != NULL){ + free(kb->label[i]); + kb->label[i] = NULL; + } + } + if(kb->score != NULL){ + free(kb->score); + kb->score = NULL; + } + + if(kb->gov != NULL){ + free(kb->gov); + kb->gov = NULL; + } + if(kb->label != NULL){ + free(kb->label); + kb->label = NULL; + } + free(kb); +} + + +void maca_graph_parser_reset_sentence_kbest(maca_graph_parser_sentence_kbest *kb){ + /** + * Reset a sentence_kbest. 
+ */ + + int i; + int ki; + + for(i=1; i < MACA_MAX_LENGTH_SENTENCE; i++){ + for(ki=0; ki < kb->k; ki++){ + kb->gov[i][ki] = 0; + kb->label[i][ki] = -1; + } + } + for(ki=0; ki < kb->k; ki++){ + kb->score[ki] = MINF; + } + +} + +maca_mcf_sentence *maca_graph_parser_read_mcf_sentence(maca_graph_parser_ctx *ctx, maca_mcf *format, maca_graph_parser_sentence *s) +{ + maca_mcf_sentence *mcf_sentence; + int length; + maca_mcf_word *mcf_word = NULL; + int index, k; + int code_postag, code_lemma, code_form, code_label, code_synt_feat, gov; + int nb_synt_feats; + int *synt_feats; + char invalid_sentence; + + do { + invalid_sentence = 0; + do{ + mcf_sentence = maca_mcf_read_next_sentence(format); + if(mcf_sentence == NULL) return NULL; + length = maca_mcf_sentence_get_length(mcf_sentence); + if(length > ctx->max_sent_length){ + maca_mcf_sentence_release(mcf_sentence); + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "sentence too long, skipping it\n"); + } + } + } + while(length > ctx->max_sent_length); + + for(index=1; index < length; index++){ + mcf_word = maca_mcf_sentence_get_word(mcf_sentence, index); + + code_form = -1; + if(ctx->use_full_forms){ + if(maca_mcf_word_get_nb_values(mcf_word, ctx->mcf_form_id)){ + code_form = maca_mcf_word_get_value_int(mcf_word, ctx->mcf_form_id, 0); + } + else{ + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr, "missing FORM in sentence, skipping it\n"); + invalid_sentence = 1; + } + } + } + + code_lemma = -1; + if(ctx->use_lemmas){ + if(maca_mcf_word_get_nb_values(mcf_word, ctx->mcf_lemma_id)){ + code_lemma = maca_mcf_word_get_value_int(mcf_word, ctx->mcf_lemma_id, 0); + } + } + + code_postag = -1; + if (maca_mcf_word_get_nb_values(mcf_word, ctx->mcf_postag_id)) { + code_postag = maca_mcf_word_get_value_int(mcf_word, ctx->mcf_postag_id, 0); + } else { + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr, "missing POSTAG in sentence, 
skipping it\n"); + invalid_sentence = 1; + } + } + + gov = -1; + if (maca_mcf_word_get_nb_values(mcf_word, ctx->mcf_head_id)) { + gov = maca_mcf_word_get_value_int(mcf_word, ctx->mcf_head_id, 0); + } else { + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr, "missing HEAD in sentence, skipping it\n"); + invalid_sentence = 1; + } + } + + code_label = -1; + if (maca_mcf_word_get_nb_values(mcf_word, ctx->mcf_deprel_id)) { + code_label = maca_mcf_word_get_value_int(mcf_word, ctx->mcf_deprel_id, 0); + } else { + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr, "missing DEPREL in sentence, skipping it\n"); + invalid_sentence = 1; + } + } + + synt_feats = NULL; + nb_synt_feats = 0; + if(ctx->subcat_features){ + nb_synt_feats = maca_mcf_word_get_nb_values(mcf_word, ctx->mcf_subcat_id); + if (nb_synt_feats) { + synt_feats = malloc(nb_synt_feats * sizeof(int)); + for (k = 0; k < nb_synt_feats; k++) { + synt_feats[k] = maca_mcf_word_get_value_int(mcf_word, ctx->mcf_subcat_id, k); + } + } + } + + /* Sentence is invalid, clear it and search for another one */ + if (invalid_sentence) { + maca_graph_parser_sentence_clear(s); + break; + } + + maca_graph_parser_sentence_add_word(ctx, s, NULL, code_form, code_lemma, + code_postag, gov, code_label, + nb_synt_feats, synt_feats); + } + } while (invalid_sentence); + return mcf_sentence; +} + +void maca_graph_parser_sentence_fill_mcf_output(maca_mcf_sentence *mcf_sent, + maca_graph_parser_ctx *ctx, + maca_graph_parser_sentence *s) { + int k; + maca_mcf_word *mcf_word; + char buffer[128]; + for (k = 1; k < s->l; k++) { + mcf_word = maca_mcf_sentence_get_word(mcf_sent, k); + maca_mcf_word_clear_values(mcf_word, ctx->mcf_head_id); + maca_mcf_word_add_value_int(mcf_word, ctx->mcf_head_id, s->gov[k]); + maca_mcf_word_clear_values(mcf_word, ctx->mcf_deprel_id); + maca_alphabet_get_symbol(ctx->labels_alphabet, s->label[k], buffer, sizeof(buffer)); + 
maca_mcf_word_add_value_alphabet(mcf_word, ctx->mcf_deprel_id, buffer, ctx->labels_alphabet); + } +} diff --git a/maca_graph_parser/maca_graph_parser_sentence.h b/maca_graph_parser/maca_graph_parser_sentence.h new file mode 100644 index 0000000000000000000000000000000000000000..3fde8962e7cda5d58cd6443c13e14772a9c7d46e --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_sentence.h @@ -0,0 +1,58 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_SENTENCE__ +#define __MACA_GRAPH_PARSER_SENTENCE__ + +#include "maca_common.h" +#include "maca_mcf_wrapper.h" +#include "maca_constants.h" +//#include "maca_tags.h" +#include "maca_msg.h" +#include "maca_graph_parser.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +maca_graph_parser_sentence *maca_graph_parser_allocate_sentence(maca_graph_parser_ctx *ctx); +void maca_graph_parser_free_sentence(maca_graph_parser_sentence *s); +void maca_graph_parser_sentence_clear (maca_graph_parser_sentence *s); +void maca_graph_parser_sentence_add_word(maca_graph_parser_ctx * ctx, maca_graph_parser_sentence *s, void* adr, int word, int lemma, int pos, int gov, int label, int synt_feats_nb, int *synt_feats_array); +void maca_graph_parser_sentence_print_sentence(maca_graph_parser_ctx * ctx, maca_graph_parser_sentence *s); +maca_graph_parser_sentence *maca_graph_parser_duplicate(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *sent, maca_graph_parser_sentence *copy); +maca_graph_parser_sentence *maca_graph_parser_duplicate_sentence(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *sent, maca_graph_parser_sentence *copy); + +void maca_graph_parser_load_sentence(maca_graph_parser_ctx * ctx, maca_graph_parser_sentence *s); +void maca_graph_parser_update_sentence(maca_graph_parser_ctx * ctx, maca_graph_parser_sentence *s); + +/* kbest */ +maca_graph_parser_sentence_kbest *maca_graph_parser_allocate_sentence_kbest(maca_graph_parser_ctx *ctx); +void maca_graph_parser_free_sentence_kbest(maca_graph_parser_sentence_kbest *kb); +void maca_graph_parser_reset_sentence_kbest(maca_graph_parser_sentence_kbest *kb); +maca_mcf_sentence *maca_graph_parser_read_mcf_sentence(maca_graph_parser_ctx *ctx, maca_mcf *format, maca_graph_parser_sentence *s); + +void maca_graph_parser_sentence_fill_mcf_output(maca_mcf_sentence *mcf_sent, maca_graph_parser_ctx *ctx, 
maca_graph_parser_sentence *s); +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_train.c b/maca_graph_parser/maca_graph_parser_train.c new file mode 100644 index 0000000000000000000000000000000000000000..76df607eb6d128d964a931f9bf70c4ce14446a98 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_train.c @@ -0,0 +1,201 @@ +/*************************************************************************** + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. +**************************************************************************/ + +#include "maca_graph_parser_train.h" + +#include "maca_graph_parser_sentence.h" +#include "maca_graph_parser_alphabet.h" +#include "maca_graph_parser_decoder.h" +#include "maca_graph_parser_metrics.h" +#include "maca_graph_parser_features.h" +#include "maca_graph_parser_hash.h" +#include "maca_graph_parser_model.h" +// #include <inttypes.h> + + +void update_weights(maca_graph_parser_ctx *ctx, maca_graph_parser_sentence *ref_s, maca_graph_parser_sentence *hyp_s, feature_counter_array *ref_feat_cnt_array, feature_counter_array *hyp_feat_cnt_array, int upd){ + /** + * Update the weights in ctx->model->feat_ht. 
+ */ + + /* check for "non-violation", i.e. score(hyp) < score(ref) */ + /* with the projective decoder, this can happen + if ref is non-projective */ + /* TODO: check that it only happens in this case */ + float score_ref = score_feature_counter_array(ref_feat_cnt_array, ctx->model); + float score_hyp = score_feature_counter_array(hyp_feat_cnt_array, ctx->model); + if(score_hyp < score_ref){ + /* baseline strategy: skip update */ + return; + } + + if(ctx->verbose_flag > 4){ + switch(ctx->algorithm) { + case PERCEPTRON_TRAINING: fprintf(stderr, ">>> perceptron update\n"); break; + case MIRA_TRAINING: fprintf(stderr, ">>> mira update\n"); break; + case ADAGRAD_TRAINING: fprintf(stderr, ">>> adagrad update\n"); break; + } + } + + /* learning rate (= step size) */ + double alpha = 1.0; /* default value is 1 */ + /* ref feature counter - hyp feature counter */ + feature_counter_array *ref_minus_hyp = feature_counter_array_difference(ctx, ref_feat_cnt_array, hyp_feat_cnt_array); + + if(ctx->algorithm == MIRA_TRAINING) { /* MIRA adapts step size */ + double C_aggr = 1.0; /* aggressiveness parameter */ + + float lam_dist = score_ref - score_hyp; /* lam_dist is <= 0 */ + /* cost of choosing a wrong hyp over the ref ; after update, the + minimal margin between ref and hyp must be >= this cost */ + double err = maca_graph_parser_sentence_errors(ref_s, hyp_s) + 1; /* why + 1 ? */ /* err is > 0 */ + /* structured loss */ + float b = (float)err - lam_dist; /* b is > 0 */ + /* square norm of the distance vector */ + int dist = feature_counter_array_squared_norm(ref_minus_hyp); + /* PA-I update: alpha = min(C_aggr, b/dist) */ + alpha = (dist == 0)? 
0.0 : ((double) b / (double) dist); /* default was 1 */ + if(alpha > C_aggr) alpha = C_aggr; + if(alpha < 0) alpha = 0.0; /* useless as at this point, b and dist are both guaranteed >= 0 */ + + if(ctx->verbose_flag > 4) fprintf(stderr, "score ref = %f score hyp = %f diff = %f err = %lf dist = %d alpha = %lf\n", score_ref, score_hyp, lam_dist, err, dist, alpha); + } + + /* update features */ + int i, j; + feature_counter *fc; + feature_t ft; + int c; + int index; + double new_param; + + for(i=0; i < ref_minus_hyp->size; i++){ + fc = ref_minus_hyp->array[i]; + if(fc){ + for(j=0; j < fc->size; j++){ + c = fc->values[j]; /* feature count */ + /* update weight of features with non-zero counts */ + if(c != 0){ + ft = fc->keys[j]; + index = recherche_hash_index(ctx->model->feat_ht, ft); + /* printf(" key: %" PRIu64 "\n", f); */ + if(index == -1){ /* init new feature */ + if(ctx->algorithm == ADAGRAD_TRAINING) { + range_hash(ctx->model->feat_ht, ft, alpha, (c*c)); + } else { + range_hash(ctx->model->feat_ht, ft, (alpha*c), (upd*alpha*c)); + } + } else { + if(ctx->algorithm == ADAGRAD_TRAINING) { + ctx->model->feat_ht->total[index] += (c*c); + new_param = (double) ctx->model->feat_ht->params[index] + (alpha*c) / sqrt((double) ctx->model->feat_ht->total[index]); + if(!isnan(new_param)) ctx->model->feat_ht->params[index] = (float) new_param; + } else { + ctx->model->feat_ht->params[index] += (alpha*c); + ctx->model->feat_ht->total[index] += (upd*alpha*c); + } + } + /* printf(" ref index %d: (params %f, total %f)\n", index, ctx->model->feat_ht->params[index], ctx->model->feat_ht->total[index]); */ + } + } + } + } + + /* cleanup */ + if(ref_minus_hyp){ + free_feature_counter_array(ref_minus_hyp); + ref_minus_hyp = NULL; + } +} + + +void maca_graph_parser_train(maca_graph_parser_ctx *ctx, hyp_ref_vector *corpus){ + /** + * Train a parsing model on corpus. 
+ */ + + int i; + int sent_id; + int upd; /* update counter */ + int l; + double e; /* error(ref, hyp) */ + double score = 0.0; + + maca_graph_parser_sentence *ref_s = NULL; + maca_graph_parser_sentence *hyp_s = NULL; + + feature_counter_array *hyp_feat_cnt_array = NULL; + feature_counter_array *ref_feat_cnt_array = NULL; + + if(corpus == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "maca_graph_parser_train: corpus is NULL\n"); + exit(1); + } + + upd = (ctx->I * corpus->size) + 1; + + for(i=0; i < ctx->I; i++){ + /* TODO: permute sentences in each iteration */ + for(sent_id=0; sent_id < corpus->size; sent_id++){ + ref_s = corpus->ref[sent_id]; + hyp_s = corpus->hyp[sent_id]; + + /* decrease update counter */ + upd--; + + /* parse */ + maca_graph_parser_decoder_parse(ctx, hyp_s); + + /* if hyp != ref, update weights */ + e = maca_graph_parser_sentence_errors(ref_s, hyp_s); + if (e > 0.0){ + ref_feat_cnt_array = extract_features_from_parse_fca(ctx, ref_s, ref_feat_cnt_array); + hyp_feat_cnt_array = extract_features_from_parse_fca(ctx, hyp_s, hyp_feat_cnt_array); + + update_weights(ctx, ref_s, hyp_s, ref_feat_cnt_array, hyp_feat_cnt_array, upd); + } + + /* progress monitoring */ + score = maca_graph_parser_sentence_compute_las(ref_s, hyp_s, 0); + printf("[%d][%d] %d %lf %d\n", i, sent_id, (ref_s->l - 1), score, ctx->model->feat_ht->nbelem); + } + /* TODO: average weights after each iteration */ + } + + /* final weight averaging */ + l = ctx->I * corpus->size; + if(ctx->algorithm == PERCEPTRON_TRAINING || ctx->algorithm == MIRA_TRAINING) { + for(i=0; i < ctx->model->feat_ht->taille; i++){ + ctx->model->feat_ht->params[i] = ctx->model->feat_ht->total[i] / l; + /* if(ctx->model->feat_ht->params[i] != 0) + fprintf(stderr, "feature weight = %f\n", ctx->model->feat_ht->params[i]);*/ + } + } + + /* free used feature matrices */ + if(hyp_feat_cnt_array){ + free_feature_counter_array(hyp_feat_cnt_array); + hyp_feat_cnt_array = NULL; + } + 
if(ref_feat_cnt_array){ + free_feature_counter_array(ref_feat_cnt_array); + ref_feat_cnt_array = NULL; + } +} diff --git a/maca_graph_parser/maca_graph_parser_train.h b/maca_graph_parser/maca_graph_parser_train.h new file mode 100644 index 0000000000000000000000000000000000000000..13b80e40176117a7f7bb4bd532f86cd4fe748ac6 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_train.h @@ -0,0 +1,41 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. 
+*******************************************************************************/ + +#ifndef __MACA_GRAPH_PARSER_TRAIN__ +#define __MACA_GRAPH_PARSER_TRAIN__ + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_graph_parser.h" +#include "maca_graph_parser_corpora.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +void maca_graph_parser_train(maca_graph_parser_ctx * ctx, hyp_ref_vector *corpus); + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/maca_graph_parser/maca_graph_parser_train_main.c b/maca_graph_parser/maca_graph_parser_train_main.c new file mode 100644 index 0000000000000000000000000000000000000000..9d977990d6e8891247f97efe68640dbcb50d0415 --- /dev/null +++ b/maca_graph_parser/maca_graph_parser_train_main.c @@ -0,0 +1,75 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + This file is part of maca_graph_parser. + + maca_graph_parser is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_graph_parser is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_graph_parser. If not, see <http://www.gnu.org/licenses/>. 
+**************************************************************************/ + +#include<getopt.h> +#include<unistd.h> +#include<stdio.h> +#include<string.h> + +#include "maca_common.h" +#include "maca_constants.h" +#include "maca_graph_parser_model.h" +#include "maca_graph_parser_train.h" +#include "maca_graph_parser_features.h" + +/*-------------------------------------------------------------------------------------------*/ + +int main(int argc, char **argv) +{ + + /* sinon lecture des options dans la ligne de commande */ + maca_graph_parser_ctx* ctx; + ctx = maca_graph_parser_LoadCTX(argc,argv); + + /* load training corpus */ + if(ctx->verbose_flag > 1){ + maca_msg(ctx->module, MACA_MESSAGE); + fprintf(stderr, "loading training corpus\n"); + } + hyp_ref_vector *corpus = load_mcf_corpus(ctx); + fprintf(stderr, "Corpus size: %d\n", corpus->size); + + /* preprocessing: relabel rare dependencies */ + int sent_id = 0; + for(sent_id=0; sent_id < corpus->size; sent_id++){ + maca_graph_parser_sentence_relabel_rare_deps(ctx, corpus->ref[sent_id]); + } + + /* extractor_allocator() cannot be created earlier as it needs the + complete alphabet, which is built during load_conll_corpus + FIXME: properly (semi-)freeze the alphabet, + cf. 
extractor_allocator() + */ + ctx->e = maca_graph_parser_templ_library_allocator(ctx); + /* dump model */ + maca_graph_parser_train(ctx, corpus); + maca_graph_parser_model_dump(ctx, ctx->model, ctx->model_file_name, ctx->produce_hash_model); + /* maca_graph_parser_alphabet_print(ctx->alphabet_file_name, ctx->alphabet); */ + maca_graph_parser_dep_count_table_print(ctx, ctx->dep_count_table_file_name); + + /* free corpus data */ + free_corpus(corpus); + + maca_graph_parser_free_all(ctx); + + return 0; +} + +/*-------------------------------------------------------------------------------------------*/ + diff --git a/maca_graph_parser/maca_mcf.cc b/maca_graph_parser/maca_mcf.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3b44bba1e62e16af61e1eeabc0dea8230a12373 --- /dev/null +++ b/maca_graph_parser/maca_mcf.cc @@ -0,0 +1,592 @@ +#include "maca_mcf.hh" + +// int main(int argc, char **argv) { +// // std::vector<std::shared_ptr<macaon::McfValue>> array; + +// // std::shared_ptr<macaon::McfValueInt> iValue(new macaon::McfValueInt(42)); +// // //std::shared_ptr<macaon::McfValue> value = iValue; +// // array.push_back(std::shared_ptr<macaon::McfValue>(iValue)); + +// // std::cout << "array[0]: " << array[0].get()->toString() << std::endl; + + +// // std::list<std::unique_ptr<macaon::McfValue>> list; +// // std::unique_ptr<macaon::McfValue> value(new macaon::McfValueInt(42)); +// // list.push_back(std::move(value)); + +// // std::cout << list.front().get()->toString() << std::endl; + +// // macaon::McfSentence sentence(3); + +// // macaon::McfWord &word = sentence.addWord(); +// // word.addValueString(0,"BAR"); +// // word.addValueString(0,"FOO"); +// // word.addValueFloat(2,42.4); + +// // float f = word.getFirstFloatValue(2); + +// // std::cout << "f: " << f << "\n"; + +// // word.print(); +// // std::cout << "\n"; + +// if (argc - 1 != 2) { +// std::cerr << "Usage: " << argv[0] << " ftb alpha\n"; +// return 1; +// } +// std::string ftb(argv[1]); 
// Parse the textual form of a boolean.
// "0", "False" and "false" give false; "1", "True" and "true" give true.
// Any other spelling (the match is case-sensitive) throws std::invalid_argument.
bool to_bool(std::string str) {
  static const char *const falseSpellings[] = {"0", "False", "false"};
  static const char *const trueSpellings[] = {"1", "True", "true"};

  for (const char *spelling : falseSpellings) {
    if (str == spelling)
      return false;
  }
  for (const char *spelling : trueSpellings) {
    if (str == spelling)
      return true;
  }

  throw std::invalid_argument("Can't convert given string to bool");
}
&ptrOther : vec) { + std::unique_ptr<McfValue> ptr(ptrOther.get()->clone()); + word[i].push_back(std::move(ptr)); + } + } + return *this; + } + + int McfWord::getId() { + return id; + } + + void McfWord::addValueInt(int column, int value) { + std::unique_ptr<McfValue> ptr(new McfValueInt(value)); + word.at(column).push_back(std::move(ptr)); + } + + void McfWord::addValueFloat(int column, float value) { + std::unique_ptr<McfValue> ptr(new McfValueFloat(value)); + word.at(column).push_back(std::move(ptr)); + } + + void McfWord::addValueString(int column, std::string value) { + std::unique_ptr<McfValue> ptr(new McfValueString(value)); + word.at(column).push_back(std::move(ptr)); + } + + void McfWord::addValueBool(int column, bool value) { + std::unique_ptr<McfValue> ptr(new McfValueBool(value)); + word.at(column).push_back(std::move(ptr)); + } + + void McfWord::addValueAlphabet(int column, std::string symbol, std::shared_ptr<Alphabet> a) { + std::unique_ptr<McfValue> ptr(new McfValueAlphabet(symbol, a)); + word.at(column).push_back(std::move(ptr)); + } + + std::list<int> McfWord::getIntValues(int column) { + std::vector<std::unique_ptr<McfValue>> &mcfValues = word.at(column); + std::list<int> values; + + for (std::unique_ptr<McfValue> &mcfVal : mcfValues) { + McfValueInt *ptrVal = dynamic_cast<McfValueInt*>(mcfVal.get()); + values.push_back(ptrVal->i); + } + + return values; + } + + std::list<float> McfWord::getFloatValues(int column) { + std::vector<std::unique_ptr<McfValue>> &mcfValues = word.at(column); + std::list<float> values; + + for (std::unique_ptr<McfValue> &mcfVal : mcfValues) { + McfValueFloat *ptrVal = dynamic_cast<McfValueFloat*>(mcfVal.get()); + values.push_back(ptrVal->f); + } + + return values; + } + + std::list<std::string> McfWord::getStringValues(int column) { + std::vector<std::unique_ptr<McfValue>> &mcfValues = word.at(column); + std::list<std::string> values; + + for (std::unique_ptr<McfValue> &mcfVal : mcfValues) { + McfValueString *ptrVal = 
dynamic_cast<McfValueString*>(mcfVal.get()); + values.push_back(ptrVal->s); + } + + return values; + } + + std::list<bool> McfWord::getBoolValues(int column) { + std::vector<std::unique_ptr<McfValue>> &mcfValues = word.at(column); + std::list<bool> values; + + for (std::unique_ptr<McfValue> &mcfVal : mcfValues) { + McfValueBool *ptrVal = dynamic_cast<McfValueBool*>(mcfVal.get()); + values.push_back(ptrVal->b); + } + + return values; + } + + int McfWord::getNbValues(int column) { + return word.at(column).size(); + } + + int &McfWord::getIntValue(int column, int index) { + std::unique_ptr<McfValue> &mcfValue = word.at(column).at(index); + return dynamic_cast<McfValueInt*>(mcfValue.get())->i; + } + + float &McfWord::getFloatValue(int column, int index) { + std::unique_ptr<McfValue> &mcfValue = word.at(column).at(index); + return dynamic_cast<McfValueFloat*>(mcfValue.get())->f; + } + + std::string &McfWord::getStringValue(int column, int index) { + std::unique_ptr<McfValue> &mcfValue = word.at(column).at(index); + return dynamic_cast<McfValueString*>(mcfValue.get())->s; + } + + bool &McfWord::getBoolValue(int column, int index) { + std::unique_ptr<McfValue> &mcfValue = word.at(column).at(index); + return dynamic_cast<McfValueBool*>(mcfValue.get())->b; + } + + void McfWord::clearValues(int column) { + word.at(column).clear(); + } + + void McfWord::print(std::ostream &output) { + bool firstColumn = true; + for (std::vector<std::unique_ptr<McfValue>> &v : word) { + if (!firstColumn) + output << "\t"; + if (v.empty()) { + output << "_"; + } else { + bool firstInList = true; + for (std::unique_ptr<McfValue> &ptr : v) { + if (!firstInList) + output << "|"; + output << ptr.get()->toString(); + firstInList = false; + } + } + firstColumn = false; + } + output << "\n"; + } + + void McfWord::print(FILE *output) { + bool firstColumn = true; + for (std::vector<std::unique_ptr<McfValue>> &v : word) { + if (!firstColumn) + fprintf(output,"\t"); + if (v.empty()) { + 
fprintf(output,"_"); + } else { + bool firstInList = true; + for (std::unique_ptr<McfValue> &ptr : v) { + if (!firstInList) + fprintf(output,"|"); + fprintf(output,"%s",ptr.get()->toString().c_str()); + firstInList = false; + } + } + firstColumn = false; + } + fprintf(output,"\n"); + } + + McfSentence::McfSentence(int nbColumns): nbColumns(nbColumns), length(0) { + addWord(); + } + + McfWord &McfSentence::addWord() { + sentence.push_back(McfWord(length, nbColumns)); + length++; + return sentence.back(); + } + + McfWord &McfSentence::getWord(int i) { + return sentence.at(i); + } + + void McfSentence::clear() { + sentence.clear(); + addWord(); + length = 1; + } + + void McfSentence::print(std::ostream &output) { + for (std::size_t i = 1; i < sentence.size(); i++) { + sentence[i].print(output); + } + output << "\n"; + } + + void McfSentence::print(FILE *output) { + for (std::size_t i = 1; i < sentence.size(); i++) { + sentence[i].print(output); + } + fprintf(output,"\n"); + } + + McfSentences::McfSentences(): nbSentences(0) {} + + void McfSentences::addSentence(std::shared_ptr<McfSentence> s) { + sentences.push_back(s); + } + + std::shared_ptr<McfSentence> McfSentences::getSentence(int index) { + return sentences.at(index); + } + + std::shared_ptr<McfSentence> McfSentences::operator[](int index) { + return sentences[index]; + } + + int McfSentences::size() { + return nbSentences; + } + + void McfSentences::print(std::ostream &output) { + for (std::shared_ptr<McfSentence> &s : sentences) { + s->print(output); + } + } + + void McfSentences::print(FILE *output) { + for (std::shared_ptr<McfSentence> &s : sentences) { + s->print(output); + } + } + + McfColumn::McfColumn(std::string name, std::string typeName, int columnIdInFile): + name(name), typeName(typeName), alphabet(nullptr), columnIdInFile(columnIdInFile) {} + + void McfColumn::setAlphabet(std::shared_ptr<Alphabet> a) { + alphabet = a; + } + + Mcf::Mcf(std::istream &sinput): sinput(sinput), internalFile(false), 
parsedHeader(false) { + readHeader(); + } + + Mcf::Mcf(std::istream &sinput, std::string alphabetFilename): + sinput(sinput), internalFile(false), parsedHeader(false), alphaArray(alphabetFilename) { + readHeader(); + } + + Mcf::Mcf(std::istream &sinput, AlphabetArray &array): + sinput(sinput), internalFile(false), parsedHeader(false), alphaArray(array) { + readHeader(); + } + + Mcf::Mcf(std::string filename): + sinput(*(new std::ifstream(filename))), internalFile(true), parsedHeader(false) { + readHeader(); + } + + Mcf::Mcf(std::string filename, std::string alphabetFilename): + sinput(*(new std::ifstream(filename))), internalFile(true), parsedHeader(false), alphaArray(alphabetFilename) { + readHeader(); + } + + Mcf::Mcf(std::string filename, AlphabetArray &array): + sinput(*(new std::ifstream(filename))), internalFile(true), parsedHeader(false), alphaArray(array) { + readHeader(); + } + + Mcf::~Mcf() { + if(internalFile) + delete &sinput; + } + + void Mcf::readHeader() { + if (parsedHeader) { + throw std::runtime_error("Already read header once!"); + } + + std::string line; + char firstChar = sinput.get(); + if (firstChar == '#') { + if (!std::getline(sinput, line)) { + throw std::runtime_error("Couldn't read header (EOF) !"); + } + } else { /* conll07 compatibility */ + sinput.unget(); + line = "ID@INT\tFORM@WORDS\tLEMMA@WORDS\tCPOSTAG@STRING\tPOSTAG@POS\tFEATS@MORPHO\t" + "HEAD@INT\tDEPREL@LABELS\tPHEAD@STRING\tPDEPREL@STRING"; + } + + std::istringstream iss(line); + std::string token; + int column = 0; + while (std::getline(iss, token, '\t')) { + std::string name; + std::string typeName; + std::size_t pos = token.find('@'); + if (pos == std::string::npos) { + name = token; + typeName = MCF_COLUMN_TYPE_STRING; + } else { + name = token.substr(0, pos); + typeName = token.substr(pos+1); + } + + columns.push_back(McfColumn(name, typeName, column)); + col2Index[name] = column; + column++; + } + + nbInputColumns = column; + parsedHeader = true; + } + + int 
Mcf::input(std::string columnName) { + std::map<std::string,int>::iterator it = col2Index.find(columnName); + if (it == col2Index.end()) { + throw std::runtime_error("Can't register a non-existant column!"); + } + McfColumn &column = columns[it->second]; + std::string &columnType = column.typeName; + if (columnType != MCF_COLUMN_TYPE_STRING && + columnType != MCF_COLUMN_TYPE_INTEGER && + columnType != MCF_COLUMN_TYPE_FLOAT && + columnType != MCF_COLUMN_TYPE_BOOL) { // We have an alphabet then + if (alphaArray.has(columnType)) { + column.setAlphabet(alphaArray[columnType]); + } else { + std::shared_ptr<Alphabet> a(new Alphabet(columnType)); + alphaArray.addAlphabet(a); + column.setAlphabet(a); + } + } + + registeredColumns[columnName] = it->second; + + return it->second; + } + + int Mcf::output(std::string columnName, std::string columnType) { + int columnId = columns.size(); + + std::map<std::string,int>::iterator it = col2Index.find(columnName); + if (it != col2Index.end()) + throw std::runtime_error("Column has already been registered!"); + + columns.push_back(McfColumn(columnName, columnType, -1)); + col2Index[columnName] = columnId; + registeredColumns[columnName] = columnId; + + if (columnType != MCF_COLUMN_TYPE_STRING && + columnType != MCF_COLUMN_TYPE_INTEGER && + columnType != MCF_COLUMN_TYPE_FLOAT && + columnType != MCF_COLUMN_TYPE_BOOL) { // We have an alphabet then + if (alphaArray.has(columnType)) { + columns[columnId].setAlphabet(alphaArray[columnType]); + } else { + std::shared_ptr<Alphabet> a(new Alphabet(columnType)); + alphaArray.addAlphabet(a); + columns[columnId].setAlphabet(a); + } + } + + return columnId; + } + + McfColumn &Mcf::getColumnInfo(int id) { + return columns.at(id); + } + + McfColumn &Mcf::getColumnInfo(std::string name) { + std::map<std::string,int>::iterator it = col2Index.find(name); + if (it == col2Index.end()) + throw std::invalid_argument("Column does not exist!"); + return columns[it->second]; + } + + bool 
Mcf::readLine(std::string &line, McfSentence &sentence, bool sentenceValid) { + std::istringstream iss(line); + std::string token; + int columnId = 0; + McfWord &word = sentence.addWord(); + while (getline(iss, token, '\t')) { + if (columnId >= nbInputColumns) { + sentenceValid = false; + std::cerr << "Warning: Line has too many columns!\n"; + break; + } + + if (token != "_") { + std::istringstream subiss(token); + std::string subtoken; + McfColumn &column = columns[columnId]; + while (getline(subiss, subtoken, '|')) { + if (column.typeName == MCF_COLUMN_TYPE_STRING) { + word.addValueString(columnId, subtoken); + } else if (column.typeName == MCF_COLUMN_TYPE_INTEGER) { + word.addValueInt(columnId, std::stoi(subtoken)); + } else if (column.typeName == MCF_COLUMN_TYPE_FLOAT) { + word.addValueFloat(columnId, std::stof(subtoken)); + } else if (column.typeName == MCF_COLUMN_TYPE_BOOL) { + word.addValueBool(columnId, to_bool(subtoken)); + } else { + if (column.alphabet) { + try { + word.addValueAlphabet(columnId, subtoken, column.alphabet); + } catch (std::exception &e) { + sentenceValid = false; + std::cerr << "Warning: Couldn't add '" << subtoken + << "' to alphabet (maybe locked?)!\n"; + } + } else { + word.addValueString(columnId, subtoken); + } + } + } + } + columnId++; + } + if (columnId != nbInputColumns) { + sentenceValid = false; + std::cerr << "Warning: Line has too few columns!\n"; + } + + return sentenceValid; + } + + std::shared_ptr<McfSentence> Mcf::readNextSentence() { + bool sentenceValid = true; + std::string line; + + if (sinput.eof()) { + return nullptr; + } + + std::shared_ptr<McfSentence> sentence(new McfSentence(columns.size())); + + while (getline(sinput, line)) { + if (line.empty()) { + if (sentence->length > 1) { + if (sentenceValid) { + break; + } else { + sentence->clear(); + sentenceValid = true; + std::cerr << "Warning: Skipping invalid sentence...\n"; + } + } + continue; + } + + sentenceValid = readLine(line, *sentence, sentenceValid); + } + 
+ if (!sentenceValid || sentence->length <= 1) { /* length <= 1 means no word was read (same test as in the loop), e.g. the input ended: there is no sentence to return */ + return nullptr; + } + + return sentence; + } + + + std::shared_ptr<McfSentences> Mcf::readSentences() { + std::shared_ptr<McfSentences> sentences(new McfSentences()); + + std::shared_ptr<McfSentence> sentence; + while ((sentence = readNextSentence())) { + sentences->addSentence(sentence); + } + + return sentences; + } + + void Mcf::printHeader(std::ostream &output) { + output << "#"; + bool first = true; + for (McfColumn &column : columns) { + if (!first) + output << "\t"; + output << column.name << "@" << column.typeName; + first = false; + } + output << "\n"; + } + + void Mcf::printHeader(FILE* output) { + fprintf(output, "#"); + bool first = true; + for (McfColumn &column : columns) { + if (!first) + fprintf(output, "\t"); + fprintf(output, "%s@%s", column.name.c_str(), column.typeName.c_str()); + first = false; + } + fprintf(output, "\n"); + } + + void Mcf::dumpAlphabets(std::ostream &output) { + alphaArray.dump(output); + } + + void Mcf::dumpAlphabets(std::string filename) { + std::ofstream output(filename); + if (!output.is_open()) + throw std::runtime_error("couldn't open alphabets file!"); + dumpAlphabets(output); + output.close(); + } +} + diff --git a/maca_graph_parser/maca_mcf.hh b/maca_graph_parser/maca_mcf.hh new file mode 100644 index 0000000000000000000000000000000000000000..1c8833c3d87eee554af2cbdc4efa9f66a7714ae8 --- /dev/null +++ b/maca_graph_parser/maca_mcf.hh @@ -0,0 +1,628 @@ +#ifndef __MACA_MCF_H__ +#define __MACA_MCF_H__ + +#include <iostream> +#include <sstream> +#include <fstream> +#include <string> +#include <vector> +#include <map> +#include <list> +#include <memory> +#include <cstdio> +#include "maca_alphabet.hh" + +#define MCF_COLUMN_TYPE_STRING "STRING" +#define MCF_COLUMN_TYPE_INTEGER "INT" +#define MCF_COLUMN_TYPE_FLOAT "FLOAT" +#define MCF_COLUMN_TYPE_BOOL "BOOL" + +namespace macaon { + class McfValue { + public: + virtual ~McfValue() {} /* values are owned and destroyed through std::unique_ptr<McfValue> */ + virtual std::string toString() = 0; + virtual McfValue *clone() const = 0; + }; + + template 
<class Derived> + class McfValueDerivationHelper : public McfValue { + public: + virtual McfValue* clone() const { + return new Derived(static_cast<const Derived&>(*this)); // call the copy ctor. + } + }; + + class McfValueInt: public McfValueDerivationHelper<McfValueInt> { + public: + int i; + + McfValueInt(int i): i(i) {} + + std::string toString() { + return std::to_string(i); + } + }; + + class McfValueFloat: public McfValueDerivationHelper<McfValueFloat> { + public: + float f; + + McfValueFloat(float f): f(f) {} + + std::string toString() { + std::ostringstream ss; /* We have to do that because std::to_string makes us */ + ss << f; /* lose precision */ + return ss.str(); + } + }; + + class McfValueString: public McfValueDerivationHelper<McfValueString> { + public: + std::string s; + + McfValueString(std::string s): s(s) {} + + std::string toString() { + return s; + } + }; + + class McfValueBool: public McfValueDerivationHelper<McfValueBool> { + public: + bool b; + + McfValueBool(bool b) : b(b) {} + + std::string toString() { + if (b) + return "1"; + return "0"; + } + }; + + class McfValueAlphabet: public McfValueInt { + private: + std::shared_ptr<Alphabet> alphabet; + + public: + McfValueAlphabet(std::string symbol, std::shared_ptr<Alphabet> alphabet): + McfValueInt(-1), alphabet(alphabet) { + + try{ + i = alphabet->addSymbol(symbol); + } catch (const std::exception &e){ /* by reference: catching a polymorphic exception by value slices it */ + throw std::runtime_error("Can't create alphabet value!"); + } + + } + + std::string toString() { + return alphabet->getSymbol(i); + } + + virtual McfValue* clone() const { + return new McfValueAlphabet(static_cast<const McfValueAlphabet&>(*this)); + } + }; + + + class McfWord { + private: + int id; /**< Index of the word in its sentence */ + std::vector<std::vector<std::unique_ptr<McfValue>>> word; + + public: + /** + * Constructor of McfWord. + * + * @param id the index of the word in its sentence. + * @param nbColumns the number of columns the word has. 
+ */ + McfWord(int id, int nbColumns) : id(id), word(nbColumns) {} + + /** + * Copy constructor of McfWord. + * + * @param other the other McfWord to copy from. + */ + McfWord(const McfWord &other); + + /** + * Assignement operator of McfWord. + * + * @param other the other McfWord to assign from. + * + * @return the McfWord to assign. + */ + McfWord &operator =(const McfWord &other); + + /** + * Gets the index of the word in its sentence. + * + * + * @return the index; + */ + int getId(); + + /** + * Adds an integer value to the word at the specified column. + * + * @param column the id of the column. + * @param value the integer value to add. + */ + void addValueInt(int column, int value); + + /** + * Adds a float value to the word at the specified column. + * + * @param column the id of the column. + * @param value the float value to add. + */ + void addValueFloat(int column, float value); + + /** + * Adds a string value to the word at the specified column. + * + * @param column the id of the column. + * @param value the string value to add. + */ + void addValueString(int column, std::string value); + + /** + * Adds a boolean value to the word at the specified column. + * + * @param column the id of the column. + * @param value the boolean value to add. + */ + void addValueBool(int column, bool value); + + /** + * Adds a symbol of an alphabet to the word at the specified column. + * + * @param column the id of the column. + * @param symbol the symbol value to add. + * @param a the alphabet to use to convert the symbol into an integer. + * + * @note once stored in the word, these values act as integers. + * + * @see getIntValue() + * @see getIntValues() + */ + void addValueAlphabet(int column, std::string symbol, std::shared_ptr<Alphabet> a); + + /** + * Gets an integer list of the stored values from the specified column. + * + * @param column the id of the column. + * + * @return a list of integer values. 
+ * + * @note only use this method if stored values are integers! + * @note if performance is a concern, prefer the get*Value() alternatives. + * + * @see getIntValue() + */ + std::list<int> getIntValues(int column); + + /** + * Gets a float list of the stored values from the specified column. + * + * @param column the id of the column. + * + * @return a list of float values. + * + * @note only use this method if stored values are floats! + * @note if performance is a concern, prefer the get*Value() alternatives. + * + * @see getFloatValue() + */ + std::list<float> getFloatValues(int column); + + /** + * Gets a string list of the stored values from the specified column. + * + * @param column the id of the column. + * + * @return a list of string values. + * + * @note only use this method if stored values are strings! + * @note if performance is a concern, prefer the get*Value() alternatives. + * + * @see getStringValue() + */ + std::list<std::string> getStringValues(int column); + + /** + * Gets a boolean list of the stored values from the specified column. + * + * @param column the id of the column. + * + * @return a list of boolean values. + * + * @note only use this method if stored values are booleans! + * @note if performance is a concern, prefer the get*Value() alternatives. + * + * @see getBoolValue() + */ + std::list<bool> getBoolValues(int column); + + /** + * Gets the number of values stored in the specified column. + * + * @param column the id of the column. + * + * @return the number of stored values. + * + * @note it's recommended to use this method before calling any get*Value() method. + */ + int getNbValues(int column); + + /** + * Get an integer value from the specified column and index. + * + * @param column the id of the column. + * @param index the index of the value in the column. + * + * @return an integer value. 
+ * + * @see getNbValues() + */ + int &getIntValue(int column, int index); + + /** + * Get a float value from the specified column and index. + * + * @param column the id of the column. + * @param index the index of the value in the column. + * + * @return a float value. + * + * @see getNbValues() + */ + float &getFloatValue(int column, int index); + + /** + * Get a string value from the specified column and index. + * + * @param column the id of the column. + * @param index the index of the value in the column. + * + * @return a string value. + * + * @see getNbValues() + */ + std::string &getStringValue(int column, int index); + + /** + * Get a bool value from the specified column and index. + * + * @param column the id of the column. + * @param index the index of the value in the column. + * + * @return a bool value. + * + * @see getNbValues() + */ + bool &getBoolValue(int column, int index); + + /** + * Clears all the values in the specified column. + * + * @param column the id of the column to clear. + */ + void clearValues(int column); + + /** + * Prints the word into the specified output stream. + * + * @param output the output stream. + */ + void print(std::ostream &output=std::cout); + + /** + * Prints the word into the specified output file. + * + * @param output the output file. + */ + void print(FILE* output); + }; + + class McfSentence { + private: + int nbColumns; //!< Number of columns in the file + std::vector<McfWord> sentence; + + public: + int length; //!< Length of the sentence (starts at 1 because of a fake root) + + /** + * Constructor of McfSentence. + * + * @param nbColumns the number of columns in the sentence. + */ + McfSentence(int nbColumns); + + /** + * Adds a new word to the sentence. + * + * @return a reference to the added McfWord. + */ + McfWord &addWord(); + + /** + * Gets a word of the sentence. + * + * @param i the index of the word in the sentence. + * + * @return a referencen to a McfWord. 
+ */ + McfWord &getWord(int i); + + /** + * Clears the sentence. + * + */ + void clear(); + + /** + * Prints the sentence into the specified output stream. + * + * @param output the output stream. + */ + void print(std::ostream &output=std::cout); + + /** + * Prints the sentence into the specified output file. + * + * @param output the output file. + */ + void print(FILE* output); + }; + + class McfSentences { + private: + int nbSentences; + std::vector<std::shared_ptr<McfSentence>> sentences; + + public: + /** + * Constructor of McfSentences. + * + */ + McfSentences(); + + /** + * Adds a sentence. + * + * @param s the sentence to add. + */ + void addSentence(std::shared_ptr<McfSentence> s); + + /** + * Gets a sentence from its index. + * + * @param index the index of the sentence. + * + * @return the sentence. + */ + std::shared_ptr<McfSentence> getSentence(int index); + + /** + * Same as getSentence(). + */ + std::shared_ptr<McfSentence> operator[](int index); + + /** + * Gets the number of sentences. + * + * @return the number of sentences. + */ + int size(); + + /** + * Prints the sentences into the specified output stream. + * + * @param output the output stream. + */ + void print(std::ostream &output=std::cout); + + /** + * Prints the sentences into the specified output file. + * + * @param output the output file. + */ + void print(FILE* output); + }; + + class McfColumn { + public: + std::string name; /**< the name of the column */ + std::string typeName; /**< the type of the column */ + std::shared_ptr<Alphabet> alphabet; /**< the alphabet used by the column (if the type is an alphabet) */ + int columnIdInFile; /**< the position of the column in the file */ + + /** + * Constructor of McfColumn. + * + * @param name the name of the column. + * @param typeName the type of the column. + * @param columnIdInFile the position of the column in the file. 
+ */ + McfColumn(std::string name, std::string typeName, int columnIdInFile); + + /** + * Sets the alphabet of the column. + * + * @param a the alphabet. + */ + void setAlphabet(std::shared_ptr<Alphabet> a); + }; + + class Mcf { + public: + /** + * Constructor of Mcf. + * It also reads the header of the input. + * This constructor won't load any alphabets from a file. + * + * @param input the input mcf stream. + */ + Mcf(std::istream &input=std::cin); + + /** + * Constructor of Mcf. + * It also reads the header of the input. + * This constructor will load alphabets from the specified file. + * + * @param input the input mcf stream. + * @param alphabetFilename the name of the file which contains alphabets. + */ + Mcf(std::istream &input, std::string alphabetFilename); + + /** + * Constructor of Mcf. + * It also reads the header of the input. + * This constructor will use an already existing alphabet array. + * + * @param input the input mcf stream. + * @param array the alphabet array to use. + */ + Mcf(std::istream &input, AlphabetArray &array); + + /** + * Constructor of Mcf. + * It also reads the header of the input. + * This constructor won't load any alphabets from a file. + * + * @param filename the name of the mcf file. + */ + Mcf(std::string filename); + + /** + * Constructor of Mcf. + * It also reads the header of the input. + * This constructor will load alphabets from the specified file. + * + * @param filename the name of the mcf file. + * @param alphabetFilename the name of the file which contains alphabets. + */ + Mcf(std::string filename, std::string alphabetFilename); + + /** + * Constructor of Mcf. + * It also reads the header of the input. + * This constructor will use an already existing alphabet array. + * + * @param filename the name of the mcf file. + * @param array the alphabet array to use. + */ + Mcf(std::string filename, AlphabetArray &array); + + /** + * Destructor of Mcf. 
+ * + */ + ~Mcf(); + + /** + * Notifies that you are going to use a column. + * + * @param columnName the name of the column. + * + * @return the id of the column. + */ + int input(std::string columnName); + + /** + * Creates a new output column. + * + * @param columnName the name of the new column. + * @param columnType the type of the new column. + * + * @return the id of the new column. + * + * @note if type is an alphabet, it will try to use already existing alphabets + * if an alphabet with the same name exists. If not, it will create a new alphabet. + */ + int output(std::string columnName, std::string columnType=MCF_COLUMN_TYPE_STRING); + + /** + * Gets information about a column. + * + * @param id the id of the column. + * + * @return a McfColumn which contains information on the column. + */ + McfColumn &getColumnInfo(int id); + + /** + * Gets information about a column. + * + * @param name the name of the column. + * + * @return a McfColumn which contains information on the column. + */ + McfColumn &getColumnInfo(std::string name); + + /** + * Reads the next sentence in the stream. + * + * @return the read sentence, or nullptr if none were read. + */ + std::shared_ptr<McfSentence> readNextSentence(); + + /** + * Reads all the sentences of the stream. + * + * @return a McfSentences which contains all the sentences. + */ + std::shared_ptr<McfSentences> readSentences(); + + /** + * Prints the header into the specified output stream. + * + * @param output the output stream. + */ + void printHeader(std::ostream &output=std::cout); + + /** + * Prints the header into the specified output file. + * + * @param output the output file. + */ + void printHeader(FILE *output); + + /** + * Dumps the alphabets into the output stream. + * + * @param output the output stream. + */ + void dumpAlphabets(std::ostream &output); + + /** + * Dumps the alphabets into the specified file. + * + * @param filename the name of the file. 
+ */ + void dumpAlphabets(std::string filename); + + private: + std::istream &sinput; + bool internalFile; + int nbInputColumns; + std::vector<McfColumn> columns; + std::map<std::string,int> col2Index; + std::map<std::string,int> registeredColumns; + bool parsedHeader; + AlphabetArray alphaArray; + + bool readLine(std::string &line, McfSentence &sentence, bool sentenceValid); + /** + * Reads the header of the mcf file. + * + */ + void readHeader(); + }; +} +#endif /* MACA_MCF_H */ diff --git a/maca_graph_parser/maca_mcf_wrapper.cc b/maca_graph_parser/maca_mcf_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..74b08e603d571810938ff4cbd1133d58602862b2 --- /dev/null +++ b/maca_graph_parser/maca_mcf_wrapper.cc @@ -0,0 +1,232 @@ +#include "maca_mcf.hh" +#include "maca_mcf_wrapper.h" +#include <cstring> + +extern "C" { + int maca_mcf_word_get_id(maca_mcf_word *word) { + return word->getId(); + } + + void maca_mcf_word_add_value_int(maca_mcf_word *word, int column, int value) { + word->addValueInt(column, value); + } + + void maca_mcf_word_add_value_float(maca_mcf_word *word, int column, float value) { + word->addValueFloat(column, value); + } + + void maca_mcf_word_add_value_string(maca_mcf_word *word, int column, char* value) { + word->addValueString(column, value); + } + + void maca_mcf_word_add_value_bool(maca_mcf_word *word, int column, char value) { + word->addValueBool(column, value); + } + + char maca_mcf_word_add_value_alphabet(maca_mcf_word *word, int column, char *symbol, + maca_alphabet *alphabet) { + std::shared_ptr<macaon::Alphabet> ptr = PtrPool<macaon::Alphabet>::getInstance().get(alphabet); + try { + word->addValueAlphabet(column, symbol, ptr); + return 1; + } catch (std::exception &e) { + return 0; + } + } + + int maca_mcf_word_get_nb_values(maca_mcf_word *word, int column) { + return word->getNbValues(column); + } + + int maca_mcf_word_get_value_int(maca_mcf_word *word, int column, int index) { + return 
word->getIntValue(column, index); + } + + float maca_mcf_word_get_value_float(maca_mcf_word *word, int column, int index) { + return word->getFloatValue(column, index); + } + + void maca_mcf_word_get_value_string(maca_mcf_word *word, int column, int index, + char *str, int size) { + if (size > 0) { strncpy(str, word->getStringValue(column,index).c_str(), size - 1); str[size - 1] = '\0'; } /* always NUL-terminate: strncpy() does not when the value fills the buffer */ + } + + char maca_mcf_word_get_value_bool(maca_mcf_word *word, int column, int index) { + return word->getBoolValue(column, index); + } + + void maca_mcf_word_clear_values(maca_mcf_word *word, int column) { + word->clearValues(column); + } + + void maca_mcf_word_print(maca_mcf_word *word, FILE *out) { + word->print(out); + } + + maca_mcf_sentence *maca_mcf_sentence_new(int nb_columns) { + std::shared_ptr<macaon::McfSentence> sent(new macaon::McfSentence(nb_columns)); + PtrPool<macaon::McfSentence>::getInstance().accept(sent); + + return sent.get(); + } + + void maca_mcf_sentence_delete(maca_mcf_sentence *sentence) { + PtrPool<macaon::McfSentence>::getInstance().release(sentence); + } + + void maca_mcf_sentence_release(maca_mcf_sentence *sentence) { + maca_mcf_sentence_delete(sentence); + } + + maca_mcf_word *maca_mcf_sentence_add_word(maca_mcf_sentence *sentence) { + return &(sentence->addWord()); + } + + maca_mcf_word *maca_mcf_sentence_get_word(maca_mcf_sentence *sentence, int index) { + return &(sentence->getWord(index)); + } + + void maca_mcf_sentence_clear(maca_mcf_sentence *sentence) { + sentence->clear(); + } + + int maca_mcf_sentence_get_length(maca_mcf_sentence *sentence) { + return sentence->length; + } + + void maca_mcf_sentence_print(maca_mcf_sentence *sentence, FILE *out) { + sentence->print(out); + } + + maca_mcf_sentences *maca_mcf_sentences_new() { + std::shared_ptr<macaon::McfSentences> sents(new macaon::McfSentences()); + PtrPool<macaon::McfSentences>::getInstance().accept(sents); + + return sents.get(); + } + + void maca_mcf_sentences_delete(maca_mcf_sentences *sentences) { + 
PtrPool<macaon::McfSentences>::getInstance().release(sentences); + } + + void maca_mcf_sentences_release(maca_mcf_sentences *sentences) { + maca_mcf_sentences_delete(sentences); /* delegate to _delete like maca_mcf_sentence_release(); the original called itself and recursed forever */ + } + + void maca_mcf_sentences_add_sentence(maca_mcf_sentences *sentences, maca_mcf_sentence *sentence) { + std::shared_ptr<macaon::McfSentence> ptr = + PtrPool<macaon::McfSentence>::getInstance().get(sentence); + sentences->addSentence(ptr); + } + + maca_mcf_sentence *maca_mcf_sentences_get_sentence(maca_mcf_sentences *sentences, int index) { + std::shared_ptr<macaon::McfSentence> ptr = sentences->getSentence(index); + + return PtrPool<macaon::McfSentence>::getInstance().accept(ptr); + } + + int maca_mcf_sentences_size(maca_mcf_sentences *sentences) { + return sentences->size(); + } + + void maca_mcf_sentences_print(maca_mcf_sentences *sentences, FILE *out) { + sentences->print(out); + } + + void maca_mcf_column_get_name(maca_mcf_column *column, char *name, int size) { + if (size > 0) { strncpy(name, column->name.c_str(), size - 1); name[size - 1] = '\0'; } /* always NUL-terminate: strncpy() does not when the source fills the buffer */ + } + + void maca_mcf_column_get_type(maca_mcf_column *column, char *type, int size) { + if (size > 0) { strncpy(type, column->typeName.c_str(), size - 1); type[size - 1] = '\0'; } /* always NUL-terminate: strncpy() does not when the source fills the buffer */ + } + + maca_alphabet *maca_mcf_column_get_alphabet(maca_mcf_column *column) { + return PtrPool<macaon::Alphabet>::getInstance().accept(column->alphabet); + } + + void maca_mcf_column_set_alphabet(maca_mcf_column *column, maca_alphabet *alphabet) { + std::shared_ptr<macaon::Alphabet> ptr = PtrPool<macaon::Alphabet>::getInstance().get(alphabet); + column->setAlphabet(ptr); + } + + maca_mcf *maca_mcf_new(char *input_filename) { + std::shared_ptr<macaon::Mcf> format(new macaon::Mcf(input_filename)); + PtrPool<macaon::Mcf>::getInstance().accept(format); + + return format.get(); + } + + maca_mcf *maca_mcf_new_with_alphabets(char *input_filename, char *alphabet_filename) { + std::shared_ptr<macaon::Mcf> format(new macaon::Mcf(input_filename, alphabet_filename)); + PtrPool<macaon::Mcf>::getInstance().accept(format); + + return format.get(); + } + + 
maca_mcf *maca_mcf_new_with_alphabet_array(char *input_filename, maca_alphabet_array *array) { + std::shared_ptr<macaon::Mcf> format(new macaon::Mcf(input_filename, *array)); + PtrPool<macaon::Mcf>::getInstance().accept(format); + + return format.get(); + } + + void maca_mcf_delete(maca_mcf *format) { + PtrPool<macaon::Mcf>::getInstance().release(format); + } + + int maca_mcf_input(maca_mcf *format, char *column_name) { + try { + return format->input(column_name); + } catch (std::exception &e) { + return -1; + } + } + + int maca_mcf_output(maca_mcf *format, char *column_name, char *column_type) { + try { + return format->output(column_name, column_type); + } catch (std::exception &e) { + return -1; + } + } + + maca_mcf_column *maca_mcf_get_column_info(maca_mcf *format, int id) { + try { + return &(format->getColumnInfo(id)); + } catch (std::exception &e) { + return NULL; + } + } + + maca_mcf_column *maca_mcf_get_column_info_by_name(maca_mcf *format, char *name) { + try { + return &(format->getColumnInfo(name)); + } catch (std::exception &e) { + return NULL; + } + } + + maca_mcf_sentence *maca_mcf_read_next_sentence(maca_mcf *format) { + std::shared_ptr<macaon::McfSentence> ptr = format->readNextSentence(); + return PtrPool<macaon::McfSentence>::getInstance().accept(ptr); + } + + maca_mcf_sentences *maca_mcf_read_sentences(maca_mcf *format) { + std::shared_ptr<macaon::McfSentences> ptr = format->readSentences(); + return PtrPool<macaon::McfSentences>::getInstance().accept(ptr); + } + + void maca_mcf_print_header(maca_mcf *format, FILE *out) { + format->printHeader(out); + } + + char maca_mcf_dump_alphabets(maca_mcf *format, char *output_filename) { + try { + format->dumpAlphabets(output_filename); + return 1; + } catch (std::exception &e) { + return 0; + } + } +} diff --git a/maca_graph_parser/maca_mcf_wrapper.h b/maca_graph_parser/maca_mcf_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..7437b48f37c94896aef4e9798d61d394f952e57d --- 
/dev/null +++ b/maca_graph_parser/maca_mcf_wrapper.h @@ -0,0 +1,490 @@ +/******************************************************************************* + Copyright (C) 2012 by Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> + Ghasem Mirroshandel <ghasem.mirroshandel@lif.univ-mrs.fr> + Jeremy Auguste <jeremy.auguste@etu.univ-amu.fr> + This file is part of maca_graph_parser. + + maca_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + maca_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_tagger. If not, see <http://www.gnu.org/licenses/>. +*******************************************************************************/ + +#ifndef __MACA_MCF_WRAPPER_H__ +#define __MACA_MCF_WRAPPER_H__ + +#include "maca_alphabet_wrapper.h" + +#ifdef __cplusplus +extern "C" { + namespace macaon {}; + using namespace macaon; +#else +#include <stdio.h> +#endif + + typedef struct McfWord maca_mcf_word; + + /** + * Get the id of a word in a sentence. + * + * @param word a word of a sentence. + * + * @return the id of the word. + */ + int maca_mcf_word_get_id(maca_mcf_word *word); + + /** + * Adds an integer value to the specified column. + * + * @param word the word we are working on. + * @param column the column in which we must insert our value. + * @param value an integer we want to insert. + */ + void maca_mcf_word_add_value_int(maca_mcf_word *word, int column, int value); + + /** + * Adds a float value to the specified column. + * + * @param word the word we are working on. 
+ * @param column the column in which we must insert our value. + * @param value a float we want to insert. + */ + void maca_mcf_word_add_value_float(maca_mcf_word *word, int column, float value); + + /** + * Adds a string value to the specified column. + * + * @param word the word we are working on. + * @param column the column in which we must insert our value. + * @param value a string we want to insert. + */ + void maca_mcf_word_add_value_string(maca_mcf_word *word, int column, char* value); + + /** + * Adds a boolean value to the specified column. + * + * @param word the word we are working on. + * @param column the column in which we must insert our value. + * @param value a boolean we want to insert. + */ + void maca_mcf_word_add_value_bool(maca_mcf_word *word, int column, char value); + + /** + * Adds a symbol, represented by its integer value from the alphabet, + * to the specified column. + * + * @param word the word we are working on. + * @param column the column in which we must insert our value. + * @param symbol a symbol we want to insert. + * @param alphabet the alphabet of the symbol. + * + * @return 1 if successful, 0 if an error occurred (alphabet locked and unknown symbol, etc.) + */ + char maca_mcf_word_add_value_alphabet(maca_mcf_word *word, int column, char *symbol, + maca_alphabet *alphabet); + + /** + * Gets the number of values in the specified column. + * + * @param word the word we are working on. + * @param column the column we are checking. + * + * @return the number of values in the column. + * + * @note it's recommended to call this function before fetching the values. + */ + int maca_mcf_word_get_nb_values(maca_mcf_word *word, int column); + + /** + * Gets an integer value from the specified column and index. + * + * @param word the word we want to get a value from. + * @param column the column we want to fetch from. + * @param index the index of the value in the vector of values. + * + * @return an integer value. 
+ * + * @note if value was inserted using _add_value_alphabet, use this function to fetch. + */ + int maca_mcf_word_get_value_int(maca_mcf_word *word, int column, int index); + + /** + * Gets a float value from the specified column and index. + * + * @param word the word we want to get a value from. + * @param column the column we want to fetch from. + * @param index the index of the value in the vector of values. + * + * @return a float value. + */ + float maca_mcf_word_get_value_float(maca_mcf_word *word, int column, int index); + + /** + * Gets a string value from the specified column and index, copied into a caller-supplied buffer. + * + * @param word the word we want to get a value from. + * @param column the column we want to fetch from. + * @param index the index of the value in the vector of values. + * @param str the buffer string to insert our value in. + * @param size the size of the buffer (at most size bytes are written). + */ + void maca_mcf_word_get_value_string(maca_mcf_word *word, int column, int index, + char *str, int size); + + /** + * Gets a boolean value from the specified column and index. + * + * @param word the word we want to get a value from. + * @param column the column we want to fetch from. + * @param index the index of the value in the vector of values. + * + * @return a boolean value. + */ + char maca_mcf_word_get_value_bool(maca_mcf_word *word, int column, int index); + + /** + * Clears all the values in the specified column of the specified word. + * + * @param word the word we are clearing values from. + * @param column the column to clear. + */ + void maca_mcf_word_clear_values(maca_mcf_word *word, int column); + /** + * Prints the word (i.e. its columns) on the desired output. + * + * @param word the word we want to print. + * @param out the desired output. + */ + void maca_mcf_word_print(maca_mcf_word *word, FILE *out); + + typedef struct McfSentence maca_mcf_sentence; + + /** + * Creates a new maca_mcf_sentence. + * + * @param nb_columns the number of columns the sentence must have. 
+ * + * @return a pointer to the new maca_mcf_sentence. + * + * @see maca_mcf_sentence_delete() + */ + maca_mcf_sentence *maca_mcf_sentence_new(int nb_columns); + + /** + * Frees a maca_mcf_sentence. + * + * @param sentence the sentence we are freeing. + * + * @see maca_mcf_sentence_new() + */ + void maca_mcf_sentence_delete(maca_mcf_sentence *sentence); + + /** + * Releases a previously obtained maca_mcf_sentence. + * + * @param sentence the sentence we are releasing. + * + * @see maca_mcf_read_next_sentence() + * @see maca_mcf_sentences_get_sentence() + */ + void maca_mcf_sentence_release(maca_mcf_sentence *sentence); + + /** + * Creates a new word in the sentence. + * + * @param sentence the sentence in which we are adding a new word. + * + * @return a pointer to the added word, do not try to free this! + */ + maca_mcf_word *maca_mcf_sentence_add_word(maca_mcf_sentence *sentence); + + /** + * Gets a word from a sentence at the specified index. + * + * @param sentence the sentence we are fetching from. + * @param index the index of the word. + * + * @return a pointer to the wanted word, do not try to free this! + */ + maca_mcf_word *maca_mcf_sentence_get_word(maca_mcf_sentence *sentence, int index); + + /** + * Clears the sentence of all its words. + * + * @param sentence the sentence we want to clear. + */ + void maca_mcf_sentence_clear(maca_mcf_sentence *sentence); + + /** + * Gets the length of a sentence. + * + * @param sentence the sentence we want to get the length from. + * + * @return the length of the sentence (McfSentence::length, which starts at 1 because of a fake root). + */ + int maca_mcf_sentence_get_length(maca_mcf_sentence *sentence); + + /** + * Prints the sentence on the desired output. + * + * @param sentence the sentence we want to print. + * @param out the desired output. + */ + void maca_mcf_sentence_print(maca_mcf_sentence *sentence, FILE *out); + + typedef struct McfSentences maca_mcf_sentences; + + /** + * Creates a new maca_mcf_sentences. + * + * @return a pointer to the new maca_mcf_sentences. 
+ * + * @see maca_mcf_sentences_delete() + */ + maca_mcf_sentences *maca_mcf_sentences_new(); + + /** + * Frees a maca_mcf_sentences. + * + * @param sentences the sentences we are freeing. + * + * @see maca_mcf_sentences_new() + */ + void maca_mcf_sentences_delete(maca_mcf_sentences *sentences); + + /** + * Releases a maca_mcf_sentences. + * + * @param sentences the sentence we are releasing. + * + * @see maca_mcf_read_sentences() + */ + void maca_mcf_sentences_release(maca_mcf_sentences *sentences); + + /** + * Add a new sentence to the sentences. + * + * @param sentences the sentences we are adding the sentence into. + * @param sentence the sentence we are adding. + */ + void maca_mcf_sentences_add_sentence(maca_mcf_sentences *sentences, maca_mcf_sentence *sentence); + + /** + * Get a sentence from the sentences. + * + * @param sentences the sentences we are fetching from. + * @param index the index of a sentence. + * + * @return a pointer to a maca_mcf_sentence. + * + * @note you must release this pointer once you're finished with it! + * + * @see maca_mcf_sentence_release() + */ + maca_mcf_sentence *maca_mcf_sentences_get_sentence(maca_mcf_sentences *sentences, int index); + + /** + * Gets the number of sentences. + * + * @param sentences the sentences we are working on. + * + * @return the number of sentences. + */ + int maca_mcf_sentences_size(maca_mcf_sentences *sentences); + + + /** + * Prints all the sentences on the desired output. + * + * @param sentences the sentences to print. + * @param out the desired output. + */ + void maca_mcf_sentences_print(maca_mcf_sentences *sentences, FILE *out); + + typedef struct McfColumn maca_mcf_column; + + /** + * Gets the name of a column. + * + * @param column the column we are getting the name from. + * @param name the string buffer to store the name in. + * @param size the size of the buffer. + */ + void maca_mcf_column_get_name(maca_mcf_column *column, char *name, int size); + + /** + * Gets the type of a column. 
+ * + * @param column the column we are getting the type from. + * @param type the string buffer to store the type in. + * @param size the size of the buffer. + */ + void maca_mcf_column_get_type(maca_mcf_column *column, char *type, int size); + + /** + * Gets the alphabet used by the column (if the type is an alphabet). + * + * @param column the column we are getting the alphabet from. + * + * @return a pointer to the alphabet. + * + * @note you must release this pointer once you're finished with it! + * + * @see maca_alphabet_release() + */ + maca_alphabet *maca_mcf_column_get_alphabet(maca_mcf_column *column); + + /** + * Sets the alphabet of a column. + * + * @param column the column we are updating. + * @param alphabet the alphabet we are setting. + */ + void maca_mcf_column_set_alphabet(maca_mcf_column *column, maca_alphabet *alphabet); + + typedef struct Mcf maca_mcf; + + /** + * Creates a new maca_mcf. + * + * @param input_filename the name of the file in the mcf format. + * + * @return a new maca_mcf. + * + * @see maca_mcf_new_with_alphabets() + * @see maca_mcf_delete() + */ + maca_mcf *maca_mcf_new(char *input_filename); + + /** + * Creates a new maca_mcf, with filled alphabets. + * + * @param input_filename the name of the file in the mcf format. + * @param alphabet_filename the name of the file which contains all the alphabets. + * + * @return a new maca_mcf. + * @see maca_mcf_new() + * @see maca_mcf_delete() + */ + maca_mcf *maca_mcf_new_with_alphabets(char *input_filename, char *alphabet_filename); + + /** + * Creates a new maca_mcf, with filled alphabets using alphabets from another alphabet array. + * + * @param input_filename the name of the file in the mcf format. + * @param array the alphabet array to take the alphabets from. + * + * @return a new maca_mcf. + * + * @see maca_mcf_delete() + */ + maca_mcf *maca_mcf_new_with_alphabet_array(char *input_filename, maca_alphabet_array *array); + + /** + * Frees a maca_mcf. 
+ * + * @param format the maca_mcf we are freeing. + * + * @see maca_mcf_new() + * @see maca_mcf_new_with_alphabets() + */ + void maca_mcf_delete(maca_mcf *format); + + /** + * Notifies the format that a column will be used. + * + * @param format the format of the mcf file. + * @param column_name the name of the column. + * + * @return the id of column if it exists, -1 if it's an unknown column. + */ + int maca_mcf_input(maca_mcf *format, char *column_name); + + /** + * Adds an output column to the format. + * + * @param format the format of the mcf file. + * @param column_name the name of the new column. + * @param column_type the type of the new column. + * + * @return the id of the new column, -1 if the column name already exists. + * + * @note in case the type is an alphabet, it will look for an alphabet with the + * same name. If it doesn't find one, it will create a new alphabet. + */ + int maca_mcf_output(maca_mcf *format, char *column_name, char *column_type); + + /** + * Gets a structure which stores information on the specified column. + * + * @param format the format of the mcf file. + * @param id the id of the column. + * + * @return a maca_mcf_column structure, or NULL if id is invalid. + */ + maca_mcf_column *maca_mcf_get_column_info(maca_mcf *format, int id); + + /** + * Gets a structure which stores information on the specified column. + * + * @param format the format of the mcf file. + * @param name the name of the column. + * + * @return a maca_mcf_column structure, or NULL if id is invalid. + */ + maca_mcf_column *maca_mcf_get_column_info_by_name(maca_mcf *format, char *name); + + /** + * Reads the next sentence in the mcf file. + * + * @param format the format of the mcf file. + * + * @return a pointer to the read sentence, or NULL if none were found. + * + * @note you must release the pointer once you're finished with it! 
+ * + * @see maca_mcf_sentence_release() + */ + maca_mcf_sentence *maca_mcf_read_next_sentence(maca_mcf *format); + + /** + * Reads all the sentences of the mcf file. + * + * @param format the format of the mcf file. + * + * @return a pointer to the read sentences. + * + * @note you must release the pointer once you're finished with it! + * + * @see maca_mcf_sentences_release() + */ + maca_mcf_sentences *maca_mcf_read_sentences(maca_mcf *format); + + /** + * Prints the header into the desired output. + * + * @param format the format of the mcf file. + * @param out the desired output. + */ + void maca_mcf_print_header(maca_mcf *format, FILE *out); + + /** + * Dumps the alphabets into the specified file. + * + * @return 1 if successful, 0 otherwise. + */ + char maca_mcf_dump_alphabets(maca_mcf *format, char *output_filename); + +#ifdef __cplusplus +} +#endif +#endif /* __MACA_MCF_WRAPPER_H__ */ diff --git a/maca_graph_parser/maca_msg.c b/maca_graph_parser/maca_msg.c new file mode 100644 index 0000000000000000000000000000000000000000..a5084eeff9f823221731c78d51948ae43e286a1d --- /dev/null +++ b/maca_graph_parser/maca_msg.c @@ -0,0 +1,106 @@ +/*********************************************************************************** + Copyright (C) 2009-2012 by Jean-François Rey <jean-francois.rey@lif.univ-mrs.fr> + This file is part of macaon. + + Macaon is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Macaon is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with macaon. If not, see <http://www.gnu.org/licenses/>. 
+************************************************************************************/ + +/** + * \file maca_msg.c + * \brief macaon log function implementation + * \author Jean-François REY + * \version 2.5 + * \date 02 Aug 2012 + * + */ + +#include"maca_msg.h" + + +void maca_msg(char *module, int type) +{ + if(type == MACA_ERROR){ + fprintf(stderr, "[%s] ERROR : ", module); + } + else + if(type == MACA_WARNING){ + fprintf(stderr, "[%s] WARNING : ", module); + } + else + if(type == MACA_MESSAGE){ + fprintf(stderr, "[%s] MSG : ", module); + } +} + +void maca_print_msg(char * module, int type, const char * function, const char * message, ...) +{ + FILE * out; + va_list args; + + switch(type){ + case MACA_ERROR : out = stderr; + fprintf(stderr, "[%s] ERROR ", module); + break; + case MACA_WARNING : out = stderr; + fprintf(stderr, "[%s] WARNING ", module); + break; + case MACA_MESSAGE : out = stderr; + fprintf(stderr, "[%s] MSG ", module); + break; + default : out = stderr; + fprintf(stderr, "[%s] DEFAULT MSG : ", module); + break; + } + + + if(function) fprintf(out, "# %s # -> ", function); + else fprintf(out,": "); + + va_start( args, message ); + vfprintf(out,message,args); + va_end(args); + + fprintf(out,"\n"); +} + + +void maca_print_verbose(char * module, int level, int type, char * function, char * message, ...) 
+{ +// extern int verbose; + va_list args; + int i; + + if(level <= maca_verbose || level == -1) + { + for(i=1;i<level;i++) fprintf(stderr," "); + maca_msg(module,type); + if(function != NULL) fprintf(stderr,"# %s # -> ",function); + va_start(args,message); + vfprintf(stderr,message,args); + va_end(args); + fprintf(stderr,"\n"); + } +} + + +void maca_print_vverbose(char * module, int level, int type,char * function, char * message, va_list * args) +{ + int i; + + for(i=1;i<level;i++) fprintf(stderr," "); + maca_msg(module,type); + if(function != NULL) fprintf(stderr,"# %s # -> ",function); + vfprintf(stderr,message,*args); + fprintf(stderr,"\n"); +} diff --git a/maca_graph_parser/maca_msg.h b/maca_graph_parser/maca_msg.h new file mode 100644 index 0000000000000000000000000000000000000000..d630fdc521f671b830200e79255648d9cc18abaf --- /dev/null +++ b/maca_graph_parser/maca_msg.h @@ -0,0 +1,107 @@ +/*********************************************************************************** + Copyright (C) 2009-2012 by Jean-François Rey <jean-francois.rey@lif.univ-mrs.fr> + This file is part of macaon. + + Macaon is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Macaon is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with macaon. If not, see <http://www.gnu.org/licenses/>. 
+************************************************************************************/ + +/** + * \file maca_msg.h + * \brief macaon log function + * \author Jean-François REY + * \version 2.5 + * \date 02 Aug 2011 + * + */ + +#ifndef __MACA_MSG__ +#define __MACA_MSG__ + +#include <stdio.h> +#include <stdarg.h> + +#include "maca_constants.h" + +#ifndef MACA_ERROR +#define MACA_ERROR 1 +#define MACA_WARNING 2 +#define MACA_MESSAGE 3 +#endif + +#define MACA_V_LOAD_XML "loading xml %s" +#define MACA_V_LOAD_CONFIG_XML "loading config xml file %s" +#define MACA_V_INIT "initialize data %s" +#define MACA_V_INIT_COMMON "initialize common options %s" +#define MACA_V_LEVEL "verbose level %i" +#define MACA_V_ON "verbose on %s" +#define MACA_V_FREE_MEM "free sentence %s %s section" +#define MACA_V_FREE_ALL "free all data from %s" +#define MACA_V_SENTENCE "processing sentence %s" +#define MACA_V_AUTOMATON "loading automaton %s" +#define MACA_V_UPDATE_AUTOMATON "update automaton %s" +#define MACA_V_FOUND_SECTION "found a <section> tag of type %s, ignoring it" +#define MACA_V_SECTION "processing section %s" +#define MACA_V_ADD_SECTION "add section %s" +#define MACA_V_ERROR_LOAD_SECTION "can't load %s section" +#define MACA_V_ADD_SEGS "add segments %s to %s section" +#define MACA_V_ADD_SEG "add segment %s to %s section" +#define MACA_V_ADD_XML_AUTOMATON "add xml automaton to %s section" + +#ifdef __cplusplus +extern "C"{ +#endif + + + +/** Print type of message of a module + * \param module : which module call this function + * \param type : type of message, MACA_ERROR, MACA_WARNING or MACA_MSG + * + * Call this function before doing a fprintf on type output. 
+ */ +void maca_msg(char *module, int type); + +/** Print a message + * \param module : which module call this function + * \param type : type of message, MACA_ERROR, MACA_WARNING or MACA_MSG + * \param function : in which function this maca_print_msg is call + * \param message : format of the message to print (like fprintf) + * \param ... : list of variable to print in message (like fprintf) + */ +void maca_print_msg(char * module, int type, const char * function, const char * message, ...); + +/** Print verbose + * \param module : which module call this function + * \param level : verbose level to show + * \param type : type of message, MACA_ERROR, MACA_WARNING or MACA_MSG + * \param function : in which function this maca_print_msg is call + * \param message : format of the message to print (like fprintf) + * \param ... : list of variable to print in message (like fprintf) + */ +void maca_print_verbose(char * module, int level, int type, char * function, char * message, ...); + +/** Print verbose + * \param module : which module call this function + * \param level : verbose level to show + * \param type : type of message, MACA_ERROR, MACA_WARNING or MACA_MSG + * \param function : in which function this maca_print_msg is call + * \param message : format of the message to print (like fprintf) + * \param va_list : list of variable to print in message + */ +void maca_print_vverbose(char * module, int level, int type, char * function, char * message, va_list *args); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/maca_graph_parser/simple_parser.cc b/maca_graph_parser/simple_parser.cc new file mode 100644 index 0000000000000000000000000000000000000000..f2821c5c498fbbce5e300f0a5f0a0a623e11aa93 --- /dev/null +++ b/maca_graph_parser/simple_parser.cc @@ -0,0 +1,223 @@ +#include <string> + +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include"simple_parser.h" + +extern "C" { +//#include <maca_macaon.h> +#include "maca_common.h" +#include 
"maca_constants.h" +#include "maca_graph_parser_sentence.h" +#include "maca_graph_parser_decoder1.h" +#include "maca_graph_parser_decoder2.h" +#include "maca_graph_parser_decoder.h" +#include "maca_graph_parser_model.h" +#include "maca_graph_parser_features.h" +#include "maca_graph_parser_dep_count_table.h" +#include "maca_graph_parser_feature_table.h" +} + +bool macaon::Parser::ProcessSentence(const std::vector<std::string> &words, + const std::vector<std::string> &tags, + const std::vector<std::string> &lemmas, + std::vector<ParsedWord>& output) +{ + maca_graph_parser_sentence *maca_s = NULL; + int code_postag, code_lemma, code_form, code_label; + maca_s = maca_graph_parser_allocate_sentence(ctx); + size_t i; + int ill_formed = 0; + + for(i=0; i < words.size(); i++){ + code_postag = maca_alphabet_get_code(ctx->pos_alphabet, (char *)tags[i].c_str()); + if(code_postag == -1){ + if(ctx->verbose_flag > 0){ + maca_msg(ctx->module, MACA_WARNING); + fprintf(stderr,"pos %s unknown, skipping sentence\n", (char *)tags[i].c_str()); + } + ill_formed = 1; + } + code_lemma = lemmas[i] != "" ? 
maca_alphabet_add_symbol(ctx->words_alphabet, (char *)lemmas[i].c_str()) : -1; + code_form = maca_alphabet_add_symbol(ctx->words_alphabet, (char *)words[i].c_str()); + + code_label = 0; + maca_graph_parser_sentence_add_word(ctx, maca_s, NULL, code_form, code_lemma, code_postag, -1, code_label, 0, NULL); + } + + if(ill_formed){ + maca_graph_parser_free_sentence(maca_s); + return false; + } + + maca_graph_parser_decoder_parse(ctx, maca_s); + + + /*ParsedWord pw0; + pw0.id = 0; + pw0.word = "ROOT"; + // pw0.lemma = "ROOT"; + pw0.posTag = "NA"; + pw0.dependencyLabel = "NA"; + pw0.dependencyParent = -1; + output.push_back(pw0);*/ + + char buffer[128]; + for(i=1; i < (size_t) maca_s->l; i++){ + ParsedWord pw; + pw.id = i; + pw.word = words[i-1]; + pw.lemma = lemmas[i-1]; + pw.posTag = tags[i-1]; + maca_alphabet_get_symbol(ctx->labels_alphabet, maca_s->label[i], buffer, sizeof(buffer)); + pw.dependencyLabel = buffer; + pw.dependencyParent = maca_s->gov[i]; + output.push_back(pw); + } + + maca_graph_parser_free_sentence(maca_s); + return true; +} + + +macaon::Parser::Parser( + const char * cfg, /*!< config/language selected */ + int verbose_flag, /*!< verbose flag */ + const char *model_file_name, + const char *alphabet_file_name, + const char *dep_count_file_name, + int order) +{ + + const char* argv[] = {"macaon", "-C", cfg, "-m", model_file_name, "-a", alphabet_file_name, "-d", dep_count_file_name}; + int argc = sizeof(argv) / sizeof(char*); + + ctx = maca_graph_parser_LoadCTX(argc, (char**) argv); + ctx->verbose_flag = verbose_flag; + + int i; + int sent_num; + + /* alphabets */ + /* load alphabets */ + + maca_alphabet_array *alpha_array = maca_alphabet_array_new_from_file(ctx->alphabet_file_name); + if (alpha_array == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "couldn't open the alphabet file!\n"); + exit(1); + } + ctx->words_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_WORDS); + ctx->words_nb = (ctx->words_alphabet != 
NULL) ? maca_alphabet_size(ctx->words_alphabet) : 0; + ctx->labels_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_LABELS); + ctx->labels_nb = (ctx->labels_alphabet != NULL) ? maca_alphabet_size(ctx->labels_alphabet) : 0; + ctx->pos_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_POS); + ctx->pos_nb = (ctx->pos_alphabet != NULL) ? maca_alphabet_size(ctx->pos_alphabet) : 0; + ctx->morpho_alphabet = maca_alphabet_array_get_alphabet(alpha_array, MACA_ALPHABET_MORPHO); + ctx->morpho_nb = (ctx->morpho_alphabet != NULL) ? maca_alphabet_size(ctx->morpho_alphabet) : 0; + + maca_alphabet_array_delete(alpha_array); + + /* store special values in ctx and check that every necessary alphabet is loaded */ + if (ctx->use_full_forms || ctx->use_lemmas) { + if (ctx->words_alphabet == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "missing the '" MACA_ALPHABET_WORDS "' alphabet in the alphabet file\n"); + exit(1); + } + ctx->w_start = maca_alphabet_get_code(ctx->words_alphabet, "__START__"); + ctx->w_end = maca_alphabet_get_code(ctx->words_alphabet, "__END__"); + } + + if (ctx->pos_alphabet == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "missing the '" MACA_ALPHABET_POS "' alphabet in the alphabet file\n"); + exit(1); + } + ctx->pos_start = maca_alphabet_get_code(ctx->pos_alphabet, "__START__"); + ctx->pos_end = maca_alphabet_get_code(ctx->pos_alphabet, "__END__"); + + if (ctx->labels_alphabet == NULL) { + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr, "missing the '" MACA_ALPHABET_LABELS "' alphabet in the alphabet file\n"); + exit(1); + } + ctx->fct_joker = maca_alphabet_get_code(ctx->labels_alphabet, "__JOKER__"); + + /* end alphabets */ + + /* template library allocator needs: words_nb, pos_nb, labels_nb */ + ctx->e = maca_graph_parser_templ_library_allocator(ctx); + + /* load dep_count_table */ + ctx->dep_count_table = maca_graph_parser_dep_count_table_read(ctx, 
ctx->dep_count_table_file_name); + + /* model */ + //ctx->model = maca_graph_parser_model_mmap(ctx, ctx->model_file_name); + ctx->model = maca_graph_parser_model_load(ctx, ctx->model_file_name); + /* model2 */ + if(ctx->model2_file_name != NULL){ + ctx->model2 = maca_graph_parser_model_load(ctx, ctx->model2_file_name); + } else { + ctx->model2 = NULL; + } + /* set active feature types for the decoder */ + ctx->min_dep_count = ctx->model->min_dep_count; + ctx->use_lemmas = ctx->model->use_lemmas; + ctx->use_full_forms = ctx->model->use_full_forms; + ctx->basic_features = ctx->model->basic_features; + ctx->first_features = ctx->model->first_features; + ctx->grandchildren_features = ctx->model->grandchildren_features; + ctx->sibling_features = ctx->model->sibling_features; + + if(ctx->sibling_features || ctx->grandchildren_features) ctx->order = 2; + + /* allocate feature table */ + if(ctx->store_in_feature_table){ + maca_graph_parser_feature_table_allocator(ctx); + } + + ctx->s = maca_graph_parser_allocate_sentence(ctx); + + if(ctx->print_ctx) maca_graph_parser_print_ctx(ctx); + + + loaded = true; +} + +macaon::Parser::~Parser() +{ + //maca_close(); + maca_graph_parser_free_all(ctx); +} + +/* C interface to generic parser */ +macaon::Parser* Parser_new(char * cfg, int verbose_flag, char *model_file_name, char *alphabet_file_name, char *dep_count_file_name, int order) { + return new macaon::Parser(cfg, verbose_flag, model_file_name, alphabet_file_name, dep_count_file_name, order); +} + +void Parser_free(macaon::Parser* parser) { + delete parser; +} + +bool Parser_ProcessSentence(macaon::Parser* parser, int num_words, char** words, char** tags, char** lemmas, int* governors, const char** labels) { + std::vector<std::string> word_vector; + std::vector<std::string> tag_vector; + std::vector<std::string> lemma_vector; + + std::vector<macaon::ParsedWord> output; + for(int i = 0; i < num_words; i++) { + word_vector.push_back(words[i]); + tag_vector.push_back(tags[i]); + 
lemma_vector.push_back(lemmas[i]); + } + bool result = parser->ProcessSentence(word_vector, tag_vector, lemma_vector, output); + for(int i = 0; i < num_words; i++) { + governors[i] = output[i].dependencyParent; + labels[i] = strdup(output[i].dependencyLabel.c_str()); // do not forget to free + } + return result; +} + diff --git a/maca_graph_parser/simple_parser.h b/maca_graph_parser/simple_parser.h new file mode 100644 index 0000000000000000000000000000000000000000..5f7c952e9573e22fcf03a5d9e8345b588fb7e5c4 --- /dev/null +++ b/maca_graph_parser/simple_parser.h @@ -0,0 +1,55 @@ +#pragma once +#ifdef __cplusplus +#include <string> +#endif +extern "C"{ +#include "maca_graph_parser.h" +} +#include<string> +#include<vector> + +namespace macaon { + struct ParsedWord { + int id; + std::string word; + std::string lemma; + std::string posTag; + std::string dependencyLabel; + int dependencyParent; // relative to word id + }; + + class Parser { + private: + bool loaded; + maca_graph_parser_ctx *ctx; + public: + /* takes models trained for the macaon parser */ + + Parser( + const char * cfg, /*!< config/language selected */ + int verbose_flag, /*!< verbose flag */ + const char *model_file_name, + const char *alphabet_file_name, + const char *dep_count_file_name, + int order); + ~Parser(); + + bool ProcessSentence(const std::vector<std::string> &words, + const std::vector<std::string> &tags, + const std::vector<std::string> &lemmas, + std::vector<ParsedWord>& output); + + bool IsLoaded() { return loaded; } + }; + +} + +/* add C interface to generic parser */ +extern "C" { + macaon::Parser* Parser_new(char * cfg, int verbose_flag, char *model_file_name, char *alphabet_file_name, char *dep_count_file_name, int order); + + void Parser_free(macaon::Parser* parser); + + bool Parser_ProcessSentence(macaon::Parser* parser, int num_words, char** words, char** tags, char** lemmas, int* governors, const char** labels); +} + diff --git a/maca_graph_parser/test_simple_parser.cc 
b/maca_graph_parser/test_simple_parser.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f070c00c6ca84882e3ae4364264253a02e35b9d --- /dev/null +++ b/maca_graph_parser/test_simple_parser.cc @@ -0,0 +1,44 @@ +#include <iostream> +#include <string> +#include <vector> +#include <sstream> + +#include "simple_parser.h" +using namespace macaon; + +int main(int argc, char** argv) +{ + if(argc < 4) { + std::cerr << "usage: " << argv[0] << " <cfg> <model.bin> <model.alpha> <model.dep_count>\n"; + std::cerr << "expects: one word, one lemma, one tag, per line, empty line between sentences\n"; + return 1; + } + Parser mp(argv[1], 0, argv[2], argv[3], argv[4], 1); + + std::vector<std::string> words; + std::vector<std::string> lemmas; + std::vector<std::string> tags; + + std::string line; + while(std::getline(std::cin, line)) { + std::cout << "|" << line << "|\n"; + if(line == "") { + std::vector<ParsedWord> output; + mp.ProcessSentence(words, tags, lemmas, output); + std::cout << "size: " << output.size() << "\n"; + for(size_t i = 0; i < output.size(); i++){ + std::cout << i << " " << output[i].word << " " << output[i].posTag << " " << output[i].dependencyParent << " " << output[i].dependencyLabel << "\n"; + } + std::cout << "\n"; + words.clear(); + tags.clear(); + lemmas.clear(); + } + std::stringstream reader(line); + std::string word, lemma, tag; + reader >> word >> lemma >> tag; + words.push_back(word); + lemmas.push_back(lemma); + tags.push_back(tag); + } +}