diff --git a/CMakeLists.txt b/CMakeLists.txt index 85d4e4bcdce693d1da0be09721efe1118d02158e..a94a43c37b96a18c5b65627e3d311e8fdde884d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,5 +8,6 @@ include_directories(maca_common/include) add_subdirectory(maca_common) add_subdirectory(maca_lemmatizer) add_subdirectory(maca_trans_parser) +add_subdirectory(maca_crf_tagger) #set(CMAKE_INSTALL_PREFIX ../) diff --git a/maca_crf_tagger/CMakeLists.txt b/maca_crf_tagger/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6df7df6ef265f3d1eafcd632372b032d8aa19344 --- /dev/null +++ b/maca_crf_tagger/CMakeLists.txt @@ -0,0 +1,24 @@ +include_directories(src) + +#compiling, linking and installing executables + +add_executable(crf_barebones_decoder ./src/crf_barebones_decoder.cc) +target_compile_options(crf_barebones_decoder PRIVATE -std=c++11) +install (TARGETS crf_barebones_decoder DESTINATION bin) + +#add_executable(test_simple_tagger ./src/test_simple_tagger.cc) +#target_compile_options(test_simple_tagger PRIVATE -std=c++11) +#install (TARGETS test_simple_tagger DESTINATION bin) + +add_executable(apply_template_crfsuite ./src/apply_template_crfsuite.cc) +target_compile_options(apply_template_crfsuite PRIVATE -std=c++11) +install (TARGETS apply_template_crfsuite DESTINATION bin) + +add_executable(maca_crf_convert_binmodel ./src/maca_crf_convert_binmodel.cc) +target_compile_options(maca_crf_convert_binmodel PRIVATE -std=c++11) +install (TARGETS maca_crf_convert_binmodel DESTINATION bin) + +add_executable(maca_crf_convert_binlexicon ./src/maca_crf_convert_binlexicon.cc) +target_compile_options(maca_crf_convert_binlexicon PRIVATE -std=c++11) +install (TARGETS maca_crf_convert_binlexicon DESTINATION bin) + diff --git a/maca_crf_tagger/src/apply_template_crfsuite.cc b/maca_crf_tagger/src/apply_template_crfsuite.cc new file mode 100644 index 0000000000000000000000000000000000000000..eba51ba44b7e638b6bcc9b88ac41fc86b7078111 --- /dev/null +++ b/maca_crf_tagger/src/apply_template_crfsuite.cc @@ -0,0 +1,72 @@ +#include <string> +#include <vector> +#include <iostream> +#include <fstream> +#include "crf_template.hh" + +// http://www.oopweb.com/CPP/Documents/CPPHOWTO/Volume/C++Programming-HOWTO-7.html +static void tokenize(const std::string& str, std::vector<std::string>& tokens, const std::string& delimiters = " ") +{ + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + while (std::string::npos != pos || std::string::npos != lastPos) + { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(delimiters, pos); + pos = str.find_first_of(delimiters, lastPos); + } +} + +static void replace(std::string& str, const std::string &search, const std::string &replacement) { + std::string::size_type pos = 0; + while ((pos = str.find(search, pos)) != std::string::npos) { + str.replace(pos, search.size(), replacement); + pos += replacement.size(); + } +} + +int main(int argc, char** argv) { + if(argc != 2) { + std::cerr << "usage: cat <input> | " << argv[0] << " <template>\n"; + return 1; + } + std::vector<macaon::CRFPPTemplate> templates; + std::ifstream templateFile(argv[1]); + while(!templateFile.eof()) { + std::string line; + std::getline(templateFile, line); + if(templateFile.eof()) break; + macaon::CRFPPTemplate current(line.c_str()); + if(current.type != macaon::CRFPPTemplate::BIGRAM) templates.push_back(current); + //std::cerr << templates.back() << std::endl; 
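+        // note: bigram ("B") templates are skipped above: crfsuite models label
+        // transitions itself, so only unigram templates are expanded into attributes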
+ } + std::vector<std::vector<std::string> > lines; + while(!std::cin.eof()) { + std::string line; + std::getline(std::cin, line); + if(std::cin.eof()) break; + std::vector<std::string> tokens; + tokenize(line, tokens, " \t"); + if(tokens.size() == 0) { + for(int position = 0; position < (int) lines.size(); position++) { + std::string label = lines[position][lines[position].size() - 1]; + replace(label, "\\", "\\\\"); + replace(label, ":", "\\:"); + std::cout << label; + for(std::vector<macaon::CRFPPTemplate>::const_iterator i = templates.begin(); i != templates.end(); i++) { + std::string feature = i->apply(lines, position); + replace(feature, "\\", "\\\\"); + replace(feature, ":", "\\:"); + std::cout << "\t" << feature; + } + /*if(position == 0) std::cout << "\t__BOS__"; + if(position == (int) lines.size() - 1) std::cout << "\t__EOS__";*/ + std::cout << std::endl; + } + std::cout << std::endl; + lines.clear(); + } else { + lines.push_back(tokens); + } + } +} diff --git a/maca_crf_tagger/src/crf_barebones_decoder.cc b/maca_crf_tagger/src/crf_barebones_decoder.cc new file mode 100644 index 0000000000000000000000000000000000000000..229868f4d0effabbe0e390f88be3cf8dc44b5cb3 --- /dev/null +++ b/maca_crf_tagger/src/crf_barebones_decoder.cc @@ -0,0 +1,90 @@ +#include <vector> +#include "crf_decoder.hh" +#include "crf_binlexicon.hh" +#include "crf_features.hh" + +/* This is a sample decoder for the crf tagger. + compile with: + g++ -O3 -Wall -o barebones_decoder barebones_decoder.cc + + example usage: + echo -e "I\nam\nyour\nfather\n\njhon\neats\npotatoes\n" | ./barebones_decoder en/bin/crf_tagger.model.bin en/bin/crf_tagger.wordtag.lexicon + */ + +void tag_sentence(macaon::Decoder& decoder, macaon::BinaryLexicon* lexicon, const std::vector<std::vector<std::string> >& lines, int wordField, bool isConll07) { + + std::vector<std::vector<std::string> > features; + for(size_t i = 0; i < lines.size(); i++) { + std::vector<std::string> word_features; + macaon::FeatureGenerator::get_pos_features(lines[i][wordField], word_features); + features.push_back(word_features); + //for(size_t j = 0; j < word_features.size(); j++) std::cout << word_features[j] << " "; + //std::cout << "\n"; + } + std::vector<std::string> tagged; + decoder.decodeString(features, tagged, lexicon); + for(size_t i = 0; i < tagged.size(); i++) { + if(isConll07) { + for(size_t j = 0; j < lines[i].size(); j++) { + if(j != 0) std::cout << "\t"; + if(j == 3 || j == 4) std::cout << tagged[i]; + else std::cout << lines[i][j]; + } + std::cout << "\n"; + } else { + std::cout << lines[i][wordField] << "\t" << tagged[i] << "\n"; + } + } + std::cout << "\n"; +} + +void usage(const char* argv0) { + std::cerr << "usage: " << argv0 << " [--conll07] <model> [lexicon]\n"; + exit(1); +} + +int main(int argc, char** argv) { + bool isConll07 = false; // warning: no verification of conll07 format + int word_offset = 0; + std::string modelName = ""; + std::string lexiconName = ""; + + for(int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if(arg == "-h" || arg == "--help") { + usage(argv[0]); + } else if(arg == "--conll07") { + isConll07 = true; + word_offset = 1; + } else if(modelName == "") { + modelName = arg; + } else if(lexiconName =="") { + lexiconName = arg; + } else { + usage(argv[0]); + } + } + if(modelName == "") usage(argv[0]); + + macaon::Decoder decoder(modelName); + macaon::BinaryLexicon *lexicon = NULL; + if(lexiconName != "") lexicon = new macaon::BinaryLexicon(lexiconName, decoder.getTagset()); + + std::string line; + 
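+    // read one token per line (tab-separated columns); an empty line marks the
+    // end of a sentence and triggers tagging of the accumulated tokens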
std::vector<std::vector<std::string> > lines; + while(std::getline(std::cin, line)) { + if(line == "") { + tag_sentence(decoder, lexicon, lines, word_offset, isConll07); + lines.clear(); + } else { + std::vector<std::string> tokens; + macaon::Tokenize(line, tokens, "\t"); + lines.push_back(tokens); + } + } + if(!lines.empty()) { + tag_sentence(decoder, lexicon, lines, word_offset, isConll07); + } + if(lexicon) delete lexicon; + return 0; +} diff --git a/maca_crf_tagger/src/crf_binlexicon.hh b/maca_crf_tagger/src/crf_binlexicon.hh new file mode 100644 index 0000000000000000000000000000000000000000..82948603fd9a9d18f4bbf096131368425b49950b --- /dev/null +++ b/maca_crf_tagger/src/crf_binlexicon.hh @@ -0,0 +1,365 @@ +#pragma once + +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include "crf_model.hh" +#include "crf_template.hh" +#include "crf_lexicon.hh" + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include <errno.h> + +#include <limits.h> +#ifdef CHAR_BIT +#if CHAR_BIT != 8 +#error CHAR_BIT != 8 not supported +#endif +#endif + +namespace macaon { + const uint32_t lexiconMagic = 0xbffe1253; + + class BinaryLexicon : public Lexicon { + // disable alignment in MSVC++ +#pragma pack(push, 1) + struct ModelInfo { + uint32_t magic; + uint32_t dataLocation; + uint32_t tableSize; + uint32_t numLabels; + } __attribute__((packed)); + + struct TableElement { + uint32_t hashValue; + uint8_t keySize; + uint8_t dataSize; + uint32_t location; + } __attribute__((packed)); // disable alignment in g++ +#define lexicon_tag_t uint8_t +#define sizeof_LexiconTableElement (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint32_t)) + +#pragma pack(pop) + + private: + bool isBinary; + int fd; + const char* data; + size_t dataLength; + const ModelInfo* info; + const TableElement* table; + + // copied from https://smhasher.googlecode.com/svn-history/r136/trunk/MurmurHash3.cpp (MIT license) + + static inline uint32_t rotl32 ( uint32_t x, int8_t r ) + { + return (x << r) | (x >> (32 - r)); + } + + static inline uint64_t rotl64 ( uint64_t x, int8_t r ) + { + return (x << r) | (x >> (64 - r)); + } + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) +#define FORCE_INLINE inline + + static FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) + { + return p[i]; + } + + static FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) + { + return p[i]; + } + + static FORCE_INLINE uint32_t fmix ( uint32_t h ) + { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; + } + + //---------- + + static FORCE_INLINE uint64_t fmix ( uint64_t k ) + { + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; + } + + static void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ) + { + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 
+ + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + //std::cerr << k1 << " " << h1 << " " << seed << "\n"; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; + } + + static uint32_t Hash(const char *k, size_t length) { + uint32_t output = 0; + MurmurHash3_x86_32(k, length, BinaryModelConstants::magic, &output); + + // java hash function + /*uint32_t output = 0; + for(size_t i = 0; i < length; i++) { + output = 31 * output + k[i]; + }*/ + return output; + } + + public: + BinaryLexicon() : Lexicon(), isBinary(false) {} + BinaryLexicon(const std::string &filename, Symbols* _tagSymbols = NULL) : Lexicon(), isBinary(false), fd(-1), data((const char*) MAP_FAILED) { + tagSymbols = _tagSymbols; + Load(filename); + } + + ~BinaryLexicon() { + if(data != MAP_FAILED) munmap((void*) data, dataLength); + if(fd != -1) close(fd); + } + + bool Convert(const std::string& from, const std::string& to) { + std::cerr << "loading\n"; + Lexicon::Load(from); + std::cerr << "writing\n"; + return Write(to); + } + + bool Write(const std::string & filename) { + FILE* output = fopen(filename.c_str(), "w"); + // magic + fwrite(&lexiconMagic, sizeof(lexiconMagic), 1, output); // magic + + // features + uint32_t dataLocation = 0; + uint32_t dataLocationOffset = (uint32_t) ftell(output); + fwrite(&dataLocation, sizeof(dataLocation), 1, output); + uint32_t tableSize = (uint32_t) wordSymbols.NumSymbols() * 2; + fwrite(&tableSize, sizeof(tableSize), 1, output); + uint32_t numLabels = tagsForWord[kUnknownWordTags].size(); + fwrite(&numLabels, sizeof(numLabels), 1, output); + + // create table + TableElement* table = (TableElement*) malloc(sizeof(TableElement) * tableSize); + memset(table, 0, sizeof(TableElement) * tableSize); + + // write entries + int num = 0; + int totalNumCollisions = 0; + int numTags = 0; + int sizeOfKeys = 0; + for(SymbolsIterator siter(wordSymbols); !siter.Done(); siter.Next()) { + std::string word = siter.Symbol(); + if(tagsForWordEntry.find(siter.Value()) == tagsForWordEntry.end()) { + continue; + } + int64 id = tagsForWordEntry[siter.Value()]; + + num++; + TableElement element; + element.hashValue = Hash(word.c_str(), word.length()) % tableSize; + element.keySize = (uint8_t) word.length(); + element.dataSize = tagsForWord[id].size(); + numTags += element.dataSize; + element.location = (uint32_t) ftell(output); + fwrite(word.c_str(), element.keySize, 1, output); + sizeOfKeys += element.keySize; + for(size_t tag = 0; tag < tagsForWord[id].size(); tag++) { + lexicon_tag_t packed = (lexicon_tag_t) tagsForWord[id][tag]; + fwrite(&packed, sizeof(packed), 1, output); + } + if(element.dataSize > 0) { + uint32_t hash = element.hashValue % tableSize; + int numCollisions = 0; + while(table[hash].location != 0) { + numCollisions++; + hash = (hash + 1) % tableSize; + } + totalNumCollisions += numCollisions; + table[hash] = element; + } + } + std::cerr << "avg collisions: " << 1.0 * totalNumCollisions / (double) wordSymbols.NumSymbols() << "\n"; + std::cerr << "sizeof (keys) = " << sizeOfKeys << "\n"; + std::cerr << "sizeof (tags) = " << sizeof(lexicon_tag_t) << " * " << numTags << "\n"; + std::cerr << "sizeof (entry in table) = " << sizeof_LexiconTableElement << " * " << tableSize << "\n"; + + // write table + dataLocation = (uint32_t) ftell(output); + for(uint32_t i = 0; i < tableSize; i++) { + fwrite(&table[i], sizeof(table[i]), 1, 
output); + } + free(table); + + // set feature locations + fseek(output, dataLocationOffset, SEEK_SET); + fwrite(&dataLocation, sizeof(dataLocation), 1, output); + + fclose(output); + return true; + } + + bool Load(const std::string& filename) { + isBinary = false; + + struct stat sb; + fd = open(filename.c_str(), O_RDONLY); + if(fd == -1) { + std::cerr << "ERROR: could not open crf lexicon \"" << filename << "\"\n"; + return false; + } + if (fstat(fd, &sb) == -1) { + std::cerr << "ERROR: could not fstat crf lexicon \"" << filename << "\"\n"; + return false; + } + dataLength = sb.st_size; + data = (const char*) mmap(NULL, dataLength, PROT_READ, MAP_PRIVATE, fd, 0); + if(data == MAP_FAILED) { + perror("mmap"); + std::cerr << "ERROR: could mmap() crf lexicon \"" << filename << "\"\n"; + return false; + } + + info = (const ModelInfo*) data; + + // read magic + if(info->magic != lexiconMagic) { + bool result = Lexicon::Load(filename); + if(result == false) { + std::cerr << "ERROR: invalid magic or unsupported version in binary crf lexicon. Please reconvert it from text model.\n"; + } + return result; + } + + // read table + table = (const TableElement*) &data[info->dataLocation]; + + loaded = true; + isBinary = true; + return true; + } + + bool GetTagsForWord(int64 word, std::vector<int64>& output) const { + if(!isBinary) { + return Lexicon::GetTagsForWord(word, output); + } + std::cerr << "ERROR: GetTagsForWord() not supported on binary models\n"; + abort(); + return false; + } + + int NumLabels() const { + if(!isBinary) return Lexicon::NumLabels(); + return info->numLabels; + } + + bool GetTagsForWord(const std::string& word, std::vector<int64>& output) const { + if(!isBinary) { + return Lexicon::GetTagsForWord(word, output); + //std::cerr << "ERROR: called GetTagsForWord() on a non binary model\n"; + //return false; + } + if(word == "<eps>") { + output.clear(); + output.push_back(0); + } + size_t keySize = word.length(); + uint32_t hashValue = Hash(word.c_str(), keySize); // % info->tableSize; + uint32_t offset = 0; + while(offset < info->tableSize) { + uint32_t location = (hashValue + offset) % info->tableSize; + const TableElement& element = table[location]; + /*std::cerr << word << " " << location << " " << hashValue << " " << offset << " " << info->tableSize << + "|" << element.hashValue << " " << (int) element.keySize << " " << (int) element.dataSize << " " << element.location << + "\n";*/ + if(element.location == 0) break; + if(element.keySize == keySize) { + char key[keySize + 1]; + strncpy(key, &data[element.location], keySize); + key[keySize] = '\0'; + if(std::string(key) == word) { + //std::cerr << "h:" << element.hashValue << " k:" << element.keySize << " d:" << element.dataSize << "\n"; + output.clear(); + const lexicon_tag_t* tags = (const lexicon_tag_t*) &data[element.location + keySize]; + for(int i = 0; i < element.dataSize; i++) { + output.push_back(tags[i]); + } + return true; + } + } + offset++; + } + // unknown word + output.clear(); + for(int i = 1; i < (int) info->numLabels + 1; i++) output.push_back(i); + return false; + } + + }; +} diff --git a/maca_crf_tagger/src/crf_binmodel.hh b/maca_crf_tagger/src/crf_binmodel.hh new file mode 100644 index 0000000000000000000000000000000000000000..33deb53834e55bcb0007882f28fbe5573a89d60b --- /dev/null +++ b/maca_crf_tagger/src/crf_binmodel.hh @@ -0,0 +1,406 @@ +#pragma once + +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include "crf_model.hh" +#include "crf_template.hh" + +#include <sys/types.h> +#include 
<sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include <errno.h> + +#include <limits.h> +#ifdef CHAR_BIT +#if CHAR_BIT != 8 +#error CHAR_BIT != 8 not supported +#endif +#endif + +namespace macaon { +// disable alignment in MSVC++ +#pragma pack(push, 1) + struct ModelInfo { + uint32_t magic; + uint32_t templateLocation; + uint32_t numTemplates; + uint32_t labelLocation; + uint32_t numLabels; + uint32_t featureLocation; + uint32_t tableSize; + } __attribute__((packed)); + + struct TableElement { + uint32_t hashValue; + uint16_t keySize; + uint16_t dataSize; + uint32_t location; + } __attribute__((packed)); // disable alignment in g++ +#define sizeof_TableElement (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t)) + + struct LabelWeight { + uint16_t label; + float weight; + } __attribute__((packed)); +#define sizeof_LabelWeight (sizeof(uint16_t) + sizeof(float)) + + struct LabelPairWeight { + uint16_t previous; + uint16_t label; + float weight; + } __attribute__((packed)); +#define sizeof_LabelPairWeight (sizeof(uint16_t) + sizeof(uint16_t) + sizeof(float)) +#pragma pack(pop) + + namespace BinaryModelConstants { + const uint32_t magic = 0x132a0ab5; + } + + class BinaryModel : public CRFModel { + private: + bool isBinary; + int fd; + const char* data; + size_t dataLength; + const ModelInfo* info; + const TableElement* table; + std::vector<double> B_weights; // cache for B template + + // java hash function + uint32_t Hash(const char *k, size_t length) { + uint32_t output = 0; + for(size_t i = 0; i < length; i++) { + output = 31 * output + k[i]; + } + return output; + } + + public: + BinaryModel() : CRFModel(), isBinary(false) {} + BinaryModel(const std::string &filename) : CRFModel(), isBinary(false), fd(-1), data((const char*) MAP_FAILED) { + Load(filename); + } + + ~BinaryModel() { + if(data != MAP_FAILED) munmap((void*) data, dataLength); + if(fd != -1) close(fd); + } + + bool Convert(const std::string& from, const std::string& to) { + std::cerr << "loading\n"; + CRFModel::Load(from); + std::cerr << "writing\n"; + return Write(to); + } + + // trimming is already performed when writing the bin model + void TrimModel() { + std::unordered_map<std::string, int> newFeatures; + for(std::unordered_map<std::string, int>::const_iterator feature = features.begin(); feature != features.end(); feature++) { + if(feature->first[0] != 'B') { + int numNonNull = 0; + for(size_t i = 0; i < labels.size(); i++) { + float weight = weights[feature->second + i]; + if(weight != 0) numNonNull++; + } + if(numNonNull > 0) { + newFeatures[feature->first] = feature->second; + } + } else { + newFeatures[feature->first] = feature->second; + } + } + std::cerr << "trim: " << features.size() << " -> " << newFeatures.size() << "\n"; + features = newFeatures; + } + + bool Write(const std::string & filename) { + FILE* output = fopen(filename.c_str(), "w"); + // magic + fwrite(&BinaryModelConstants::magic, sizeof(BinaryModelConstants::magic), 1, output); // magic + + // templates + uint32_t templateLocation = 0; + uint32_t templateLocationOffset = (uint32_t) ftell(output); + fwrite(&templateLocation, sizeof(templateLocation), 1, output); + uint32_t numTemplates = (uint32_t) templates.size(); + fwrite(&numTemplates, sizeof(numTemplates), 1, output); + + // labels + uint32_t labelLocation = 0; + uint32_t labelLocationOffset = (uint32_t) ftell(output); + fwrite(&labelLocation, sizeof(labelLocation), 1, output); + uint32_t numLabels = (uint32_t) 
labels.size(); + fwrite(&numLabels, sizeof(numLabels), 1, output); + + // features + uint32_t featureLocation = 0; + uint32_t featureLocationOffset = (uint32_t) ftell(output); + fwrite(&featureLocation, sizeof(featureLocation), 1, output); + uint32_t tableSize = (uint32_t) features.size() * 3; + fwrite(&tableSize, sizeof(tableSize), 1, output); + + // create table + TableElement* table = (TableElement*) malloc(sizeof(TableElement) * tableSize); + memset(table, 0, sizeof(TableElement) * tableSize); + + // write templates + templateLocation = (uint32_t) ftell(output); + for(std::vector<CRFPPTemplate>::const_iterator i = templates.begin(); i != templates.end(); i++) { + fprintf(output, "%s\n", i->text.c_str()); + } + + // write labels + std::vector<std::string> labelVector(labels.size()); + for(std::unordered_map<std::string, int>::const_iterator label = labels.begin(); label != labels.end(); label++) { + labelVector[label->second] = label->first; + } + labelLocation = (uint32_t) ftell(output); + for(size_t i = 0; i < labelVector.size(); i++) { + fprintf(output, "%s\n", labelVector[i].c_str()); + } + + // write weights + int num = 0; + int totalNumCollisions = 0; + int numUnigram = 0; + int numBigram = 0; + for(std::unordered_map<std::string, int>::const_iterator feature = features.begin(); feature != features.end(); feature++) { + num++; + TableElement element; + element.hashValue = Hash(feature->first.c_str(), feature->first.length()) % tableSize; + element.keySize = (uint16_t) feature->first.length(); + element.dataSize = 0; + element.location = (uint32_t) ftell(output); + fwrite(feature->first.c_str(), element.keySize, 1, output); + if(feature->first[0] == 'B') { + for(uint16_t label = 0; label < numLabels; label++) { + for(uint16_t previous = 0; previous < numLabels; previous++) { + float weight = weights[feature->second + label + numLabels * previous]; + if(weight != 0) { + LabelPairWeight item; + item.previous = previous; + item.label = label; + item.weight = weight; + fwrite(&item, sizeof(item), 1, output); + element.dataSize ++; + } + } + } + numBigram++; + } else { + for(uint16_t label = 0; label < numLabels; label++) { + float weight = weights[feature->second + label]; + if(weight != 0) { + LabelWeight item; + item.label = label; + item.weight = weight; + fwrite(&item, sizeof(item), 1, output); + element.dataSize ++; + } + } + numUnigram++; + } + if(element.dataSize > 0) { + uint32_t hash = element.hashValue % tableSize; + int numCollisions = 0; + while(table[hash].location != 0) { + numCollisions++; + hash = (hash + 1) % tableSize; + } + totalNumCollisions += numCollisions; + //std::cout << element.hashValue << " " << feature->first << "\n"; + table[hash] = element; + } + } + std::cerr << "avg collisions: " << 1.0 * totalNumCollisions / (double) features.size() << "\n"; + std::cerr << "sizeof (label+weight) = " << sizeof_LabelWeight << " * " << numUnigram << "\n"; + std::cerr << "sizeof (label+label+weight) = " << sizeof_LabelPairWeight << " * " << numBigram << "\n"; + std::cerr << "sizeof (entry in table) = " << sizeof_TableElement << " * " << tableSize << "\n"; + + // write table + featureLocation = (uint32_t) ftell(output); + for(uint32_t i = 0; i < tableSize; i++) { + fwrite(&table[i], sizeof(table[i]), 1, output); + } + free(table); + + // set section locations + fseek(output, templateLocationOffset, SEEK_SET); + fwrite(&templateLocation, sizeof(templateLocation), 1, output); + + // set label locations + fseek(output, labelLocationOffset, SEEK_SET); + fwrite(&labelLocation, 
sizeof(labelLocation), 1, output); + + // set feature locations + fseek(output, featureLocationOffset, SEEK_SET); + fwrite(&featureLocation, sizeof(featureLocation), 1, output); + + fclose(output); + return true; + } + + bool Load(const std::string& filename) { + isBinary = false; + + struct stat sb; + fd = open(filename.c_str(), O_RDONLY); + if(fd == -1) { + std::cerr << "ERROR: could not open crf model \"" << filename << "\"\n"; + return false; + } + if (fstat(fd, &sb) == -1) { + std::cerr << "ERROR: could not fstat crf model \"" << filename << "\"\n"; + return false; + } + dataLength = sb.st_size; + data = (const char*) mmap(NULL, dataLength, PROT_READ, MAP_PRIVATE, fd, 0); + if(data == MAP_FAILED) { + perror("mmap"); + std::cerr << "ERROR: could mmap() crf model \"" << filename << "\"\n"; + return false; + } + name = filename; + + info = (const ModelInfo*) data; + + // read magic + if(info->magic != BinaryModelConstants::magic) { + //std::cerr << "WARNING: binary crf model format not recognized, trying text model\n"; + return CRFModel::Load(filename); + } + + size_t lineSize = 0; + + // read templates + templates.clear(); + const char* line = (const char*) &data[info->templateLocation]; + for(size_t i = 0; i < info->numTemplates; i++) { + lineSize = strchr(line, '\n') - line; + char content[lineSize + 1]; + strncpy(content, line, lineSize); + content[lineSize] = '\0'; + //std::cerr << "TEMPLATE[" << content << "]\n"; + templates.push_back(CRFPPTemplate(content)); + line += lineSize + 1; + } + + // read labels + labels.clear(); + reverseLabels.clear(); + line = (const char*) &data[info->labelLocation]; + for(uint32_t i = 0; i < info->numLabels; i++) { + lineSize = strchr(line, '\n') - line; + char content[lineSize + 1]; + strncpy(content, line, lineSize); + content[lineSize] = '\0'; + //std::cerr << "LABEL[" << content << "]\n"; + labels[std::string(content)] = (int) i; + reverseLabels.push_back(std::string(content)); + line += lineSize + 1; + } + + // read table + table = (const TableElement*) &data[info->featureLocation]; + + ComputeWindowOffset(); + loaded = true; + isBinary = true; + GetWeights("B", B_weights); + return true; + } + + bool GetWeights(const std::string& feature, std::vector<double>& output) { + if(!isBinary) { + std::cerr << "ERROR: called GetWeights() on a non binary model\n"; + return false; + } + size_t keySize = feature.length(); + uint32_t hashValue = Hash(feature.c_str(), keySize) % info->tableSize; + uint32_t offset = 0; + size_t numLabels = labels.size(); + while(offset < info->tableSize) { + uint32_t location = (hashValue + offset) % info->tableSize; + const TableElement& element = table[location]; + if(element.location == 0) return false; + if(element.keySize == keySize) { + char key[keySize + 1]; + strncpy(key, &data[element.location], keySize); + key[keySize] = '\0'; + if(std::string(key) == feature) { + //std::cerr << "h:" << element.hashValue << " k:" << element.keySize << " d:" << element.dataSize << "\n"; + if(feature[0] == 'B') { + output.assign(numLabels * numLabels, 0); + const LabelPairWeight* items = (const LabelPairWeight*) &data[element.location + keySize]; + for(int i = 0; i < element.dataSize; i++) { + output[items[i].label + numLabels * items[i].previous] = items[i].weight; + } + } else { + output.assign(numLabels, 0); + const LabelWeight* items = (const LabelWeight*) &data[element.location + keySize]; + for(int i = 0; i < element.dataSize; i++) { + output[items[i].label] = items[i].weight; + } + } + return true; + } + } + offset++; + } + 
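+            // the whole table was probed without a match: the feature has no
+            // non-zero weights in this model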
return false; + } + + /* note: this function can use bigram templates conditionned on observations */ + double rescore(const std::vector<std::vector<std::string> > &input, const std::vector<int> &context, const std::vector<int> &context_tags) { + if(!isBinary) { + return CRFModel::rescore(input, context, context_tags); + } + double output = 0; + if((int) context.size() != window_length) return 0; + if(context[window_offset] < 0) return 0; + const int label = context_tags[window_offset]; //ilabels[input[context[window_offset]][input[context[window_offset]].size() - 1]]; + int previous = -1; + if(window_length > 1 && context[window_offset - 1] >=0) previous = context_tags[window_offset - 1]; + for(std::vector<CRFPPTemplate>::const_iterator i = templates.begin(); i != templates.end(); i++) { + std::string feature = i->applyToClique(input, context, window_offset); + std::vector<double> feature_weights; + if(GetWeights(feature, feature_weights)) { + if(i->type == CRFPPTemplate::UNIGRAM) output += feature_weights[label]; + else if(previous != -1) output += feature_weights[label + labels.size() * previous]; + } + } + return output; + } + + /* note: this function CANNOT use bigram templates conditionned on observations */ + double transition(int previous, int label) { + if(!isBinary) return CRFModel::transition(previous, label); + return B_weights[label + info->numLabels * previous]; + } + + void emissions(const std::vector<std::vector<std::string> > &input, const std::vector<int> &context, std::vector<double>& output) { + if(!isBinary) { + CRFModel::emissions(input, context, output); + return; + } + output.assign(labels.size(), 0); + if((int) context.size() != window_length) return; + if(context[window_offset] == -1) return; + for(std::vector<CRFPPTemplate>::const_iterator i = templates.begin(); i != templates.end(); i++) { + if(i->type == CRFPPTemplate::UNIGRAM) { + std::string feature = i->applyToClique(input, context, window_offset); + std::vector<double> feature_weights; + if(GetWeights(feature, feature_weights)) { + for(size_t label = 0; label < labels.size(); label++) + output[label] += feature_weights[label]; + } + } + } + } + }; +} diff --git a/maca_crf_tagger/src/crf_decoder.hh b/maca_crf_tagger/src/crf_decoder.hh new file mode 100644 index 0000000000000000000000000000000000000000..0c89aebddbefe82f06cdf5a3412c15dfd695a867 --- /dev/null +++ b/maca_crf_tagger/src/crf_decoder.hh @@ -0,0 +1,147 @@ +#pragma once +#include <list> +#ifdef __APPLE__ +#include "../../../third_party/unordered_map/unordered_map.hpp" +#else +#include <unordered_map> +#endif +#include "crf_binmodel.hh" +#include "crf_utils.hh" +#include "crf_binlexicon.hh" + +namespace macaon { + + struct Decoder { + + //CRFModel model; + BinaryModel model; + Symbols tagSet; + + Decoder() : tagSet("tagset") { } + Decoder(const std::string &filename) : tagSet("tagset") { + model.Load(filename); + tagSet.AddSymbol("<eps>", 0); + for(std::unordered_map<std::string, int>::const_iterator label = model.labels.begin(); label != model.labels.end(); label++) { + tagSet.AddSymbol(label->first, label->second + 1); + } + } + + Symbols* getTagset() { + return &tagSet; + } + + bool IsLoaded() const { + return model.IsLoaded(); + } + + /* Faster decoder for simple sequences. + * This function supports an optional lexicon to specify allowed word/tags. And attional option sets the location of the word in the feature vector. 
+ * */ + void decodeString(const std::vector<std::vector<std::string> > &features, std::vector<std::string> &predictions, const BinaryLexicon* lexicon=NULL, int wordFeatureLocation=0) { + int length = features.size(); + int numLabels = model.labels.size(); + + /*if(lexicon->NumLabels() != numLabels) { + std::cerr << "ERROR: num label mismatch between model and lexicon\n"; + return; + }*/ + + // store score and backtrack matrices (TODO: size matrices according to possible word/tag assoc) + std::vector<std::vector<double> > scores(length, std::vector<double>(numLabels, 0.0)); + std::vector<std::vector<int> > backtrack(length, std::vector<int>(numLabels, 0)); + /*double** scores = new double*[length]; + int** backtrack = new int*[length]; + + for(int i = 0; i < length; i++) { + scores[i] = new double[numLabels]; + backtrack[i] = new int[numLabels]; + for(int j = 0; j < numLabels; j++) { + backtrack[i][j] = -1; + scores[i][j] = 0.0; + } + }*/ + + // possible tags for each word: use lexicon if provided + std::vector<std::vector<int64> > wordTags(length); + std::vector<int64> allTags(numLabels); + for(int label = 0; label < numLabels; label++) allTags[label] = label + 1; // warning: there is an offset of one for epsilon transitions + + // perform viterbi search for the maximum scoring labeling + for(int current = 0; current < length; current++) { + // honor lexicon or allow all tags + if(lexicon != NULL) lexicon->GetTagsForWord(features[current][wordFeatureLocation], wordTags[current]); + else wordTags[current] = allTags; + + // create context vector (offset of features for current word) + std::vector<int> context(model.window_length); + for(int i = 0; i < model.window_length; i++) + if(current + i - model.window_offset >= 0 && current + i - model.window_offset < length) context[i] = (current + i - model.window_offset); + else context[i] = -1; + + // compute emissions and find highest scoring transition pair + if(current == 0) { + // TODO: compute emissions only for valid word/tags pairs + std::vector<double> emissions; + model.emissions(features, context, emissions); + for(int e = 0; e < numLabels; e++) scores[current][e] = emissions[e]; + } else { + std::vector<double> emissions; + model.emissions(features, context, emissions); + for(int e = 0; e < numLabels; e++) scores[current][e] = emissions[e]; + for(size_t i = 0; i < wordTags[current].size(); i++) { + int label = wordTags[current][i] - 1; + if(label < 0 || label >= numLabels) { + std::cerr << "ERROR: unexpected label (" << label << ") from lexicon, please check that it is compatible with model.\n"; + return; + } + double max = 0; + int argmax = -1; + for(size_t j = 0; j < wordTags[current - 1].size(); j++) { + int previous = wordTags[current - 1][j] - 1; + if(previous < 0 || previous >= numLabels) { + std::cerr << "ERROR: unexpected label (" << previous << ") from lexicon, please check that it is compatible with model.\n"; + return; + } + double score = scores[current][label] + scores[current - 1][previous] + model.transition(previous, label); + if(argmax == -1 || max < score) { + max = score; + argmax = previous; + } + } + scores[current][label] = max; + backtrack[current][label] = argmax; + } + } + } + // find last label + double max = 0; + int argmax = -1; + if(length > 0) { + for(size_t i = 0; i < wordTags[length - 1].size(); i++) { + int label = wordTags[length - 1][i] - 1; + if(argmax == -1 || scores[length - 1][label] > max) { + max = scores[length - 1][label]; + argmax = label; + } + } + } + + // backtrack solution + int current = 
length - 1; + predictions.clear(); + predictions.resize(length); + while(current >= 0) { + predictions[current] = model.reverseLabels[argmax]; + argmax = backtrack[current][argmax]; + current --; + } + + /*for(int i = 0; i < length; i++) { + delete scores[i]; + delete backtrack[i]; + } + delete scores; + delete backtrack;*/ + } + }; +} diff --git a/maca_crf_tagger/src/crf_features.hh b/maca_crf_tagger/src/crf_features.hh new file mode 100644 index 0000000000000000000000000000000000000000..917a395b7a5ba9d31bf3578c93bcfe1e98eedfe9 --- /dev/null +++ b/maca_crf_tagger/src/crf_features.hh @@ -0,0 +1,100 @@ +#pragma once +#include <vector> +#include <string> + +namespace macaon { + class FeatureGenerator { + static void prefixesUtf8(const std::string &word, int n, std::vector<std::string> &output) { + size_t offset = 0; + while(offset < word.length() && n > 0) { + if((unsigned char)word[offset] >> 7 == 1) { // 1xxxxxxx (length of utf8 character) + offset++; + while(offset < word.length() && (unsigned char)word[offset] >> 6 == 2) { // 10xxxxxx (continuation of character) + offset++; + } + } else { + offset++; + } + + output.push_back(word.substr(0, offset)); + n--; + } + while(n > 0) { + output.push_back("__nil__"); + n--; + } + } + + static void suffixesUtf8(const std::string &word, int n, std::vector<std::string> &output) { + std::vector<int> char_starts; + size_t offset = 0; + while(offset < word.length()) { + char_starts.push_back(offset); + if((unsigned char)word[offset] >> 7 == 1) { // 1xxxxxxx (length of utf8 character) + offset++; + while(offset < word.length() && (unsigned char)word[offset] >> 6 == 2) { // 10xxxxxx (continuation of character) + offset++; + } + } else { + offset++; + } + } + for(int i = char_starts.size() - 1; i > 0 && n > 0; i--) { + //std::cerr << "s=[" << word.substr(offsets[i]) << "]\n"; + output.push_back(word.substr(char_starts[i])); + n--; + } + while(n > 0) { + output.push_back("__nil__"); + n--; + } + } + + + static void prefixes(const std::string &word, int n, std::vector<std::string> &output) { + int length = word.length(); + for(int i = 1; i <= n; i++) { + if(length >= i) output.push_back(word.substr(0, i)); + else output.push_back("__nil__"); + } + } + static void suffixes(const std::string &word, int n, std::vector<std::string> &output) { + int length = word.length(); + for(int i = 1; i <= n; i++) { + if(length >= i) output.push_back(word.substr(length - i, i)); + else output.push_back("__nil__"); + } + } + static void wordClasses(const std::string &word, std::vector<std::string> &output) { + bool containsNumber = false; + bool containsSymbol = false; + for(int i = 0; i < (int) word.length(); i++) { + if(!containsNumber && word.at(i) >= '0' && word.at(i) <= '9') containsNumber = true; + if(!containsSymbol && !((word.at(i) >= '0' && word.at(i) <= '9') || (word.at(i) >= 'a' && word.at(i) <= 'z') || (word.at(i) >= 'A' && word.at(i) <= 'Z'))) containsSymbol = true; + } + if(containsNumber) output.push_back("Y"); + else output.push_back("N"); + if(word.length() >= 2 && word.at(0) >= 'A' && word.at(0) <= 'Z' && word.at(1) >= 'a' && word.at(1) <= 'z') output.push_back("Y"); + else output.push_back("N"); + if(containsSymbol) output.push_back("Y"); + else output.push_back("N"); + } + public: + static void get_pos_features(const std::string &word, std::vector<std::string> &output, bool utf8=true) { + output.push_back(word); + wordClasses(word, output); + if(utf8) { + prefixesUtf8(word, 4, output); + suffixesUtf8(word, 4, output); + } else { + prefixes(word, 4, 
output);
+                suffixes(word, 4, output);
+            }
+        }
+        static std::vector<std::string> get_pos_features(const std::string &word, bool utf8=true) {
+            std::vector<std::string> output;
+            get_pos_features(word, output, utf8);
+            return output;
+        }
+    };
+}
diff --git a/maca_crf_tagger/src/crf_lexicon.hh b/maca_crf_tagger/src/crf_lexicon.hh
new file mode 100644
index 0000000000000000000000000000000000000000..5182c1fbb0142053a761b9f77029227b68bb70be
--- /dev/null
+++ b/maca_crf_tagger/src/crf_lexicon.hh
@@ -0,0 +1,128 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <stdint.h>
+#ifdef __APPLE__
+#include "../../../third_party/unordered_map/unordered_map.hpp"
+#else
+#include <unordered_map>
+#endif
+#include "crf_utils.hh"
+
+namespace macaon {
+    const int kEpsilonTags = 0;
+    const int kUnknownWordTags = 1;
+
+    class Lexicon {
+    protected:
+        bool loaded;
+
+        Symbols wordSymbols;
+        Symbols* tagSymbols;
+
+        std::vector<std::vector<int64> > tagsForWord;
+        std::unordered_map<int64, int> tagsForWordEntry;
+
+
+    public:
+        Lexicon() : loaded(false), wordSymbols("words"), tagSymbols(NULL) {
+            wordSymbols.AddSymbol("<eps>", 0);
+        }
+
+        Lexicon(const std::string& filename, Symbols* _tagSymbols) : loaded(false), wordSymbols("words"), tagSymbols(_tagSymbols) {
+            wordSymbols.AddSymbol("<eps>", 0);
+            Load(filename);
+        }
+
+        virtual ~Lexicon() {
+        }
+
+        int NumLabels() const {
+            return tagSymbols->NumSymbols() - 1; // account for epsilon
+        }
+
+        bool Load(const std::string &filename) {
+            tagsForWord.push_back(std::vector<int64>()); // keep space for epsilon tags
+            tagsForWord.push_back(std::vector<int64>()); // keep space for unk word tags
+            loaded = false;
+            std::unordered_map<std::string, int> known;
+            std::ifstream input(filename.c_str());
+            if(!input.is_open()) {
+                std::cerr << "ERROR: could not open " << filename << " in Lexicon::Load()" << std::endl;
+                return false;
+            }
+            while(!input.eof()) {
+                std::string line;
+                std::getline(input, line);
+                if(input.eof()) break;
+                std::string word;
+                std::string::size_type end_of_word = line.find('\t');
+                if(end_of_word == std::string::npos) {
+                    return false;
+                }
+                word = line.substr(0, end_of_word);
+                int64 wordId = wordSymbols.AddSymbol(word);
+                std::string signature = line.substr(end_of_word + 1);
+                std::unordered_map<std::string, int>::const_iterator found = known.find(signature);
+                if(found == known.end()) {
+                    int id = tagsForWord.size();
+                    known[signature] = id;
+                    tagsForWordEntry[wordId] = id;
+                    std::vector<std::string> tokens;
+                    Tokenize(signature, tokens, "\t");
+                    std::vector<int64> tagset;
+                    for(std::vector<std::string>::const_iterator i = tokens.begin(); i != tokens.end(); i++) {
+                        int64 tagId = tagSymbols->Find(*i);
+                        if(tagId != -1) tagset.push_back(tagId);
+                    }
+                    tagsForWord.push_back(tagset);
+                } else {
+                    tagsForWordEntry[wordId] = found->second;
+                }
+            }
+            tagsForWord[kEpsilonTags].push_back(0); // epsilon
+            for(SymbolsIterator siter(*tagSymbols); !siter.Done(); siter.Next()) { // unknown word
+                if(siter.Value() != 0) tagsForWord[kUnknownWordTags].push_back(siter.Value());
+            }
+            loaded = true;
+            return loaded;
+        }
+
+        virtual bool GetTagsForWord(const std::string& word, std::vector<int64>& output) const {
+            return GetTagsForWord(wordSymbols.Find(word), output);
+        }
+
+        virtual bool GetTagsForWord(int64 word, std::vector<int64>& output) const {
+            if(!IsLoaded()) {
+                std::cerr << "ERROR: Lexicon::GetTagsForWord(" << wordSymbols.Find(word) << ") called on empty lexicon" << std::endl;
+                return false;
+            }
+            if(word == -1) {
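+                // a word id of -1 means the word is not in the lexicon's symbol
+                // table: fall back to the unknown-word tag set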
+                output = tagsForWord[kUnknownWordTags];
+                return true;
+            }
+            if(word == 0) {
+                output = tagsForWord[kEpsilonTags];
+                return true;
+            }
+            std::unordered_map<int64, int>::const_iterator found = tagsForWordEntry.find(word);
+            if(found == tagsForWordEntry.end()) {
+                output = tagsForWord[kUnknownWordTags];
+            } else {
+                if(tagsForWord[found->second].size() == 0) {
+                    std::cerr << "WARNING: inconsistency between word/tag lexicon and model, word no " << word << " has no tags => treat as unknown word\n";
+                    output = tagsForWord[kUnknownWordTags];
+                } else {
+                    output = tagsForWord[found->second];
+                }
+            }
+            return true;
+        }
+
+        bool IsLoaded() const {
+            return loaded;
+        }
+
+    };
+}
diff --git a/maca_crf_tagger/src/crf_model.hh b/maca_crf_tagger/src/crf_model.hh
new file mode 100644
index 0000000000000000000000000000000000000000..6547460fb5beff1d79c2ff1b7189169c602c2154
--- /dev/null
+++ b/maca_crf_tagger/src/crf_model.hh
@@ -0,0 +1,170 @@
+#pragma once
+#include <string>
+#include <vector>
+#ifdef __APPLE__
+#include "../../../third_party/unordered_map/unordered_map.hpp"
+#else
+#include <unordered_map>
+#endif
+#include <stdio.h>
+#include <errno.h>
+#include "crf_template.hh"
+
+namespace macaon {
+    class CRFModel {
+    protected:
+        std::string name;
+        std::vector<CRFPPTemplate> templates;
+        int version;
+        double cost_factor;
+        int maxid;
+        int xsize;
+        std::unordered_map<std::string, int> features;
+        std::vector<float> weights;
+        bool loaded;
+        int bigramWeightLocation;
+    public:
+        std::unordered_map<std::string, int> labels;
+        std::vector<std::string> reverseLabels;
+        int window_offset;
+        int window_length;
+        CRFModel() : loaded(false), bigramWeightLocation(-1) {}
+        CRFModel(const std::string &filename) : loaded(false), bigramWeightLocation(-1) { Load(filename); }
+
+        bool Load(const std::string &filename) {
+            name = filename;
+            FILE* fp = fopen(filename.c_str(), "r");
+            if(!fp) {
+                fprintf(stderr, "ERROR: %s, %s\n", filename.c_str(), strerror(errno));
+                return false;
+            }
+            char line[1024];
+            int section = 0;
+            int header_num = 0;
+            int line_num = 0;
+            int num_non_null = 0;
+            while(NULL != fgets(line, 1024, fp)) {
+                line_num ++;
+                if(line[0] == '\n') {
+                    section ++;
+                } else {
+                    line[1023] = '\0';
+                    line[strlen(line) - 1] = '\0'; // chomp
+                    if(section == 0) { // header
+                        char* space = line;
+                        while(*space != ' ' && *space != '\0') space ++;
+                        if(header_num == 0) version = strtol(space + 1, NULL, 10);
+                        else if(header_num == 1) cost_factor = strtod(space + 1, NULL);
+                        else if(header_num == 2) maxid = strtol(space + 1, NULL, 10);
+                        else if(header_num == 3) xsize = strtol(space + 1, NULL, 10);
+                        else {
+                            fprintf(stderr, "ERROR: unexpected header line %d in %s\n", line_num, filename.c_str());
+                            fclose(fp);
+                            return false;
+                        }
+                        header_num ++;
+                    } else if (section == 1) { // labels
+                        int next_id = labels.size();
+                        labels[std::string(line)] = next_id;
+                        reverseLabels.push_back(std::string(line));
+                    } else if (section == 2) { // templates
+                        templates.push_back(CRFPPTemplate(line));
+                    } else if (section == 3) { // feature indexes
+                        char* space = line;
+                        while(*space != ' ' && *space != '\0') space ++;
+                        *space = '\0';
+                        int index = strtol(line, NULL, 10);
+                        features[std::string(space + 1)] = index;
+                    } else if (section == 4) { // weights
+                        float weight = (float) strtod(line, NULL);
+                        if(weight != 0) num_non_null++;
+                        weights.push_back(weight);
+                    } else {
+                        fprintf(stderr, "ERROR: too many sections in %s\n", filename.c_str());
+                        fclose(fp);
+                        return false;
+                    }
+                }
+            }
+            //std::cerr << "weights: " << num_non_null << "/" << weights.size() << "\n";
+            fclose(fp);
+
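+            // derive the feature window (window_offset / window_length) needed by
+            // the decoder from the line offsets used in the templates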
ComputeWindowOffset(); + + std::unordered_map<std::string, int>::const_iterator found = features.find("B"); + if(found != features.end()) { + bigramWeightLocation = found->second; + } + loaded = true; + return true; + } + + void ComputeWindowOffset() { + int max_template_offset = 0; + int min_template_offset = 9; + for(std::vector<CRFPPTemplate>::const_iterator i = templates.begin(); i != templates.end(); i++) { + if(i->type == CRFPPTemplate::BIGRAM && min_template_offset > -1) min_template_offset = -1; // account for label bigram + for(std::vector<TemplateItem>::const_iterator j = i->items.begin(); j != i->items.end(); j++) { + if(j->line < min_template_offset) min_template_offset = j->line; + if(j->line > max_template_offset) max_template_offset = j->line; + } + } + window_offset = - min_template_offset; + window_length = max_template_offset - min_template_offset + 1; + } + + bool IsLoaded() const { + return loaded; + } + + /* note: this function can use bigram templates conditionned on observations */ + virtual double rescore(const std::vector<std::vector<std::string> > &input, const std::vector<int> &context, const std::vector<int> &context_tags) { + double output = 0; + if((int) context.size() != window_length) return 0; + //std::cerr << context[window_offset] << std::endl; + if(context[window_offset] < 0) return 0; + const int label = context_tags[window_offset]; //ilabels[input[context[window_offset]][input[context[window_offset]].size() - 1]]; + int previous = -1; + if(window_length > 1 && context[window_offset - 1] >=0) previous = context_tags[window_offset - 1]; //labels[input[context[window_offset - 1]][input[context[window_offset - 1]].size() - 1]]; + for(std::vector<CRFPPTemplate>::const_iterator i = templates.begin(); i != templates.end(); i++) { + std::string feature = i->applyToClique(input, context, window_offset); + //std::cerr << "feature: " << feature << std::endl; + std::unordered_map<std::string, int>::const_iterator found = features.find(feature); + if(found != features.end()) { + if(found->second >= 0 && found->second < (int) weights.size()) { + if(i->type == CRFPPTemplate::UNIGRAM) output += weights[found->second + label]; + else if(previous != -1) output += weights[found->second + label + labels.size() * previous]; + } + } + } + return output; + } + + /* note: this function CANNOT use bigram templates conditionned on observations */ + virtual double transition(int previous, int label) { + if(bigramWeightLocation < 0) return 0; + return weights[bigramWeightLocation + label + labels.size() * previous]; + } + + virtual void emissions(const std::vector<std::vector<std::string> > &input, const std::vector<int> &context, std::vector<double>& output) { + output.clear(); + output.resize(labels.size()); + if((int) context.size() != window_length) return; + if(context[window_offset] == -1) return; + for(std::vector<CRFPPTemplate>::const_iterator i = templates.begin(); i != templates.end(); i++) { + std::string feature = i->applyToClique(input, context, window_offset); + //std::cerr << " " << feature; + std::unordered_map<std::string, int>::const_iterator found = features.find(feature); + if(found != features.end()) { + if(found->second >= 0 && found->second < (int) weights.size()) { + if(i->type == CRFPPTemplate::UNIGRAM) + for(size_t label = 0; label < labels.size(); label++) + output[label] += weights[found->second + label]; + } + } + //else std::cerr << "*"; + } + //std::cerr << "\n"; + } + }; +} diff --git a/maca_crf_tagger/src/crf_tagger 
b/maca_crf_tagger/src/crf_tagger new file mode 100755 index 0000000000000000000000000000000000000000..48867b39b0ef4bffed7927ba5b51a0cbe81cb1e2 Binary files /dev/null and b/maca_crf_tagger/src/crf_tagger differ diff --git a/maca_crf_tagger/src/crf_tagger.cc b/maca_crf_tagger/src/crf_tagger.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9fba5a923897863404933d34a377e2249b24a0d --- /dev/null +++ b/maca_crf_tagger/src/crf_tagger.cc @@ -0,0 +1,60 @@ +#include <vector> +#include "crf_decoder.hh" +#include "crf_binlexicon.hh" +#include "crf_features.hh" + +void tag_sentence(macaon::Decoder& decoder, macaon::BinaryLexicon* lexicon, const std::vector<std::string>& words) { + + std::vector<std::vector<std::string> > features; + for(size_t i = 0; i < words.size(); i++) { + std::vector<std::string> word_features; + macaon::FeatureGenerator::get_pos_features(words[i], word_features); + features.push_back(word_features); + /*for(size_t j = 0; j < word_features.size(); j++) std::cout << word_features[j] << " "; + std::cout << "\n";*/ + } + std::vector<std::string> tagged; + decoder.decodeString(features, tagged, lexicon); + for(size_t i = 0; i < tagged.size(); i++) { + if(i > 0) std::cout << " "; + std::cout << words[i] << "/" << tagged[i]; + } + std::cout << "\n"; +} + +void usage(const char* argv0) { + std::cerr << "usage: " << argv0 << " <model> [lexicon]\n"; + exit(1); +} + +int main(int argc, char** argv) { + std::string modelName = ""; + std::string lexiconName = ""; + + for(int i = 1; i < argc; i++) { + std::string arg = argv[i]; + if(arg == "-h" || arg == "--help") { + usage(argv[0]); + } else if(modelName == "") { + modelName = arg; + } else if(lexiconName =="") { + lexiconName = arg; + } else { + usage(argv[0]); + } + } + if(modelName == "") usage(argv[0]); + + macaon::Decoder decoder(modelName); + macaon::BinaryLexicon *lexicon = NULL; + if(lexiconName != "") lexicon = new macaon::BinaryLexicon(lexiconName, decoder.getTagset()); + + std::string line; + while(std::getline(std::cin, line)) { + std::vector<std::string> words; + macaon::Tokenize(line, words, " "); + tag_sentence(decoder, lexicon, words); + } + if(lexicon) delete lexicon; + return 0; +} diff --git a/maca_crf_tagger/src/crf_template.hh b/maca_crf_tagger/src/crf_template.hh new file mode 100644 index 0000000000000000000000000000000000000000..7307f14832592d0f06390677500dc1d351401a7e --- /dev/null +++ b/maca_crf_tagger/src/crf_template.hh @@ -0,0 +1,146 @@ +#pragma once +#include <vector> +#include <string> +#include <iostream> +#include <sstream> +#include <string.h> +#include <stdlib.h> +#include <algorithm> + +namespace macaon { + // from http://www.jb.man.ac.uk/~slowe/cpp/itoa.html + static std::string number_to_string(const int value) { + const int base = 10; + std::string buf; + buf.reserve(35); + int quotient = value; + do { + buf += "0123456789abcdef"[ abs( quotient % base ) ]; + quotient /= base; + } while (quotient); + if (value < 0) buf += '-'; + reverse( buf.begin(), buf.end() ); + return buf; + } + + struct TemplateItem { + int line; + int column; + std::string prefix; + TemplateItem(const int _line, const int _column, const std::string &_prefix) : line(_line), column(_column), prefix(_prefix) { } + //friend std::ostream &operator<<(std::ostream &, const TemplateItem & ); + }; + + struct CRFPPTemplate { + enum TemplateType { + UNIGRAM, + BIGRAM, + }; + std::string text; + TemplateType type; + int size; + std::string suffix; + std::vector<TemplateItem> items; + CRFPPTemplate() {} + 
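+        // build a template directly from a CRF++ template line; see read() below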
CRFPPTemplate(const char* input) { read(input); } + friend std::ostream &operator<<(std::ostream &, const CRFPPTemplate & ); + + std::string apply(const std::vector<std::vector<std::string> > &clique, int offset) const { + std::ostringstream output; + for(std::vector<TemplateItem>::const_iterator i = items.begin(); i != items.end(); i++) { + output << i->prefix; + int column = i->column; + int line = i->line + offset; + if(line >= 0 && line < (int) clique.size()) { + if(column >= 0 && column < (int) clique[line].size()) { + output << clique[line][column]; + } else { + std::cerr << "ERROR: invalid column " << column << " in template \"" << text << "\"\n"; + return ""; + } + } else { + output << "_B"; + output << number_to_string(line); + } + } + output << suffix; + return output.str(); + } + + std::string applyToClique(const std::vector<std::vector<std::string> > &features, const std::vector<int> &clique, int offset) const { + std::string output; + for(std::vector<TemplateItem>::const_iterator i = items.begin(); i != items.end(); i++) { + output += i->prefix; + int column = i->column; + int line = i->line; + if(line + offset >= 0 && line + offset < (int) clique.size() && clique[line + offset] >=0) { + if(column >= 0 && column < (int) features[clique[line + offset]].size()) { + output += features[clique[line + offset]][column]; + } else { + std::cerr << "ERROR: invalid column " << column << " in template \"" << text << "\"\n"; + return ""; + } + } else { + output += "_B"; + output += number_to_string(line); + } + } + output += suffix; + return output; + } + + void read(const char* input) { + text = input; + size = 0; + const char* current = input; + const char* gap_start = NULL, *gap_end = NULL, *line_start = NULL, *column_start = NULL; + int state = 0; + gap_start = current; + /* template is a succession of %x[-?\d+,\d+] which must be replaced by corresponding + * features at the given line, column relative to the current example. + * They are parsed with a rudimentary state machine, and stored in the template. 
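+             * e.g. applied at position 1 of the two-token sequence {"the"}, {"cat"},
+             * the template "U02:%x[-1,0]/%x[0,0]" expands to the feature "U02:the/cat";
+             * positions outside the sequence are rendered as "_B<line>" markers.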
+ */ + if(*current == 'U') type = UNIGRAM; + else if(*current == 'B') type = BIGRAM; + else { + std::cerr << "ERROR: unexpected template type \"" << input << "\"\n"; + return; + } + while(*current != '\0') { + if(state == 0 && *current == '%') { state ++; gap_end = current; } + else if(state == 1 && *current == 'x') { state ++; } + else if(state == 2 && *current == '[') state ++; + else if(state == 3 && (*current == '-' || (*current >= '0' && *current <= '9'))) { state ++; line_start = current; } + else if(state == 4 && (*current >= '0' && *current <= '9')); + else if(state == 4 && *current == ',') { state ++; } + else if(state == 5 && (*current >= '0' && *current <= '9')) { state ++; column_start = current; } + else if(state == 6 && (*current >= '0' && *current <= '9')); + else if(state == 6 && *current == ']') { + state = 0; + std::string gap = std::string(gap_start, gap_end - gap_start); + int column = strtol(column_start, NULL, 10); + int line = strtol(line_start, NULL, 10); + items.push_back(TemplateItem(line, column, gap)); + size++; + gap_start = current + 1; + } else state = 0; + current ++; + } + suffix = gap_start; // add trailing text + } + }; + + /*std::ostream &operator<<(std::ostream &output, const macaon::TemplateItem &item) { + output << item.prefix << "%x[" << item.line << "," << item.column << "]"; + return output; + } + + std::ostream &operator<<(std::ostream &output, const macaon::CRFPPTemplate &featureTemplate) { + for(std::vector<macaon::TemplateItem>::const_iterator i = featureTemplate.items.begin(); i != featureTemplate.items.end(); i++) { + output << (*i); + } + output << featureTemplate.suffix; + return output; + }*/ + +} diff --git a/maca_crf_tagger/src/crf_utils.hh b/maca_crf_tagger/src/crf_utils.hh new file mode 100644 index 0000000000000000000000000000000000000000..8a33ab7b3839e88e02b4af96ea94f6c593195d34 --- /dev/null +++ b/maca_crf_tagger/src/crf_utils.hh @@ -0,0 +1,75 @@ +#pragma once + +#include <string> +#include <vector> + +#define int64 int + +namespace macaon { + class Symbols { + protected: + std::string name; + std::unordered_map<std::string, int> word2int; + std::unordered_map<int, std::string> int2word; + public: + Symbols(std::string _name) : name(_name) {} + int AddSymbol(const std::string& symbol, int value = -1) { + if(value == -1) value = word2int.size(); + word2int[symbol] = value; + int2word[value] = symbol; + return value; + } + int Find(const std::string& word) const { + std::unordered_map<std::string, int>::const_iterator found = word2int.find(word); + if(found != word2int.end()) return found->second; + return -1; + } + const std::string Find(const int64 id) const { + std::unordered_map<int, std::string>::const_iterator found = int2word.find(id); + if(found != int2word.end()) return found->second; + return ""; + } + int NumSymbols() const { + return word2int.size(); + } + friend class SymbolsIterator; + }; + class SymbolsIterator { + const Symbols& symbols; + std::unordered_map<std::string, int>::const_iterator iter; + public: + SymbolsIterator(const Symbols& _symbols) : symbols(_symbols) { + iter = symbols.word2int.begin(); + } + bool Done() { + return iter == symbols.word2int.end(); + } + void Next() { + iter++; + } + const std::string Symbol() { + return iter->first; + } + int Value() { + return iter->second; + } + }; + + // http://www.oopweb.com/CPP/Documents/CPPHOWTO/Volume/C++Programming-HOWTO-7.html + static void Tokenize(const std::string& str, std::vector<std::string>& tokens, const std::string& delimiters = " ", bool strict = 
false) + { + std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); + std::string::size_type pos = str.find_first_of(delimiters, lastPos); + tokens.clear(); + while (std::string::npos != pos || std::string::npos != lastPos) + { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + if(strict) { + if(pos == std::string::npos) break; + lastPos = pos + 1; + } else lastPos = str.find_first_not_of(delimiters, pos); + pos = str.find_first_of(delimiters, lastPos); + } + } + +} diff --git a/maca_crf_tagger/src/lemmatizer.cc b/maca_crf_tagger/src/lemmatizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..70c019f7de0a9774c531af10f8f680069f053a4d --- /dev/null +++ b/maca_crf_tagger/src/lemmatizer.cc @@ -0,0 +1,19 @@ +#include "lemmatizer.h" + +int main(int argc, char** argv) { + if(argc != 2) { + std::cerr << "usage: " << argv[0] << " <fplm-dictionary>\n"; + return 1; + } + macaon::Lemmatizer lemmatizer(argv[1]); + std::string line; + while(std::getline(std::cin, line)) { + std::vector<std::string> tokens; + macaon::Tokenize(line, tokens, " "); + for(size_t i = 0; i < tokens.size(); i++) { + if(i > 0) std::cout << " "; + std::cout << lemmatizer.lemmatize(tokens[i]); + } + std::cout << "\n"; + } +} diff --git a/maca_crf_tagger/src/lemmatizer.h b/maca_crf_tagger/src/lemmatizer.h new file mode 100644 index 0000000000000000000000000000000000000000..5966c7e666037cfe677c1660c803fcb3ebb24911 --- /dev/null +++ b/maca_crf_tagger/src/lemmatizer.h @@ -0,0 +1,51 @@ +#pragma once + +#include <string> +#include <unordered_map> +#include <vector> +#include <fstream> +#include <iostream> + +#include "crf_utils.hh" + +namespace macaon { + class Lemmatizer { + std::unordered_map<std::string, std::string> dictionary; + public: + Lemmatizer(const std::string& filename) { + std::ifstream input(filename); + if(input) { + std::string line; + int line_num = 1; + while(std::getline(input, line)) { + std::vector<std::string> tokens; + macaon::Tokenize(line, tokens, "\t", true); + if(tokens.size() != 4) { + std::cerr << "ERROR: unexpected input in " << filename << ", line " << line_num << ": \"" << line << "\"\n"; + break; + } + std::string word = tokens[0]; + std::string tag = tokens[1]; + std::string lemma = tokens[2]; + std::string morpho = tokens[3]; + dictionary[word + "/" + tag] = lemma; + line_num ++; + } + } else { + std::cerr << "ERROR: loading " << filename << "\n"; + } + } + std::string lemmatize(const std::string& word, const std::string& tag) const { + std::string key = word + "/" + tag; + return lemmatize(key); + } + std::string lemmatize(const std::string& word_tag) const { + std::unordered_map<std::string, std::string>::const_iterator found = dictionary.find(word_tag); + if(found != dictionary.end()) { + return found->second; + } + return word_tag.substr(0, word_tag.rfind('/')); + } + }; +} + diff --git a/maca_crf_tagger/src/maca_crf_convert_binlexicon.cc b/maca_crf_tagger/src/maca_crf_convert_binlexicon.cc new file mode 100644 index 0000000000000000000000000000000000000000..4704208f252dd40a627d9103c63018c07ae17153 --- /dev/null +++ b/maca_crf_tagger/src/maca_crf_convert_binlexicon.cc @@ -0,0 +1,46 @@ +#include "crf_decoder.hh" +#include "crf_binlexicon.hh" + +int main(int argc, char** argv) { + if(argc != 4 && argc != 3) { + std::cerr << "convert: " << argv[0] << " <crf-model> <lexicon.in> <lexicon.out>\n"; + std::cerr << "test: cat <text-lexicon> | " << argv[0] << " <crf-model> <bin-lexicon>\n"; + return 1; + } + if(argc == 4) { + macaon::Decoder decoder(argv[1]); + 
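// conversion mode: read the text lexicon with the model's tagset and write it back out in binary form
+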
macaon::BinaryLexicon lexicon(argv[2], decoder.getTagset()); + lexicon.Write(argv[3]); + } else if(argc == 3) { + macaon::Decoder decoder(argv[1]); + macaon::BinaryLexicon lexicon(argv[2], decoder.getTagset()); + std::string line; + int line_num = 0; + while(std::getline(std::cin, line)) { + line_num ++; + std::vector<int64> tags; + std::vector<std::string> tokens; + macaon::Tokenize(line, tokens, "\t "); + if(lexicon.GetTagsForWord(tokens[0], tags) == false) { + std::cerr << "WARNING: word not found \"" << tokens[0] << "\", using all tags\n"; + } + if(tags.size() != tokens.size() - 1) { + std::cerr << "ERROR: wrong number of tags for entry " << line_num << "\n"; + std::cerr << " TXT: " << line << "\n"; + std::cerr << " BIN: " << tokens[0]; + for(size_t i = 0; i < tags.size(); i++) { + std::cerr << " " << decoder.getTagset()->Find(tags[i]); + } + std::cerr << "\n"; + } else { + for(size_t i = 0; i < tags.size(); i++) { + if(decoder.getTagset()->Find(tags[i]) != tokens[i + 1]) { + std::cerr << "ERROR: wrong tag \"" << tokens[i + 1] << "\" => \"" << decoder.getTagset()->Find(tags[i]) << "\", entry " << line_num << "\n"; + } + } + } + } + } + return 0; +} + diff --git a/maca_crf_tagger/src/maca_crf_convert_binmodel.cc b/maca_crf_tagger/src/maca_crf_convert_binmodel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c6cc55a71ae7584863200b6f33ed06518f50809 --- /dev/null +++ b/maca_crf_tagger/src/maca_crf_convert_binmodel.cc @@ -0,0 +1,23 @@ +#include "crf_binmodel.hh" + +int main(int argc, char** argv) { + if(argc != 3 && argc != 2) { + std::cerr << "usage: " << argv[0] << " <from> <to> or <binmodel>\n"; + return 1; + } + macaon::BinaryModel model; + if(argc == 3) { + model.Convert(argv[1], argv[2]); + } else { + model.Load(argv[1]); + std::vector<double> weights; + model.GetWeights("U18=a/jamais", weights); + for(size_t i = 0; i < weights.size(); i++) { + std::cout << weights[i] << " "; + } + std::cout << "\n"; + //model.Dump(); + } + return 0; +} + diff --git a/maca_crf_tagger/src/maca_crf_tagger_main.cc b/maca_crf_tagger/src/maca_crf_tagger_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..0371581044ba350f38c6e3fcd20bfb1090b55b38 --- /dev/null +++ b/maca_crf_tagger/src/maca_crf_tagger_main.cc @@ -0,0 +1,259 @@ +/*************************************************************************** + Copyright (C) 2011 by xxx <xxx@lif.univ-mrs.fr> + This file is part of maca_crf_tagger. + + Maca_crf_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Maca_crf_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_crf_tagger. If not, see <http://www.gnu.org/licenses/>. 
+**************************************************************************/ + +#include "maca_crf_tagger.hh" +#include "crf_decoder.hh" +#include "crf_features.hh" +#include "crf_lexicon.hh" +#include "crf_tclexdet.hh" + +void crf_tagger(fst::StdVectorFst &input, maca_crf_tagger_ctx *ctx, bool debug=false) +{ + if(debug) input.Write("debug.crf_tagger.input"); + gfsmStateId start; + maca_ht_structure * ht = ctx->ms->xml_nodes_ht; + xmlNodePtr seg; + char *tokens = NULL; + std::vector<std::vector<std::string> >features; + std::vector<int>ilabels; + fst::StdVectorFst output; + if(ctx->model_filename == NULL) { + std::cerr << "ERROR: crf_tagger model file not specified, exiting\n"; + exit(1); // ERROR + } + if(ctx->lexicon_filename == NULL) { + std::cerr << "ERROR: crf_tagger lexicon file not specified, exiting\n"; + exit(1); // ERROR + } + fst::SymbolTable inputSymbols("words"); + inputSymbols.AddSymbol("<eps>", 0); + + // extract features + for(start=0; start < input.NumStates(); start++){ + for(fst::MutableArcIterator<fst::StdVectorFst> aiter(&input, start); !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + seg = (xmlNodePtr)maca_ht_index2adr(ht, arc.ilabel); + tokens = maca_sentence_get_segment_tokens_value(ctx->ms, seg); + ilabels.push_back(arc.ilabel); + inputSymbols.AddSymbol(tokens, ilabels.size()); + aiter.SetValue(fst::StdArc(ilabels.size(), arc.olabel, arc.weight, arc.nextstate)); + std::vector<std::string>word_features; + macaon::FeatureGenerator::get_pos_features(tokens, word_features); + features.push_back(word_features); + free(tokens); + } + } + if(debug) input.Write("debug.crf_tagger.features"); + + int64 isString = input.Properties(fst::kString, true); + if(isString & fst::kString && ctx->n == 1) { + if(ctx->verbose_flag > 0) std::cerr << "INFO: using linear tagger\n"; + // faster pipeline for linear automata + std::vector<std::string> tags; + ctx->decoder->decodeString(features, tags, ctx->lexicon); + output.AddState(); + output.SetStart(0); + for(int64 state = 0; state < input.NumStates() - 1; state++){ + const fst::StdArc &arc = fst::ArcIterator<fst::StdVectorFst>(input, state).Value(); + output.AddState(); + output.AddArc(state, fst::StdArc(ilabels[arc.ilabel-1], ctx->tag_mapping[ctx->decoder->getTagset()->Find(tags[state])], arc.weight, state + 1)); + } + output.SetFinal(output.NumStates() - 1, 0); + input = output; + + } else { + // add possible tag labels + input.SetInputSymbols(&inputSymbols); + ctx->lexicon->AddTags(input); + if(debug) input.Write("debug.crf_tagger.tags"); + + // rescore with CRF + ctx->decoder->decode(features, input, output, true); + if(debug) output.Write("debug.crf_tagger.decoded"); + + // convert to macaon + fst::RmEpsilon(&output); + + input = output; + for(start=0; start < input.NumStates(); start++){ + for(fst::MutableArcIterator<fst::StdVectorFst> aiter(&input, start); !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + aiter.SetValue(fst::StdArc(ilabels[arc.ilabel-1], ctx->tag_mapping[arc.olabel], arc.weight, arc.nextstate)); + } + } + } + if(debug) input.Write("debug.crf_tagger.output"); +} + +void traverse_segments(maca_section *section, maca_crf_tagger_ctx *ctx) +{ + xmlNodePtr segs = section->xml_node_segs; + xmlNodePtr seg; + xmlChar *ulex_id; + maca_ht_structure * ht = ctx->ms->xml_nodes_ht; + int n; + int index; + char *tokens = NULL; + + + for(seg=segs->children, n=0; seg ; seg=seg->next, n++) + { + ulex_id = xmlGetProp(seg, BAD_CAST "id"); + index = maca_ht_adr2index(ht, seg); + 
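// debugging helper: print each segment's hash-table index, position, id and token string
+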
tokens = maca_sentence_get_segment_tokens_value(ctx->ms, seg); + fprintf(stderr, "index = %d n = %d id = %s tokens = %s\n", index, n, ulex_id, tokens); + } +} + +void traverse_automaton(gfsmAutomaton *a, maca_crf_tagger_ctx *ctx) +{ + gfsmStateId i; + gfsmArcIter ai; + gfsmArc *t; + xmlNodePtr n; + maca_ht_xmlnode *ht = ctx->ms->xml_nodes_ht; + xmlChar *ulex_id; + xmlChar *ulex_lex_id; + + for(i=0; i<gfsm_automaton_n_states(a); i++){ + for (gfsm_arciter_open_ptr(&ai,a,gfsm_automaton_find_state(a, i)); gfsm_arciter_ok(&ai); gfsm_arciter_next(&ai)){ + t = gfsm_arciter_arc(&ai); + n = (xmlNodePtr)maca_ht_index2adr(ht, gfsm_arc_lower(t)); + ulex_id = xmlGetProp(n, BAD_CAST "id"); + ulex_lex_id = xmlGetProp(n, BAD_CAST "lex_id"); + fprintf(stderr,"index = %d ulex id = %s lex_id = %s\n",gfsm_arc_lower(t), ulex_id, ulex_lex_id); + } + } + /* creation de segment et d'une section */ +} + + +maca_section *create_morpho_section(gfsmAutomaton *a, maca_crf_tagger_ctx *ctx) +{ + gfsmStateId i; + gfsmArc *t; + gfsmArcIter ai; + maca_ht_structure * ht = ctx->ms->xml_nodes_ht; + char id_pos[500]; + xmlNodePtr posNode = NULL; + xmlNodePtr lexNode = NULL; + maca_section * section = NULL; + GHashTable* segments_created = g_hash_table_new_full(g_str_hash, g_str_equal,free, NULL); + char * prefix_id; + char * temp; + + //section = maca_section_create_section(MACA_POSS_SECTION); + //maca_sentence_add_section(ctx->ms, section); + section = maca_sentence_new_section(ctx->ms,MACA_MORPHO_SECTION); + prefix_id = (char*)malloc(sizeof(char)*(strlen(ctx->ms->id_sentence) +3)); + sprintf(prefix_id,"%s_M",ctx->ms->id_sentence); + + for(i=0; i<gfsm_automaton_n_states(a); i++){ + for (gfsm_arciter_open(&ai,a,i); gfsm_arciter_ok(&ai); gfsm_arciter_next(&ai)){ + t = gfsm_arciter_arc(&ai); + if(gfsm_arc_lower(t) != gfsmEpsilon){ + lexNode = (xmlNodePtr)maca_ht_index2adr(ht,gfsm_arc_lower(t)); + if(lexNode){ + temp = (char*)xmlGetProp(lexNode, BAD_CAST "id"); + sprintf(id_pos, "%s_%s",temp, maca_tags_get_str(ctx->cfg, "morpho", "stype", gfsm_arc_upper(t))); + free(temp); + // printf("key = %s\n", id_pos); + if(posNode = (xmlNodePtr)g_hash_table_lookup(segments_created, id_pos)){ + t->lower = maca_ht_adr2index(ht,posNode); + // printf("segment %s already created\n", id_pos); + } + else{ + // printf("add segment %s %s (%s)\n", xmlGetProp(lexNode, BAD_CAST "id"), maca_tags_get_str(ctx->cfg, "morpho", "stype", gfsm_arc_upper(t)), id_pos); + posNode = maca_sentence_add_segment(ctx->ms, MACA_MORPHO_SECTION, MACA_CAT_TYPE, prefix_id); + t->lower = maca_ht_adr2index(ht,posNode); + xmlNewProp(posNode, BAD_CAST "stype", BAD_CAST maca_tags_get_str(ctx->cfg, "morpho", "stype", gfsm_arc_upper(t))); + maca_segment_add_elt_from_node(posNode, lexNode, 0); + g_hash_table_insert(segments_created, strdup(id_pos), posNode); + } + } + } + } + } + free(prefix_id); + // maca_section_add_automaton(section, a); + maca_sentence_update_xml_automaton(ctx->ms, MACA_MORPHO_SECTION,a); + // section->xml_node_fsm = xmlAddChild(section->xml_node, fsm2xml(a, ht)); + + g_hash_table_destroy(segments_created); + // fsm_affiche(a, ht); + //maca_section_update_xml_automaton(section, ht, a); + return section; +} + + + +int maca_crf_tagger_ProcessSentence(maca_sentence * ms, maca_crf_tagger_ctx * ctx) +{ + maca_section * prelex_section; + maca_section * lex_section; + gfsmAutomaton *lex_automaton; + fst::StdVectorFst automaton; + + ctx->ms = ms; + + if(!maca_sentence_is_section_loaded(ctx->ms,MACA_PRELEX_SECTION)) + { + prelex_section = 
maca_sentence_load_section_by_type(ctx->ms,MACA_PRELEX_SECTION); + } + else prelex_section = maca_sentence_get_section(ctx->ms, MACA_PRELEX_SECTION); + if(prelex_section == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"sentence : %s no prelex section\n", ctx->ms->id_sentence); + return -1; + } + + if(!maca_sentence_is_section_loaded(ctx->ms,MACA_LEX_SECTION)) + { + lex_section = maca_sentence_load_section_by_type(ctx->ms,MACA_LEX_SECTION); + } + else lex_section = maca_sentence_get_section(ctx->ms, MACA_LEX_SECTION); + if(lex_section == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"sentence : %s no lex section\n", ctx->ms->id_sentence); + return -1; + } + lex_automaton = maca_sentence_get_section_automaton(ctx->ms, MACA_LEX_SECTION); + if(lex_automaton == NULL){ + maca_msg(ctx->module, MACA_ERROR); + fprintf(stderr,"sentence : %s no lex automaton\n", ctx->ms->id_sentence); + return -1; + } + gfsm2fst(lex_automaton, automaton); + crf_tagger(automaton, ctx, ctx->verbose_flag > 4); + if(ctx->n > 0){ + fst::StdVectorFst nbest; + fst::ShortestPath(automaton, &nbest, ctx->n); + create_morpho_section(fst2gfsm(nbest), ctx); + } else if(ctx->n == -1) { + create_morpho_section(fst2gfsm(automaton), ctx); + } else if(ctx->n == -2) { + macaon::DeterminizeTCLex(&automaton); + create_morpho_section(fst2gfsm(automaton), ctx); + } else { + fprintf(stderr, "error: unknown -n value (%d)\n", ctx->n); + return -1; + } + return 1; +} + + diff --git a/maca_crf_tagger/src/maca_crf_tagger_utils.cc b/maca_crf_tagger/src/maca_crf_tagger_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..93ed95d3c540cb7fa977dc8ae02362a97579c44b --- /dev/null +++ b/maca_crf_tagger/src/maca_crf_tagger_utils.cc @@ -0,0 +1,31 @@ +/*************************************************************************** + Copyright (C) 2011 by xxx <xxx@lif.univ-mrs.fr> + This file is part of maca_crf_tagger. + + Maca_crf_tagger is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Maca_crf_tagger is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with maca_crf_tagger. If not, see <http://www.gnu.org/licenses/>. +**************************************************************************/ + +#include "maca_crf_tagger.hh" + +char * maca_crf_tagger_GetVersion() +{ + return MACA_CRF_TAGGER_VERSION; +} + +void maca_crf_tagger_add_stamp(xmlNodePtr node) +{ + add_maca_stamp(node,MACA_CRF_TAGGER_NAME,MACA_CRF_TAGGER_VERSION); +} + + diff --git a/maca_crf_tagger/src/simple_tagger.cc b/maca_crf_tagger/src/simple_tagger.cc new file mode 100644 index 0000000000000000000000000000000000000000..14468294e06b7a5d90206334cb5b0d9db906f311 --- /dev/null +++ b/maca_crf_tagger/src/simple_tagger.cc @@ -0,0 +1,21 @@ +#include "simple_tagger.hh" + +macaon::Tagger* Tagger_new(const char* modelName, const char* lexiconName) { + return new macaon::Tagger(modelName, lexiconName ? 
lexiconName : ""); +} + +void Tagger_free(macaon::Tagger* tagger) { + delete tagger; +} + +bool Tagger_ProcessSentence(macaon::Tagger* tagger, int num_words, const char** words, const char** tags) { + std::vector<std::string> word_vector, tag_vector; + for(int i = 0; i < num_words; i++) { + word_vector.push_back(words[i]); + } + bool result = tagger->ProcessSentence(word_vector, tag_vector); + for(int i = 0; i < num_words; i++) { + tags[i] = strdup(tag_vector[i].c_str()); + } + return result; +} diff --git a/maca_crf_tagger/src/simple_tagger.hh b/maca_crf_tagger/src/simple_tagger.hh new file mode 100644 index 0000000000000000000000000000000000000000..21853382ec8c2f76a35cef8747ad1083bc3c40d9 --- /dev/null +++ b/maca_crf_tagger/src/simple_tagger.hh @@ -0,0 +1,40 @@ +#include <vector> +#include "crf_decoder.hh" +#include "crf_binlexicon.hh" +#include "crf_features.hh" + +namespace macaon { + class Tagger { + private: + macaon::Decoder decoder; + macaon::BinaryLexicon *lexicon; + public: + Tagger(const std::string modelName, const std::string lexiconName = "") : decoder(modelName), lexicon(NULL) { + if(lexiconName != "") lexicon = new macaon::BinaryLexicon(lexiconName, decoder.getTagset()); + } + + ~Tagger() { + if(lexicon != NULL) delete lexicon; + } + + bool ProcessSentence(const std::vector<std::string>& words, std::vector<std::string>& tags) { + std::vector<std::vector<std::string> > features; + for(size_t i = 0; i < words.size(); i++) { + std::vector<std::string> word_features; + macaon::FeatureGenerator::get_pos_features(words[i], word_features); + features.push_back(word_features); + } + tags.clear(); + decoder.decodeString(features, tags, lexicon); + return true; + } + }; +} + +extern "C" { + macaon::Tagger* Tagger_new(const char* modelName, const char* lexiconName); + + void Tagger_free(macaon::Tagger* tagger); + + bool Tagger_ProcessSentence(macaon::Tagger* tagger, int num_words, const char** words, const char** tags); +} diff --git a/maca_crf_tagger/src/test_simple_tagger.cc b/maca_crf_tagger/src/test_simple_tagger.cc new file mode 100644 index 0000000000000000000000000000000000000000..ac3f021df4cd6479e649c23fb9549604ce1a7b46 --- /dev/null +++ b/maca_crf_tagger/src/test_simple_tagger.cc @@ -0,0 +1,27 @@ +#include <stdio.h> + +#include "simple_tagger.hh" + +int main(int argc, char** argv) { + int num_words = 6; + const char* words[] = {"le", "petit", "chat", "boit", "du", "lait"}; + const char* tags[6]; + int i; + + if(argc != 3) { + fprintf(stderr, "usage: %s <tagger-model> <tagger-lexicon>\n", argv[0]); + return 1; + } + + macaon::Tagger* tagger = Tagger_new(argv[1], argv[2]); + + Tagger_ProcessSentence(tagger, num_words, words, tags); + + for(i = 0; i < num_words; i++) { + printf("%s %s\n", words[i], tags[i]); + } + + Tagger_free(tagger); + + return 0; +} diff --git a/maca_crf_tagger/src/utf8 b/maca_crf_tagger/src/utf8 new file mode 100755 index 0000000000000000000000000000000000000000..4dee4508ec27a0923a86e9384cf41e7758a38d30 Binary files /dev/null and b/maca_crf_tagger/src/utf8 differ diff --git a/maca_crf_tagger/src/utf8.c b/maca_crf_tagger/src/utf8.c new file mode 100644 index 0000000000000000000000000000000000000000..4e1ea27f17001a8cad27e2bb61f93d28a856e358 --- /dev/null +++ b/maca_crf_tagger/src/utf8.c @@ -0,0 +1,33 @@ +#include <stdio.h> +#include <stdlib.h> + +const char *byte_to_binary(int x) +{ + static char b[9]; + b[0] = '\0'; + + int z; + for (z = 128; z > 0; z >>= 1) + { + strcat(b, ((x & z) == z) ? 
"1" : "0"); + } + + return b; +} + +int main() { + char word[1024]; + fgets(word, 1024, stdin); + printf("%s\n", word); + int offset = 0; + while(word[offset] != 0) { + printf("%3d %s [%s]\n", offset, byte_to_binary(word[offset]), &word[offset]); + if((unsigned char)word[offset] >> 7 == 1) { + offset++; + while((unsigned char)word[offset] >> 6 == 2) offset++; + } else { + offset++; + } + } + return 0; +}