From 20f474f0743f43137f8218ea1b4375d0b6a35074 Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@lif.univ-mrs.fr>
Date: Tue, 14 Nov 2017 21:44:45 +0100
Subject: [PATCH] add python implementation following old API

---
 maca_export/example/macaon.py | 155 ++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100755 maca_export/example/macaon.py

diff --git a/maca_export/example/macaon.py b/maca_export/example/macaon.py
new file mode 100755
index 0000000..f0e6b20
--- /dev/null
+++ b/maca_export/example/macaon.py
@@ -0,0 +1,155 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+
+''' Macaon implementation compatible with the old ctypes bindings.
+
+Make sure you have compiled macaon with "cmake -DMACA_EXPORT=true".
+
+First you need to call setup(model_dir, lib_dir).
+- model_dir is the maca_data2 directory where the model was compiled (it looks for model files in <model_dir>/<lang>/bin)
+- lib_dir is a directory containing tagger/, lemmatizer/ and parser/ subdirectories, each holding one of the compiled .so files
+
+Then, you can instantiate one of the tools, or the whole chain, as:
+
+    with Tool(lang, mcd) as tool:
+        result = tool.process(sentence)
+
+- lang is the language (or model directory)
+- mcd is the feature definition file (in .mcd format)
+The "with" statement takes care of cleanup.
+
+The tagger's process() method takes a list of words as input and outputs a list of (word, tag) tuples.
+The lemmatizer's process() method takes a list of (word, tag) tuples and outputs a list of (word, tag, lemma) tuples.
+The parser's process() method takes a list of (word, tag, lemma) tuples and outputs (id, word, tag, lemma, gov, label) tuples.
+Governors are absolute ids instead of relative ids as in macaon2; a governor of 0 indicates the root of the sentence.
+While macaon2 can process multiple sentences, the parser assumes one sentence per input.
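+
+For illustration, the data at each stage has roughly the following shape (a
+sketch; actual tags, lemmas and labels depend on the compiled models):
+
+    words:      ['le', 'chat']
+    tagged:     [('le', 'DET'), ('chat', 'NC')]
+    lemmatized: [('le', 'DET', 'le'), ('chat', 'NC', 'chat')]
+    parsed:     [(1, 'le', 'DET', 'le', 2, 'det'), (2, 'chat', 'NC', 'chat', 0, 'root')]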
+
+Example usage:
+
+    import macaon
+    macaon.setup('../../models-dir', '../../build/maca_export')
+
+    sentence = ['le', 'chat', 'boit', 'du', 'lait']
+    with macaon.Tagger('fr', 'wlmp.mcd') as tagger:
+        result = tagger.process(sentence)
+        print(result)
+
+'''
+
+from __future__ import print_function
+import os
+import sys
+
+was_setup = False
+currentdir = os.path.dirname(os.path.abspath(__file__))
+default_language = 'fr'
+default_mcd = currentdir + '/../../../maca_data2/fr/eval/wplgfs.mcd'
+
+def setup(model_dir=currentdir + "/../../models-fr", lib_dir=currentdir + "/../../build/maca_export"):
+    global was_setup
+    os.environ["MACAON_DIR"] = model_dir
+    for tool in ['tagger', 'lemmatizer', 'parser']:
+        sys.path.append(os.path.join(lib_dir, tool))
+
+    was_setup = True
+
+
+class Tagger:
+    def __init__(self, lang=default_language, mcd=default_mcd):
+        if not was_setup:
+            raise Exception('macaon was not setup')
+
+        import MacaonTagger
+        self.tagger = MacaonTagger.MacaonTransTagger(lang, mcd)
+
+    def process(self, words):
+        result = self.tagger.tagmcf('\n'.join(words + ['']))
+        return [line.split('\t') for line in result.split('\n') if line != '']
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        del self.tagger
+
+
+class Lemmatizer:
+    def __init__(self, lang=default_language, mcd=default_mcd):
+        if not was_setup:
+            raise Exception('macaon was not setup')
+
+        import MacaonLemmatizer
+        self.lemmatizer = MacaonLemmatizer.MacaonTransLemmatizer(lang, mcd)
+
+    def process(self, tagged):
+        result = self.lemmatizer.lemmatizemcf('\n'.join(['\t'.join(tokens) for tokens in tagged] + ['']))
+        return [line.split('\t') for line in result.split('\n') if line != '']
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        del self.lemmatizer
+
+
+class Parser:
+    def __init__(self, lang=default_language, mcd=default_mcd):
+        if not was_setup:
+            raise Exception('macaon was not setup')
+
+        import MacaonParser
+        self.parser = MacaonParser.MacaonTransParser(lang, mcd)
+
+    def process(self, lemmatized):
+        result = self.parser.parsemcf('\n'.join(['\t'.join(tokens) for tokens in lemmatized] + ['']))
+        words, tags, lemmas, governors, labels, sentences = zip(*[line.split('\t') for line in result.split('\n') if line != ''])
+        # convert relative governor offsets to absolute 1-based ids (0 marks the root)
+        governors = [0 if j == '0' else i + 1 + int(j) for i, j in enumerate(governors)]
+        return zip([x + 1 for x in range(len(words))], words, tags, lemmas, governors, labels)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        del self.parser
+
+
+class Chain:
+    def __init__(self, lang=default_language, mcd=default_mcd):
+        self.tagger = Tagger(lang, mcd)
+        self.lemmatizer = Lemmatizer(lang, mcd)
+        self.parser = Parser(lang, mcd)
+
+    def process(self, sentence, getter=lambda x: x):
+        words = [getter(x) for x in sentence]
+        tagged = self.tagger.process(words)
+        lemmatized = self.lemmatizer.process(tagged)
+        parsed = self.parser.process(lemmatized)
+        return parsed
+
+    def __enter__(self):
+        self.tagger.__enter__()
+        self.lemmatizer.__enter__()
+        self.parser.__enter__()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.parser.__exit__(type, value, traceback)
+        self.lemmatizer.__exit__(type, value, traceback)
+        self.tagger.__exit__(type, value, traceback)
+
+
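+# Illustration of Chain.process()'s getter argument (a sketch, not used by the
+# demo below): it extracts the word from richer input items, for instance rows
+# where the surface form sits in column 1:
+#
+#     rows = [['1', 'le'], ['2', 'chat'], ['3', 'boit']]
+#     with Chain('fr') as chain:
+#         parsed = chain.process(rows, getter=lambda row: row[1])
+
+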
+if __name__ == '__main__':
+    setup()
+
+    with Chain(lang='fr') as chain:
+        for line in sys.stdin:
+            parsed = chain.process(line.strip().split())
+            for i, word, tag, lemma, governor, label in parsed:
+                print(i, word, tag, lemma, governor, label)
+            print()
--
GitLab
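
A note on governors: Parser.process() converts the relative offsets produced by
the underlying parser into the absolute 1-based ids described in the docstring.
A standalone sketch of that conversion (input values made up for illustration):

    relative = ['1', '0', '-1']   # offset from each word's own id; '0' marks the root
    absolute = [0 if j == '0' else i + 1 + int(j) for i, j in enumerate(relative)]
    assert absolute == [2, 0, 2]  # word 1 -> word 2, word 2 -> root, word 3 -> word 2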