Commit 20f474f0 authored by Benoit Favre's avatar Benoit Favre
Browse files

add python implementation following old API

parent 08a0e46d
#!/usr/bin/python
# -*- coding: UTF-8 -*-
''' Macaon implementation compatible with the old ctypes bindings.
Make sure you have compiled macaon with "cmake -DMACA_EXPORT=true"
First you need to call setup(model_dir, libdir).
- model_dir is the maca_data2 directory where the model was compiled (it looks for model files in <model_dir>/<lang>/bin)
- libdir is a directory containing tagger/, lemmatizer/ and parser/ subdirectories containing each one of the .so files compiled
Then, you can instantiate one of the tools or the whole chain as:
with Tool(lang, mcd) as tool:
result = tool.process(sentence)
- lang is the language (or model directory)
- mcd is the feature definition file (in .mcd format)
The "with" statement takes care of house cleaning.
The tagger's process() method takes as input a list of words. It outputs a list of tuples (word, tag).
The lemmatizer's process() method inputs a list of tuples (word, tag) and outputs a list of (word, tag, lemma) tuples.
The parser's process() method inputs a list of (word, tag, lemma) tuples, and outputs (id, word, tag, lemma, gov, label) tuples.
Governors are absolute ids instead of relative as in macaon2; A governor of 0 indicates the root of a sentence.
While macaon2 can process multiple sentences, the parser assumes one sentence per input.
Example usage:
import macaon
macaon.setup('../../models-dir', '../../build/maca_export')
sentence = ['le', 'chat', 'boit', 'du', 'lait']
with macaon.Tagger('fr', 'wlmp.mcd') as tagger:
result = tagger.process(sentence)
print(result)
'''
from __future__ import print_function
import os
import sys
was_setup = False
currentdir = os.path.dirname(os.path.abspath(__file__))
default_language = 'fr'
default_mcd = currentdir + '/../../../maca_data2/fr/eval/wplgfs.mcd'
def setup(model_dir=currentdir + "/../../models-fr", lib_dir=currentdir + "/../../build/maca_export"):
    """Point macaon at its models and make the compiled bindings importable.

    model_dir -- directory holding the compiled models (exported as MACAON_DIR)
    lib_dir   -- directory whose tagger/, lemmatizer/ and parser/ subdirectories
                 each contain one of the compiled .so binding modules
    Must be called once before instantiating any tool.
    """
    global was_setup
    os.environ["MACAON_DIR"] = model_dir
    for component in ('tagger', 'lemmatizer', 'parser'):
        sys.path.append(os.path.join(lib_dir, component))
    was_setup = True
class Tagger:
    """Part-of-speech tagger following the old ctypes-style API.

    process() consumes a list of words and returns [word, tag] rows.
    Use as a context manager so the underlying native tagger is released.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        global was_setup
        if not was_setup:
            raise Exception('macaon was not setup')
        # Imported lazily: only resolvable after setup() has extended sys.path.
        import MacaonTagger
        self.tagger = MacaonTagger.MacaonTransTagger(lang, mcd)

    def process(self, words):
        """Tag one sentence; returns a list of [word, tag] lists."""
        # The native API expects one token per line with a trailing newline.
        payload = '\n'.join(words + [''])
        raw = self.tagger.tagmcf(payload)
        rows = []
        for entry in raw.split('\n'):
            if entry:
                rows.append(entry.split('\t'))
        return rows

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Drop the native tagger so its resources can be reclaimed.
        del self.tagger
class Lemmatizer:
    """Lemmatizer following the old ctypes-style API.

    process() consumes (word, tag) tuples and returns [word, tag, lemma] rows.
    Use as a context manager so the underlying native lemmatizer is released.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        global was_setup
        if not was_setup:
            raise Exception('macaon was not setup')
        # Imported lazily: only resolvable after setup() has extended sys.path.
        import MacaonLemmatizer
        self.lemmatizer = MacaonLemmatizer.MacaonTransLemmatizer(lang, mcd)

    def process(self, tagged):
        """Lemmatize one tagged sentence; returns a list of [word, tag, lemma] lists."""
        # One tab-separated token per line, with a trailing newline.
        lines = ['\t'.join(tokens) for tokens in tagged]
        lines.append('')
        raw = self.lemmatizer.lemmatizemcf('\n'.join(lines))
        return [entry.split('\t') for entry in raw.split('\n') if entry != '']

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Drop the native lemmatizer so its resources can be reclaimed.
        del self.lemmatizer
class Parser:
    """Dependency parser following the old ctypes-style API.

    process() consumes (word, tag, lemma) tuples and returns a list of
    (id, word, tag, lemma, governor, label) tuples. Governors are absolute
    1-based token ids; 0 marks the sentence root. The parser assumes one
    sentence per call.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        global was_setup
        if not was_setup:
            raise Exception('macaon was not setup')
        # Imported lazily: only resolvable after setup() has extended sys.path.
        import MacaonParser
        self.parser = MacaonParser.MacaonTransParser(lang, mcd)

    def process(self, lemmatized):
        """Parse one lemmatized sentence.

        lemmatized -- list of (word, tag, lemma) tuples
        Returns a list of (id, word, tag, lemma, governor, label) tuples
        (materialized so Python 3 matches the documented tuple-list behavior
        of Python 2), or [] for an empty sentence.
        """
        result = self.parser.parsemcf('\n'.join(['\t'.join(tokens) for tokens in lemmatized] + ['']))
        rows = [line.split('\t') for line in result.split('\n') if line != '']
        if not rows:
            # Guard: zip(*[]) would raise on unpacking for empty parser output.
            return []
        # The parser emits 6 tab-separated columns per token; the last one
        # (sentence id) is dropped from the returned tuples.
        words, tags, lemmas, governors, labels, sentences = zip(*rows)
        # Convert relative governor offsets to absolute 1-based ids (0 = root).
        governors = [0 if j == '0' else i + 1 + int(j) for i, j in enumerate(governors)]
        return list(zip(range(1, len(words) + 1), words, tags, lemmas, governors, labels))

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Drop the native parser so its resources can be reclaimed.
        del self.parser
class Chain:
    """Convenience wrapper running tagger, lemmatizer and parser in sequence.

    process() takes a raw sentence and returns the parser's output tuples.
    Use as a context manager so all three underlying tools are released.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        self.tagger = Tagger(lang, mcd)
        self.lemmatizer = Lemmatizer(lang, mcd)
        self.parser = Parser(lang, mcd)

    def process(self, sentence, getter=lambda x: x):
        """Run the full pipeline on one sentence.

        sentence -- iterable of items; getter extracts the word form from each
                    (identity by default, for plain lists of strings)
        Returns the parser's (id, word, tag, lemma, governor, label) output.
        """
        tokens = [getter(item) for item in sentence]
        return self.parser.process(self.lemmatizer.process(self.tagger.process(tokens)))

    def __enter__(self):
        for tool in (self.tagger, self.lemmatizer, self.parser):
            tool.__enter__()
        return self

    def __exit__(self, type, value, traceback):
        # Release in reverse order of acquisition.
        for tool in (self.parser, self.lemmatizer, self.tagger):
            tool.__exit__(type, value, traceback)
if __name__ == '__main__':
    # Demo driver: read whitespace-tokenized sentences from stdin (one per
    # line) and print one token per line, with a blank line between sentences.
    # The redundant local "import sys" was removed: sys is already imported
    # at module level.
    setup()
    with Chain(lang='fr') as chain:
        for line in sys.stdin:
            parsed = chain.process(line.strip().split())
            for token_id, word, tag, lemma, governor, label in parsed:
                print(token_id, word, tag, lemma, governor, label)
            print()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment