Commit 20f474f0 authored by Benoit Favre's avatar Benoit Favre
Browse files

add python implementation following old API

parent 08a0e46d
#!/usr/bin/python
# -*- coding: UTF-8 -*-
''' Macaon implementation compatible with the old ctypes bindings.
Make sure you have compiled macaon with "cmake -DMACA_EXPORT=true"
First you need to call setup(model_dir, libdir).
- model_dir is the maca_data2 directory where the model was compiled (it looks for model files in <model_dir>/<lang>/bin)
- libdir is a directory containing tagger/, lemmatizer/ and parser/ subdirectories containing each one of the .so files compiled
Then, you can instantiate one of the tools or the whole chain as:
with Tool(lang, mcd) as tool:
result = tool.process(sentence)
- lang is the language (or model directory)
- mcd is the feature definition file (in .mcd format)
The "with" statement takes care of house cleaning.
The tagger's process() method takes as input a list of words. It outputs a list of tuples (word, tag).
The lemmatizer's process() method inputs a list of tuples (word, tag) and outputs a list of (word, tag, lemma) tuples.
The parser's process() method inputs a list of (word, tag, lemma) tuples, and outputs (id, word, tag, lemma, gov, label) tuples.
Governors are absolute ids instead of relative as in macaon2; A governor of 0 indicates the root of a sentence.
While macaon2 can process multiple sentences, the parser assumes one sentence per input.
Example usage:
import macaon
macaon.setup('../../models-dir', '../../build/maca_export')
sentence = ['le', 'chat', 'boit', 'du', 'lait']
with macaon.Tagger('fr', 'wlmp.mcd') as tagger:
result = tagger.process(sentence)
print(result)
'''
from __future__ import print_function
import os
import sys
was_setup = False
currentdir = os.path.dirname(os.path.abspath(__file__))
default_language = 'fr'
default_mcd = currentdir + '/../../../maca_data2/fr/eval/wplgfs.mcd'
def setup(model_dir=currentdir + "/../../models-fr", lib_dir=currentdir + "/../../build/maca_export"):
    """Point macaon at its models and make the compiled bindings importable.

    model_dir -- directory holding the compiled models (exported as MACAON_DIR)
    lib_dir   -- directory whose tagger/, lemmatizer/ and parser/ subdirectories
                 each contain one of the compiled .so binding modules
    Must be called once before instantiating any tool.
    """
    global was_setup
    os.environ["MACAON_DIR"] = model_dir
    for component in ('tagger', 'lemmatizer', 'parser'):
        sys.path.append(os.path.join(lib_dir, component))
    was_setup = True
class Tagger:
    """Part-of-speech tagger following the old ctypes-style API.

    process() consumes a list of words and returns [word, tag] rows.
    Use as a context manager so the underlying native tagger is released.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        global was_setup
        if not was_setup:
            raise Exception('macaon was not setup')
        # Imported lazily: only resolvable after setup() has extended sys.path.
        import MacaonTagger
        self.tagger = MacaonTagger.MacaonTransTagger(lang, mcd)

    def process(self, words):
        """Tag one sentence; returns a list of [word, tag] lists."""
        # The native API expects one token per line with a trailing newline.
        payload = '\n'.join(words + [''])
        raw = self.tagger.tagmcf(payload)
        rows = []
        for entry in raw.split('\n'):
            if entry:
                rows.append(entry.split('\t'))
        return rows

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Drop the native tagger so its resources can be reclaimed.
        del self.tagger
class Lemmatizer:
    """Lemmatizer following the old ctypes-style API.

    process() consumes (word, tag) tuples and returns [word, tag, lemma] rows.
    Use as a context manager so the underlying native lemmatizer is released.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        global was_setup
        if not was_setup:
            raise Exception('macaon was not setup')
        # Imported lazily: only resolvable after setup() has extended sys.path.
        import MacaonLemmatizer
        self.lemmatizer = MacaonLemmatizer.MacaonTransLemmatizer(lang, mcd)

    def process(self, tagged):
        """Lemmatize one tagged sentence; returns a list of [word, tag, lemma] lists."""
        # One tab-separated token per line, with a trailing newline.
        lines = ['\t'.join(tokens) for tokens in tagged]
        lines.append('')
        raw = self.lemmatizer.lemmatizemcf('\n'.join(lines))
        return [entry.split('\t') for entry in raw.split('\n') if entry != '']

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Drop the native lemmatizer so its resources can be reclaimed.
        del self.lemmatizer
class Parser:
    """Dependency parser following the old ctypes-style API.

    process() consumes (word, tag, lemma) tuples and returns a list of
    (id, word, tag, lemma, governor, label) tuples. Governors are absolute
    1-based token ids; 0 marks the sentence root. The parser assumes one
    sentence per call.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        global was_setup
        if not was_setup:
            raise Exception('macaon was not setup')
        # Imported lazily: only resolvable after setup() has extended sys.path.
        import MacaonParser
        self.parser = MacaonParser.MacaonTransParser(lang, mcd)

    def process(self, lemmatized):
        """Parse one lemmatized sentence.

        lemmatized -- list of (word, tag, lemma) tuples
        Returns a list of (id, word, tag, lemma, governor, label) tuples
        (materialized so Python 3 matches the documented tuple-list behavior
        of Python 2), or [] for an empty sentence.
        """
        result = self.parser.parsemcf('\n'.join(['\t'.join(tokens) for tokens in lemmatized] + ['']))
        rows = [line.split('\t') for line in result.split('\n') if line != '']
        if not rows:
            # Guard: zip(*[]) would raise on unpacking for empty parser output.
            return []
        # The parser emits 6 tab-separated columns per token; the last one
        # (sentence id) is dropped from the returned tuples.
        words, tags, lemmas, governors, labels, sentences = zip(*rows)
        # Convert relative governor offsets to absolute 1-based ids (0 = root).
        governors = [0 if j == '0' else i + 1 + int(j) for i, j in enumerate(governors)]
        return list(zip(range(1, len(words) + 1), words, tags, lemmas, governors, labels))

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Drop the native parser so its resources can be reclaimed.
        del self.parser
class Chain:
    """Convenience wrapper running tagger, lemmatizer and parser in sequence.

    process() takes a raw sentence and returns the parser's output tuples.
    Use as a context manager so all three underlying tools are released.
    """

    def __init__(self, lang=default_language, mcd=default_mcd):
        self.tagger = Tagger(lang, mcd)
        self.lemmatizer = Lemmatizer(lang, mcd)
        self.parser = Parser(lang, mcd)

    def process(self, sentence, getter=lambda x: x):
        """Run the full pipeline on one sentence.

        sentence -- iterable of items; getter extracts the word form from each
                    (identity by default, for plain lists of strings)
        Returns the parser's (id, word, tag, lemma, governor, label) output.
        """
        tokens = [getter(item) for item in sentence]
        return self.parser.process(self.lemmatizer.process(self.tagger.process(tokens)))

    def __enter__(self):
        for tool in (self.tagger, self.lemmatizer, self.parser):
            tool.__enter__()
        return self

    def __exit__(self, type, value, traceback):
        # Release in reverse order of acquisition.
        for tool in (self.parser, self.lemmatizer, self.tagger):
            tool.__exit__(type, value, traceback)
if __name__ == '__main__':
    # Demo driver: read whitespace-tokenized sentences from stdin (one per
    # line) and print one token per line, with a blank line between sentences.
    # The redundant local "import sys" was removed: sys is already imported
    # at module level.
    setup()
    with Chain(lang='fr') as chain:
        for line in sys.stdin:
            parsed = chain.process(line.strip().split())
            for token_id, word, tag, lemma, governor, label in parsed:
                print(token_id, word, tag, lemma, governor, label)
            print()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment