Skip to content
Snippets Groups Projects
Commit 20f474f0 authored by Benoit Favre's avatar Benoit Favre
Browse files

add python implementation following old API

parent 08a0e46d
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python
# -*- coding: UTF-8 -*-
''' Macaon implementation compatible with the old ctypes bindings.
Make sure you have compiled macaon with "cmake -DMACA_EXPORT=true"
First you need to call setup(model_dir, libdir).
- model_dir is the maca_data2 directory where the model was compiled (it looks for model files in <model_dir>/<lang>/bin)
- libdir is a directory containing tagger/, lemmatizer/ and parser/ subdirectories containing each one of the .so files compiled
Then, you can instantiate one of the tools or the whole chain as:
with Tool(lang, mcd) as tool:
result = tool.process(sentence)
- lang is the language (or model directory)
- mcd is the feature definition file (in .mcd format)
The "with" statement takes care of house cleaning.
The tagger's process() method takes as input a list of words. It outputs a list of tuples (word, tag).
The lemmatizer's process() method inputs a list of tuples (word, tag) and outputs a list of (word, tag, lemma) tuples.
The parser's process() method inputs a list of (word, tag, lemma) tuples, and outputs (id, word, tag, lemma, gov, label) tuples.
Governors are absolute ids instead of relative as in macaon2; A governor of 0 indicates the root of a sentence.
While macaon2 can process multiple sentences, the parser assumes one sentence per input.
Example usage:
import macaon
macaon.setup('../../models-dir', '../../build/maca_export')
sentence = ['le', 'chat', 'boit', 'du', 'lait']
with macaon.Tagger('fr', 'wlmp.mcd') as tagger:
result = tagger.process(sentence)
print(result)
'''
from __future__ import print_function
import os
import sys
was_setup = False
currentdir = os.path.dirname(os.path.abspath(__file__))
default_language = 'fr'
default_mcd = currentdir + '/../../../maca_data2/fr/eval/wplgfs.mcd'
def setup(model_dir=currentdir + "/../../models-fr", lib_dir=currentdir + "/../../build/maca_export"):
global was_setup
os.environ["MACAON_DIR"] = model_dir
for tool in ['tagger', 'lemmatizer', 'parser']:
sys.path.append(os.path.join(lib_dir, tool))
was_setup = True
class Tagger:
def __init__(self, lang=default_language, mcd=default_mcd):
global was_setup
if not was_setup:
raise Exception('macaon was not setup')
import MacaonTagger
self.tagger = MacaonTagger.MacaonTransTagger(lang, mcd)
def process(self, words):
result = self.tagger.tagmcf('\n'.join(words + ['']))
return [line.split('\t') for line in result.split('\n') if line != '']
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
del self.tagger
class Lemmatizer:
def __init__(self, lang=default_language, mcd=default_mcd):
global was_setup
if not was_setup:
raise Exception('macaon was not setup')
import MacaonLemmatizer
self.lemmatizer = MacaonLemmatizer.MacaonTransLemmatizer(lang, mcd)
def process(self, tagged):
result = self.lemmatizer.lemmatizemcf('\n'.join(['\t'.join(tokens) for tokens in tagged] + ['']))
return [line.split('\t') for line in result.split('\n') if line != '']
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
del self.lemmatizer
class Parser:
def __init__(self, lang=default_language, mcd=default_mcd):
global was_setup
if not was_setup:
raise Exception('macaon was not setup')
import MacaonParser
self.parser = MacaonParser.MacaonTransParser(lang, mcd)
def process(self, lemmatized):
result = self.parser.parsemcf('\n'.join(['\t'.join(tokens) for tokens in lemmatized] + ['']))
words, tags, lemmas, governors, labels, sentences = zip(*[line.split('\t') for line in result.split('\n') if line != ''])
governors = [0 if j == '0' else i + 1 + int(j) for i, j in enumerate(governors)]
return zip([x + 1 for x in range(len(words))], words, tags, lemmas, governors, labels)
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
del self.parser
class Chain:
def __init__(self, lang=default_language, mcd=default_mcd):
self.tagger = Tagger(lang, mcd)
self.lemmatizer = Lemmatizer(lang, mcd)
self.parser = Parser(lang, mcd)
def process(self, sentence, getter=lambda x: x):
words = [getter(x) for x in sentence]
tagged = self.tagger.process(words)
lemmatized = self.lemmatizer.process(tagged)
parsed = self.parser.process(lemmatized)
return parsed
def __enter__(self):
self.tagger.__enter__()
self.lemmatizer.__enter__()
self.parser.__enter__()
return self
def __exit__(self, type, value, traceback):
self.parser.__exit__(type, value, traceback)
self.lemmatizer.__exit__(type, value, traceback)
self.tagger.__exit__(type, value, traceback)
if __name__ == '__main__':
import sys
setup()
with Chain(lang='fr') as chain:
for line in sys.stdin:
parsed = chain.process(line.strip().split())
for i, word, tag, lemma, governor, label in parsed:
print(i, word, tag, lemma, governor, label)
print()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment