Commit 2a1858d8 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

version initiale

parent 1ed9487a
# Training / decoding / evaluation pipeline for one language:
#   CoNLL-U -> projective CoNLL-U -> MCF -> CFF -> model -> decoded MCF -> results
# Usage: make lang=<code>        (for example: make lang=fr)

# Every path below embeds $(lang); without it the rules would silently work
# on files such as "train_.conllu".
ifndef lang
  $(error lang is not set; run for example "make lang=fr")
endif

# Most recipes write their target through a shell redirection. If the command
# fails, the half-written file must be removed, otherwise it looks up to date
# on the next run.
.DELETE_ON_ERROR:

# These targets are commands, not files.
.PHONY: eval clean

out_dir=./out

train_conll=../data/train_$(lang).conllu
train_proj_conll=./out/train_$(lang)_proj.conllu
train_mcf=./out/train_$(lang)_pgle.mcf
train_cff=./out/train_$(lang).cff
train_word_limit=40000

dev_conll=../data/dev_$(lang).conllu
dev_proj_conll=./out/dev_$(lang)_proj.conllu
dev_mcf=./out/dev_$(lang)_pgle.mcf
dev_cff=./out/dev_$(lang).cff
dev_word_limit=5000

test_conll=../data/test_$(lang).conllu
test_mcf=./out/test_$(lang)_pgle.mcf
test_mcf_hyp=./out/test_$(lang)_hyp.mcf
test_word_limit=700

feat_model=basic.fm
dicos=./out/$(lang)_train.dic
model=./out/$(lang).keras
results=./out/$(lang).res
mcd_pgle=PGLE.mcd

# Default goal: evaluate the decoded test set against the reference.
eval: $(results)

$(results): $(test_mcf) $(test_mcf_hyp) $(mcd_pgle) | $(out_dir)
	python3 ../src/eval_mcf.py $(test_mcf) $(test_mcf_hyp) $(mcd_pgle) $(mcd_pgle) $(lang) > $@

# Decoding reads the dictionaries and the feature model as well as the model.
$(test_mcf_hyp): $(test_mcf) $(model) $(dicos) $(feat_model) $(mcd_pgle) | $(out_dir)
	python3 ../src/tbp_decode.py $(test_mcf) $(model) $(dicos) $(feat_model) $(mcd_pgle) $(test_word_limit) > $@

$(model): $(train_cff) $(dev_cff) | $(out_dir)
	python3 ../src/tbp_train.py $(train_cff) $(dev_cff) $@

$(train_cff): $(train_mcf) $(dicos) $(feat_model) $(mcd_pgle) | $(out_dir)
	python3 ../src/mcf2cff.py $(train_mcf) $(feat_model) $(mcd_pgle) $(dicos) $@ $(train_word_limit)

$(dev_cff): $(dev_mcf) $(dicos) $(feat_model) $(mcd_pgle) | $(out_dir)
	python3 ../src/mcf2cff.py $(dev_mcf) $(feat_model) $(mcd_pgle) $(dicos) $@ $(dev_word_limit)

$(train_mcf): $(train_proj_conll) $(mcd_pgle) | $(out_dir)
	python3 ../src/conll2mcf.py $(train_proj_conll) $(mcd_pgle) > $@

$(dev_mcf): $(dev_proj_conll) $(mcd_pgle) | $(out_dir)
	python3 ../src/conll2mcf.py $(dev_proj_conll) $(mcd_pgle) > $@

# The test set is converted directly, without the projectivity filter.
$(test_mcf): $(test_conll) $(mcd_pgle) | $(out_dir)
	python3 ../src/conll2mcf.py $(test_conll) $(mcd_pgle) > $@

$(train_proj_conll): $(train_conll) | $(out_dir)
	python3 ../src/remove_non_projective_sentences_from_conll.py $(train_conll) > $@

$(dev_proj_conll): $(dev_conll) | $(out_dir)
	python3 ../src/remove_non_projective_sentences_from_conll.py $(dev_conll) > $@

$(dicos): $(train_mcf) $(mcd_pgle) | $(out_dir)
	python3 ../src/create_dicos.py $(train_mcf) $(mcd_pgle) $@

# Order-only prerequisite of every generated file: the directory must exist,
# but its timestamp must not trigger rebuilds.
$(out_dir):
	mkdir -p $@

# $(RM) is "rm -f": files that were never built are not an error.
# The model is removed with -r, as in the original "rm -rf".
clean:
	$(RM) $(train_proj_conll) $(train_mcf) $(train_cff)
	$(RM) $(dev_proj_conll) $(dev_mcf) $(dev_cff)
	$(RM) $(test_mcf) $(test_mcf_hyp)
	$(RM) $(dicos) $(results)
	$(RM) -r $(model)
0 POS SYM KEEP
1 GOV INT KEEP
2 LABEL SYM KEEP
3 EOS SYM KEEP
B -2 POS
B -1 POS
B 0 POS
B 1 POS
B 2 POS
S 0 POS
S 1 POS
# Run the pipeline once per language code, in the listed order.
# A failing language does not stop the following ones.
for lang in \
    ar bg bxr ca cs da de el en es et eu \
    fa fi fr ga gl got grc he hi hr hsb hu \
    id it ja kk kmr ko la lv nl no pl pt \
    ro ru sk sl sme sv tr ug uk ur vi zh
do
    make lang="$lang"
done
import sys
from Stack import Stack
from Word import Word
from WordBuffer import WordBuffer
class Config:
    """Parser configuration: a word buffer plus a stack of word indices.

    The stack holds indices into the buffer. The moves (shift, reduce,
    left, right) update the stack, the buffer's current index and the
    'GOV' / 'LABEL' features of the words involved.
    """

    def __init__(self, filename, mcd, dicos):
        """Build the buffer from `filename` described by `mcd`, with an empty stack.

        `dicos` is accepted for interface compatibility; it is not used here.
        """
        self.wb = WordBuffer(filename, mcd)
        self.st = Stack()

    def isFinal(self):
        """Return True when the stack holds only index 0 and the buffer is exhausted."""
        stack = self.getStack()
        buffer = self.getBuffer()
        return (stack.getLength() == 1
                and stack.top() == 0
                and buffer.getCurrentIndex() >= buffer.getLength())

    def getStack(self):
        """Return the stack of word indices."""
        return self.st

    def getBuffer(self):
        """Return the word buffer."""
        return self.wb

    def fwd(self):
        """Advance the buffer by one word; return False if the end was already reached."""
        if self.getBuffer().endReached():
            return False
        self.getBuffer().currentIndex += 1
        return True

    def shift(self):
        """Push the current buffer index on the stack and advance the buffer."""
        if self.getBuffer().endReached():
            sys.stderr.write("cannot shift : end of buffer reached\n")
            return False
        self.getStack().push(self.getBuffer().currentIndex)
        self.fwd()
        return True

    def red(self):
        """Pop the stack top, provided it already has a governor."""
        if self.getStack().isEmpty():
            sys.stderr.write("cannot reduce an empty stack !\n")
            return False
        topWord = self.getBuffer().getWord(self.getStack().top())
        if int(topWord.getFeat('GOV')) == Word.invalidGov():
            sys.stderr.write("cannot reduce the stack if top element does not have a governor !\n")
            return False
        self.getStack().pop()
        return True

    def right(self, label):
        """Attach the current buffer word to the stack top with `label`.

        The dependent's 'GOV' feature is stored as the offset
        (governor index - dependent index). The dependent is then pushed on
        the stack and the buffer advances. Returns the result of fwd(), or
        False when the stack is empty.
        """
        if self.getStack().isEmpty():
            # Diagnostics go to stderr, like shift() and red(): stdout may be
            # redirected into a data file by the calling script.
            sys.stderr.write("cannot make a right move, the stack is empty!\n")
            return False
        govIndex = self.getStack().top()
        depIndex = self.getBuffer().currentIndex
        dependent = self.getBuffer().getCurrentWord()
        dependent.setFeat('LABEL', label)
        dependent.setFeat('GOV', str(govIndex - depIndex))
        self.getBuffer().getWord(govIndex).addRightDaughter(depIndex)
        self.getStack().push(depIndex)
        return self.fwd()

    def left(self, label):
        """Attach the stack top to the current buffer word with `label`, then pop it.

        The dependent's 'GOV' feature is stored as the offset
        (governor index - dependent index). Returns False when the stack is
        empty, True otherwise.
        """
        if self.getStack().isEmpty():
            # See right(): diagnostics must not be written to stdout.
            sys.stderr.write("cannot make a left move, the stack is empty!\n")
            return False
        govIndex = self.getBuffer().currentIndex
        depIndex = self.getStack().top()
        dependent = self.getBuffer().getWord(depIndex)
        dependent.setFeat('LABEL', label)
        dependent.setFeat('GOV', str(govIndex - depIndex))
        self.getBuffer().getCurrentWord().addLeftDaughter(depIndex)
        self.getStack().pop()
        return True

    def applyMvt(self, mvt):
        """Apply the move `mvt` = (type, label); return False for an unknown type."""
        mvt_type = mvt[0]
        mvt_label = mvt[1]
        if mvt_type == 'RIGHT':
            return self.right(mvt_label)
        if mvt_type == 'LEFT':
            return self.left(mvt_label)
        if mvt_type == 'SHIFT':
            return self.shift()
        if mvt_type == 'REDUCE':
            return self.red()
        return False

    def getWordFeat(self, featTuple):
        """Return the value of feature `featTuple` = (container, index, tape).

        For container 'B', `index` is relative to the buffer's current index.
        For any other container, `index` counts from the top of the stack
        (0 is the top). Returns 'NULL' when the addressed position does not
        exist.
        """
        container, index, tape = featTuple[0], featTuple[1], featTuple[2]
        buffer = self.getBuffer()
        if container == 'B':
            absoluteIndex = buffer.getCurrentIndex() + index
            if not 0 <= absoluteIndex < buffer.getLength():
                return 'NULL'
            w = buffer.getWord(absoluteIndex)
        else:
            stack = self.getStack()
            # A negative index would address a slot above the top of the stack.
            if not 0 <= index < stack.getLength():
                return 'NULL'
            w = buffer.getWord(stack.array[stack.getLength() - index - 1])
        if w is None:
            return 'NULL'
        return w.getFeat(tape)

    def affiche(self):
        """Print the buffer window [current - 2, current + 1] and the stack content."""
        currentIndex = self.getBuffer().getCurrentIndex()
        print('BUFFER = ', end = '')
        for i in range(currentIndex - 2, currentIndex + 2):
            if (i >= 0) and (i < len(self.getBuffer().array)):
                if i == currentIndex:
                    print('[[', i, ':', self.getBuffer().getWord(i).getFeat('POS'), ']] ', end = ' ')
                else:
                    print('[', i, ':', self.getBuffer().getWord(i).getFeat('POS'), '] ', end = ' ')
        print('\nSTACK = [', end = '')
        st = self.getStack()
        for elt in st.array:
            print(elt, ' ', end = '')
        print(']')

    def extractFeatVec(self, FeatModel):
        """Return the list of feature values, one per entry of `FeatModel.getArray()`."""
        return [self.getWordFeat(f) for f in FeatModel.getArray()]
class Dico:
    """Named dictionary mapping symbols to integer codes and back.

    Codes are assigned in insertion order, starting at 0.
    """

    def __init__(self, name):
        self.name = name
        self.hash = {}    # symbol -> code
        self.array = []   # code -> symbol

    def add(self, symbol):
        """Insert `symbol` if it is new and return its code."""
        code = self.hash.get(symbol)
        if code is None:
            code = len(self.array)
            self.hash[symbol] = code
            self.array.append(symbol)
        return code

    def getCode(self, symbol):
        """Return the code of `symbol`, or None if it is unknown."""
        return self.hash.get(symbol)

    def getSymbol(self, code):
        """Return the symbol whose code is `code`, or None if no such code exists."""
        # A negative code must not wrap around to the end of the list.
        if 0 <= code < len(self.array):
            return self.array[code]
        return None

    def getSize(self):
        """Return the number of symbols stored."""
        return len(self.array)

    def printToFile(self, dicoFile):
        """Write a '##<name>' header line, then one symbol per line, to `dicoFile`."""
        dicoFile.write('##')
        dicoFile.write(self.name)
        dicoFile.write('\n')
        for symbol in self.array:
            dicoFile.write(symbol)
            dicoFile.write('\n')

    def print(self):
        """Print a '##<name>' header line, then each symbol with its code."""
        print('##', self.name, sep = '')
        for code, symbol in enumerate(self.array):
            print(symbol, code)
from Dico import Dico
class Dicos:
    """Collection of named Dico objects, one per symbolic column."""

    def __init__(self, mcd=False, fileName=False, verbose=False):
        """Create the collection.

        mcd      -- if given, one dictionary is created for every column whose
                    status is 'KEEP' and type is 'SYM'; each starts with the
                    symbols 'NULL' and 'ROOT'.
        fileName -- if given, dictionaries are loaded from that file, in the
                    format written by printToFile ('##<name>' header lines
                    followed by one symbol per line).
        verbose  -- accepted for interface compatibility; not used.
        """
        self.content = {}
        if mcd:
            for index in range(mcd.getNbCol()):
                if mcd.getColStatus(index) == 'KEEP' and mcd.getColType(index) == 'SYM':
                    dico = self.addDico(mcd.getColName(index))
                    dico.add('NULL')
                    dico.add('ROOT')
        if fileName:
            try:
                dicoFile = open(fileName, encoding='utf-8')
            except IOError:
                print(fileName, 'does not exist')
                exit(1)
            with dicoFile:
                currentDico = None
                for ligne in dicoFile:
                    # Strip only the line terminator: the last line may lack one.
                    ligne = ligne.rstrip('\n')
                    if ligne.startswith('##'):
                        # addDico returns the existing dictionary when there is
                        # one, and creates it otherwise (e.g. no mcd was given).
                        currentDico = self.addDico(ligne[2:])
                    elif currentDico is None:
                        print(fileName, ': symbol found before any ## header')
                        exit(1)
                    else:
                        currentDico.add(ligne)

    def populateFromMcfFile(self, mcfFilename, mcd, verbose=False):
        """Add every value found in the 'SYM'/'KEEP' columns of an MCF file.

        Columns are tab-separated and interpreted through `mcd`. Prints the
        size of every dictionary once the file has been read. `verbose` is
        not used.
        """
        try:
            mcfFile = open(mcfFilename, encoding='utf-8')
        except IOError:
            print('cannot open', mcfFilename)
            exit(1)
        with mcfFile:
            nbCol = mcd.getNbCol()
            for ligne in mcfFile:
                tokens = ligne.rstrip('\n\r').split("\t")
                # Columns beyond those described by the mcd are ignored.
                for i, token in enumerate(tokens[:nbCol]):
                    if mcd.getColType(i) == 'SYM' and mcd.getColStatus(i) == 'KEEP':
                        self.add(mcd.getColName(i), token)
        for e in self.content:
            print('DICO', e, ':\t', self.content[e].getSize(), 'entries')

    def print(self):
        """Print every dictionary on standard output."""
        for dico in self.content.values():
            dico.print()

    def printToFile(self, filename):
        """Write every dictionary to `filename`, overwriting it."""
        try:
            dicoFile = open(filename, 'w', encoding='utf-8')
        except IOError:
            print('cannot open', filename)
            exit(1)
        with dicoFile:
            for dico in self.content.values():
                dico.printToFile(dicoFile)

    def getDico(self, dicoName):
        """Return the dictionary called `dicoName`, or None if there is none."""
        return self.content.get(dicoName)

    def addDico(self, dicoName):
        """Return the dictionary called `dicoName`, creating it if needed."""
        if dicoName in self.content:
            return self.content[dicoName]
        dico = Dico(dicoName)
        self.content[dicoName] = dico
        return dico

    def getCode(self, dicoName, symbol):
        """Return the code of `symbol` in `dicoName`; None if either is unknown."""
        dico = self.getDico(dicoName)
        if dico is None:
            return None
        return dico.getCode(symbol)

    def getSymbol(self, dicoName, code):
        """Return the symbol with code `code` in `dicoName`; None if the dictionary is unknown."""
        dico = self.getDico(dicoName)
        if dico is None:
            return None
        return dico.getSymbol(code)

    def add(self, dicoName, symbol):
        """Add `symbol` to `dicoName` and return its code; None if the dictionary is unknown."""
        dico = self.getDico(dicoName)
        if dico is None:
            return None
        return dico.add(symbol)
import numpy as np
class FeatModel:
    """Feature model: an ordered list of (container, position, wordFeature) triples.

    The model is read from a text file with one feature per line, made of
    three whitespace-separated fields: a container ('B' or 'S'), an integer
    position and a word feature ('POS', 'LEMMA' or 'FORM').
    """

    def __init__(self, featModFilename, dicos):
        """Read the features from `featModFilename` and compute the input size from `dicos`."""
        # Per-instance state: a list defined at class level would be shared by
        # every FeatModel and would accumulate features across instances.
        self.array = []
        self.nbFeat = 0
        self.inputVectorSize = None
        try:
            featModFile = open(featModFilename, encoding='utf-8')
        except IOError:
            print(featModFilename, " : ce fichier n'existe pas")
            exit(1)
        with featModFile:
            for ligne in featModFile:
                fields = ligne.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                (container, position, wordFeature) = fields
                if container != "B" and container != "S":
                    print("error while reading featMod file : ", featModFilename, "container :", container, "undefined")
                    exit(1)
                if wordFeature not in ('POS', 'LEMMA', 'FORM'):
                    print("error while reading featMod file : ", featModFilename, "wordFeature :", wordFeature, "undefined")
                    exit(1)
                self.array.append((container, int(position), wordFeature))
                self.nbFeat += 1
        self.inputVectorSize = self.computeInputSize(dicos)

    def computeInputSize(self, dicos):
        """Return the sum, over all features, of the size of the feature's dictionary."""
        inputVectorSize = 0
        for featTuple in self.getArray():
            feat = featTuple[2]
            dico = dicos.getDico(feat)
            if dico is None:
                print("cannot find", feat, "in dicos")
                exit(1)
            inputVectorSize += dico.getSize()
        return inputVectorSize

    def getInputSize(self):
        """Return the length of the vectors built by buildInputVector."""
        return self.inputVectorSize

    def getNbFeat(self):
        """Return the number of features."""
        return self.nbFeat

    def getArray(self):
        """Return the list of (container, position, wordFeature) triples."""
        return self.array

    def getFeatContainer(self, featIndex):
        """Return the container of feature number `featIndex`."""
        return self.array[featIndex][0]

    def getFeatPosition(self, featIndex):
        """Return the position of feature number `featIndex`."""
        return self.array[featIndex][1]

    def getFeatWordFeature(self, featIndex):
        """Return the word feature name of feature number `featIndex`."""
        return self.array[featIndex][2]

    def buildInputVector(self, featVec, dicos):
        """Encode `featVec` as a concatenation of one-hot blocks, one per feature.

        Block i has the size of the dictionary of feature i, and the cell at
        the code of featVec[i] is set to 1. A value absent from the
        dictionary leaves its block at zero.
        """
        inputVector = np.zeros(self.inputVectorSize, dtype="int32")
        origin = 0
        for i in range(self.getNbFeat()):
            featureName = self.getFeatWordFeature(i)
            size = dicos.getDico(featureName).getSize()
            position = dicos.getCode(featureName, featVec[i])
            # getCode returns None for a symbol that is not in the dictionary;
            # adding None to the offset would raise a TypeError.
            if position is not None:
                inputVector[origin + position] = 1
            origin += size
        return inputVector
class Mcd:
    """Column description of a multi-column file.

    The description is read from a text file with one column per line, made
    of four whitespace-separated fields: column number, name, type ('INT' or
    'SYM') and status ('KEEP' or 'IGNORE').
    """

    def __init__(self, mcdFilename):
        """Read the column descriptions from `mcdFilename`."""
        # Per-instance state: a list defined at class level would be shared by
        # every Mcd and would accumulate columns across instances.
        self.array = []
        self.nbCol = 0
        try:
            mcdFile = open(mcdFilename, encoding='utf-8')
        except IOError:
            print(mcdFilename, " : ce fichier n'existe pas")
            exit(1)
        with mcdFile:
            for ligne in mcdFile:
                fields = ligne.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                (col, name, colType, status) = fields
                if status != "KEEP" and status != "IGNORE":
                    print("error while reading mcd file : ", mcdFilename, "status :", status, "undefined")
                    exit(1)
                if colType != "INT" and colType != "SYM":
                    print("error while reading mcd file : ", mcdFilename, "type :", colType, "undefined")
                    exit(1)
                self.array.append((int(col), name, colType, status))
                # The column count is derived from the last line read; it
                # stays 0 for an empty file.
                self.nbCol = int(col) + 1

    def getNbCol(self):
        """Return the number of columns (number of the last described column + 1)."""
        return self.nbCol

    def getArray(self):
        """Return the list of (col, name, type, status) tuples, in file order."""
        return self.array

    def getColName(self, colIndex):
        """Return the name of the column described at position `colIndex`."""
        return self.array[colIndex][1]

    def getColType(self, colIndex):
        """Return the type ('INT' or 'SYM') of the column described at position `colIndex`."""
        return self.array[colIndex][2]

    def getColStatus(self, colIndex):
        """Return the status ('KEEP' or 'IGNORE') of the column described at position `colIndex`."""
        return self.array[colIndex][3]

    def locateCol(self, name):
        """Return the position of the first column called `name`, or None."""
        for colIndex, column in enumerate(self.array):
            if column[1] == name:
                return colIndex
        return None
import numpy as np
class Moves:
    """Bidirectional mapping between parser moves and integer codes.

    Codes 0, 1 and 2 are SHIFT, REDUCE and ROOT. For a label whose code in
    the 'LABEL' dictionary is k, RIGHT is 3 + 2k and LEFT is 3 + 2k + 1.
    """

    nb = 0

    def __init__(self, dicos):
        """Fetch the 'LABEL' dictionary from `dicos` and compute the number of moves."""
        self.dicoLabels = dicos.getDico('LABEL')
        if self.dicoLabels is None:
            print("cannot find LABEL in dicos")
            exit(1)
        self.nb = 2 * self.dicoLabels.getSize() + 3

    def getNb(self):
        """Return the number of distinct move codes."""
        return self.nb

    def mvtCode(self, mvt):
        """Return the integer code of the move `mvt` = (type, label).

        Exits with an error message when the label of a RIGHT/LEFT move is
        not in the dictionary, or when the move type is unknown.
        """
        mvtType = mvt[0]
        mvtLabel = mvt[1]
        if mvtType == 'SHIFT': return 0
        if mvtType == 'REDUCE': return 1
        if mvtType == 'ROOT': return 2
        labelCode = self.dicoLabels.getCode(mvtLabel)
        # Compare with None explicitly: 0 is a valid label code.
        if labelCode is None:
            print("cannot compute code of movement ", mvt, "label ", mvtLabel, "unknown")
            exit(1)
        if mvtType == 'RIGHT': return 3 + 2 * labelCode
        if mvtType == 'LEFT': return 3 + 2 * labelCode + 1
        # Falling through would return None, which buildOutputVector would
        # use as an index that selects the whole vector.
        print("cannot compute code of movement ", mvt, "type ", mvtType, "unknown")
        exit(1)

    def mvtDecode(self, mvt_Code):
        """Return the move (type, label) whose code is `mvt_Code`."""
        if mvt_Code == 0: return ('SHIFT', 'NULL')
        if mvt_Code == 1: return ('REDUCE', 'NULL')
        if mvt_Code == 2: return ('ROOT', 'NULL')
        if mvt_Code % 2 == 0:  # even codes from 4 upwards are LEFT moves
            labelCode = (mvt_Code - 4) // 2
            return ('LEFT', self.dicoLabels.getSymbol(labelCode))
        # odd codes from 3 upwards are RIGHT moves
        labelCode = (mvt_Code - 3) // 2
        return ('RIGHT', self.dicoLabels.getSymbol(labelCode))

    def buildOutputVector(self, mvt):
        """Return a one-hot int32 vector of length getNb() with a 1 at the code of `mvt`."""
        outputVector = np.zeros(self.nb, dtype="int32")
        codeMvt = self.mvtCode(mvt)
        outputVector[codeMvt] = 1
        return outputVector
from Config import Config
from Dicos import Dicos
from Word import Word
def check_all_dependents_of_word_in_ref_are_in_hyp(c, wordIndex):
"""As its name suggests, this function checks that all the dependents of a word have been found.
this function is called by the oracle to predict a ROOT and a REDUCE action
"""
depIndex = wordIndex - 1
# print('target =', wordIndex)
# look for a dependent of word to its left in reference
while (depIndex >=0) :
# print('depIndex = ', depIndex)
govRefIndex = int(c.getBuffer().getWord(depIndex).getFeat('GOVREF')) + depIndex
# print("govRefIndex = ", govRefIndex)
if govRefIndex == wordIndex : # dep is a dependent of word in ref
#check that dep has the same governor in hyp
govHypIndex = int(c.getBuffer().getWord(depIndex).getFeat('GOV')) + depIndex
# print(depIndex, 'is dependent ')
if govHypIndex != govRefIndex :
# print('wrong gov (', govHypIndex, ')');