Skip to content
Snippets Groups Projects
Commit 2a1858d8 authored by Alexis Nasr
Browse files

version initiale

parent 1ed9487a
Branches
No related tags found
No related merge requests found
# Build chain for one language, selected with `make lang=<code>`:
#   conllu -> projective conllu -> mcf -> cff -> model -> decoded mcf -> scores
# Intermediate and final files are written under ./out.

# Several recipes write their target through a shell redirection, which creates
# the file even when the script fails; delete such half-written targets so the
# next run does not consider them up to date.
.DELETE_ON_ERROR:

# eval and clean are commands, not files.
.PHONY: eval clean

train_conll=../data/train_$(lang).conllu
train_proj_conll=./out/train_$(lang)_proj.conllu
train_mcf=./out/train_$(lang)_pgle.mcf
train_cff=./out/train_$(lang).cff
train_word_limit=40000
dev_conll=../data/dev_$(lang).conllu
dev_proj_conll=./out/dev_$(lang)_proj.conllu
dev_mcf=./out/dev_$(lang)_pgle.mcf
dev_cff=./out/dev_$(lang).cff
dev_word_limit=5000
test_conll=../data/test_$(lang).conllu
test_mcf=./out/test_$(lang)_pgle.mcf
test_mcf_hyp=./out/test_$(lang)_hyp.mcf
test_word_limit=700
feat_model=basic.fm
dicos=./out/$(lang)_train.dic
model=./out/$(lang).keras
results = ./out/$(lang).res
mcd_pgle=PGLE.mcd

# Default goal: score the decoded test file against the reference.
eval: $(test_mcf_hyp)
	python3 ../src/eval_mcf.py $(test_mcf) $(test_mcf_hyp) $(mcd_pgle) $(mcd_pgle) $(lang) > $(results)

$(test_mcf_hyp): $(test_mcf) $(model)
	python3 ../src/tbp_decode.py $(test_mcf) $(model) $(dicos) $(feat_model) $(mcd_pgle) $(test_word_limit) > $(test_mcf_hyp)

$(model): $(train_cff) $(dev_cff)
	python3 ../src/tbp_train.py $(train_cff) $(dev_cff) $(model)

$(train_cff): $(train_mcf) $(dicos)
	python3 ../src/mcf2cff.py $(train_mcf) $(feat_model) $(mcd_pgle) $(dicos) $(train_cff) $(train_word_limit)

$(dev_cff): $(dev_mcf) $(dicos)
	python3 ../src/mcf2cff.py $(dev_mcf) $(feat_model) $(mcd_pgle) $(dicos) $(dev_cff) $(dev_word_limit)

$(train_mcf): $(train_proj_conll)
	python3 ../src/conll2mcf.py $(train_proj_conll) $(mcd_pgle) > $(train_mcf)

$(dev_mcf): $(dev_proj_conll)
	python3 ../src/conll2mcf.py $(dev_proj_conll) $(mcd_pgle) > $(dev_mcf)

# The three rules below read the source treebanks; listing them as
# prerequisites makes the outputs rebuild when the data changes.
$(test_mcf): $(test_conll)
	python3 ../src/conll2mcf.py $(test_conll) $(mcd_pgle) > $(test_mcf)

$(train_proj_conll): $(train_conll)
	python3 ../src/remove_non_projective_sentences_from_conll.py $(train_conll) > $(train_proj_conll)

$(dev_proj_conll): $(dev_conll)
	python3 ../src/remove_non_projective_sentences_from_conll.py $(dev_conll) > $(dev_proj_conll)

$(dicos): $(train_mcf)
	python3 ../src/create_dicos.py $(train_mcf) $(mcd_pgle) $(dicos)

# rm -f stays quiet about files that were never built; -r is kept because the
# original removed $(model) recursively.
clean:
	rm -f $(train_proj_conll) $(train_mcf) $(train_cff)
	rm -f $(dev_proj_conll) $(dev_mcf) $(dev_cff)
	rm -f $(test_mcf) $(test_mcf_hyp) $(dicos)
	rm -rf $(model)
0 POS SYM KEEP
1 GOV INT KEEP
2 LABEL SYM KEEP
3 EOS SYM KEEP
B -2 POS
B -1 POS
B 0 POS
B 1 POS
B 2 POS
S 0 POS
S 1 POS
# Run the build once per language code, in the same order as before.
# A failing language does not stop the following ones.
for lang in \
    ar bg bxr ca cs da de el en es et eu \
    fa fi fr ga gl got grc he hi hr hsb hu \
    id it ja kk kmr ko la lv nl no pl pt \
    ro ru sk sl sme sv tr ug uk ur vi zh
do
    make lang=$lang
done
import sys
from Stack import Stack
from Word import Word
from WordBuffer import WordBuffer
class Config:
    """State of the shift/reduce machine: a stack of word indices and a word buffer.

    The stack (``self.st``) stores indices into the buffer (``self.wb``).
    Every transition method returns True when it was applied and False when
    it could not be applied; a refused transition leaves the state unchanged
    and reports the reason on stderr.
    """

    def __init__(self, filename, mcd, dicos):
        """Create an empty stack and a word buffer reading ``filename``.

        ``dicos`` is not used by this class; the parameter is kept so that
        existing callers keep working.
        """
        self.wb = WordBuffer(filename, mcd)
        self.st = Stack()

    def isFinal(self):
        """Return True when only index 0 is on the stack and the buffer is consumed."""
        stack = self.getStack()
        buffer = self.getBuffer()
        return (stack.getLength() == 1
                and stack.top() == 0
                and buffer.getCurrentIndex() >= buffer.getLength())

    def getStack(self):
        """Return the stack of word indices."""
        return self.st

    def getBuffer(self):
        """Return the word buffer."""
        return self.wb

    def fwd(self):
        """Advance the buffer by one word; return False if already at the end."""
        if self.getBuffer().endReached():
            return False
        self.getBuffer().currentIndex += 1
        return True

    def shift(self):
        """Push the current buffer index on the stack and advance the buffer."""
        if self.getBuffer().endReached():
            sys.stderr.write("cannot shift : end of buffer reached\n")
            return False
        self.getStack().push(self.getBuffer().currentIndex)
        self.fwd()
        return True

    def red(self):
        """Pop the stack top, provided that word already has a governor."""
        if self.getStack().isEmpty():
            sys.stderr.write("cannot reduce an empty stack !\n")
            return False
        topWord = self.getBuffer().getWord(self.getStack().top())
        if int(topWord.getFeat('GOV')) == Word.invalidGov():
            sys.stderr.write("cannot reduce the stack if top element does not have a governor !\n")
            return False
        self.getStack().pop()
        return True

    def right(self, label):
        """Attach the current buffer word to the stack top, then push it and advance.

        GOV is stored as a string holding the offset from the dependent to
        its governor.
        """
        if self.getStack().isEmpty():
            sys.stderr.write("cannot make a right move, the stack is empty!\n")
            return False
        # Without a current word there is nothing to attach; previously this
        # dereferenced None.
        if self.getBuffer().endReached():
            sys.stderr.write("cannot make a right move : end of buffer reached\n")
            return False
        govIndex = self.getStack().top()
        depIndex = self.getBuffer().currentIndex
        dependent = self.getBuffer().getCurrentWord()
        dependent.setFeat('LABEL', label)
        dependent.setFeat('GOV', str(govIndex - depIndex))
        self.getBuffer().getWord(govIndex).addRightDaughter(depIndex)
        self.getStack().push(depIndex)
        return self.fwd()

    def left(self, label):
        """Attach the stack top to the current buffer word, then pop the stack."""
        if self.getStack().isEmpty():
            sys.stderr.write("cannot make a left move, the stack is empty!\n")
            return False
        # Checked before any feature is written so a refused move changes nothing.
        if self.getBuffer().endReached():
            sys.stderr.write("cannot make a left move : end of buffer reached\n")
            return False
        govIndex = self.getBuffer().currentIndex
        depIndex = self.getStack().top()
        dependent = self.getBuffer().getWord(depIndex)
        dependent.setFeat('LABEL', label)
        dependent.setFeat('GOV', str(govIndex - depIndex))
        self.getBuffer().getCurrentWord().addLeftDaughter(depIndex)
        self.getStack().pop()
        return True

    def applyMvt(self, mvt):
        """Apply the movement ``(type, label)``; unknown types return False."""
        mvt_type = mvt[0]
        mvt_label = mvt[1]
        if mvt_type == 'RIGHT':
            return self.right(mvt_label)
        if mvt_type == 'LEFT':
            return self.left(mvt_label)
        if mvt_type == 'SHIFT':
            return self.shift()
        if mvt_type == 'REDUCE':
            return self.red()
        return False

    def getWordFeat(self, featTuple):
        """Return the value of the feature ``(container, index, tape)``.

        For container 'B' the index is relative to the current buffer
        position; for any other container it counts down from the stack top.
        'NULL' is returned when the addressed word does not exist.
        """
        container = featTuple[0]
        index = featTuple[1]
        tape = featTuple[2]
        buffer = self.getBuffer()
        if container == 'B':
            absoluteIndex = buffer.getCurrentIndex() + index
            if not (0 <= absoluteIndex < buffer.getLength()):
                return 'NULL'
            w = buffer.getWord(absoluteIndex)
        else:
            stack = self.getStack()
            if not index < stack.getLength():
                return 'NULL'
            w = buffer.getWord(stack.array[stack.getLength() - index - 1])
            if w is None:
                return 'NULL'
        return w.getFeat(tape)

    def affiche(self):
        """Print a window of the buffer around the current index, then the stack."""
        buffer = self.getBuffer()
        currentIndex = buffer.getCurrentIndex()
        print('BUFFER = ', end = '')
        for i in range(currentIndex - 2, currentIndex + 2):
            if 0 <= i < len(buffer.array):
                pos = buffer.getWord(i).getFeat('POS')
                if i == currentIndex:
                    print('[[', i, ':', pos, ']] ', end = ' ')
                else:
                    print('[', i, ':', pos, '] ', end = ' ')
        print('\nSTACK = [', end = '')
        for elt in self.getStack().array:
            print(elt, ' ', end = '')
        print(']')

    def extractFeatVec(self, FeatModel):
        """Return the feature values for every feature of ``FeatModel``, in order."""
        return [self.getWordFeat(f) for f in FeatModel.getArray()]
class Dico:
    """Named symbol table assigning consecutive integer codes to symbols.

    ``hash`` maps a symbol to its code and ``array`` maps a code back to its
    symbol; codes are handed out in insertion order starting at 0.
    """

    def __init__(self, name):
        self.name = name
        self.hash = {}
        self.array = []

    def add(self, symbol):
        """Return the code of ``symbol``, registering it first if it is new."""
        try:
            return self.hash[symbol]
        except KeyError:
            code = len(self.array)
            self.hash[symbol] = code
            self.array.append(symbol)
            return code

    def getCode(self, symbol):
        """Return the code of ``symbol``, or None when it is unknown."""
        return self.hash.get(symbol)

    def getSymbol(self, code):
        """Return the symbol stored at ``code``, or None when ``code`` is past the end."""
        return self.array[code] if code < len(self.array) else None

    def getSize(self):
        """Return the number of registered symbols."""
        return len(self.array)

    def printToFile(self, dicoFile):
        """Write a ``##name`` header line followed by one symbol per line."""
        for chunk in ('##', self.name, '\n'):
            dicoFile.write(chunk)
        for symbol in self.array:
            dicoFile.write(symbol)
            dicoFile.write('\n')

    def print(self):
        """Print the ``##name`` header, then each symbol with its code."""
        print('##', self.name, sep = '')
        for code, symbol in enumerate(self.array):
            print(symbol, code)
from Dico import Dico
class Dicos:
    """Collection of Dico symbol tables, indexed by name in ``self.content``."""

    def __init__(self, mcd=False, fileName=False, verbose=False):
        """Build the collection.

        mcd      -- when given, one dico is created for every column whose
                    status is 'KEEP' and whose type is 'SYM'; each starts with
                    the symbols 'NULL' and 'ROOT'.
        fileName -- when given, a file in the format written by printToFile
                    ('##name' header lines followed by one symbol per line)
                    is loaded on top of that.
        verbose  -- accepted for compatibility; not used.
        """
        self.content = {}
        if mcd:
            for index in range(mcd.getNbCol()):
                if mcd.getColStatus(index) == 'KEEP' and mcd.getColType(index) == 'SYM':
                    dico = self.addDico(mcd.getColName(index))
                    dico.add('NULL')
                    dico.add('ROOT')
        if fileName:
            try:
                dicoFile = open(fileName, encoding='utf-8')
            except IOError:
                print(fileName, 'does not exist')
                exit(1)
            with dicoFile:
                currentDico = None
                for ligne in dicoFile:
                    # Strip only the line terminator: slicing off the last
                    # character lost one letter of a final unterminated line.
                    ligne = ligne.rstrip('\n\r')
                    if ligne.startswith('##'):
                        # addDico returns the existing dico or creates it, so a
                        # file naming a dico not declared by the mcd still loads.
                        currentDico = self.addDico(ligne[2:])
                    elif currentDico is not None:
                        currentDico.add(ligne)
                    # Symbols found before the first header belong to no dico
                    # and are skipped.

    def populateFromMcfFile(self, mcfFilename, mcd, verbose=False):
        """Add every KEEP/SYM column value of ``mcfFilename`` to the matching dico.

        Prints the size of each dico once the file has been read.
        """
        try:
            mcfFile = open(mcfFilename, encoding='utf-8')
        except IOError:
            print('cannot open', mcfFilename)
            exit(1)
        with mcfFile:
            for ligne in mcfFile:
                tokens = ligne.rstrip('\n\r').split("\t")
                for i, token in enumerate(tokens):
                    if mcd.getColType(i) == 'SYM' and mcd.getColStatus(i) == 'KEEP':
                        self.add(mcd.getColName(i), token)
        for e in self.content:
            print('DICO', e, ':\t', self.content[e].getSize(), 'entries')

    def print(self):
        """Print every dico on standard output."""
        for dico in self.content.values():
            dico.print()

    def printToFile(self, filename):
        """Write every dico to ``filename``, one after the other."""
        try:
            dicoFile = open(filename, 'w', encoding='utf-8')
        except IOError:
            print('cannot open', filename)
            exit(1)
        with dicoFile:
            for dico in self.content.values():
                dico.printToFile(dicoFile)

    def getDico(self, dicoName):
        """Return the dico called ``dicoName``, or None when there is none."""
        return self.content.get(dicoName)

    def addDico(self, dicoName):
        """Return the dico called ``dicoName``, creating it when needed."""
        if dicoName in self.content:
            return self.content[dicoName]
        dico = Dico(dicoName)
        self.content[dicoName] = dico
        return dico

    def getCode(self, dicoName, symbol):
        """Return the code of ``symbol`` in ``dicoName``; None if dico or symbol is unknown."""
        dico = self.getDico(dicoName)
        if dico is None:
            return None
        return dico.getCode(symbol)

    def getSymbol(self, dicoName, code):
        """Return the symbol stored at ``code`` in ``dicoName``; None if the dico is unknown."""
        # Both calls used to omit their argument and raised TypeError.
        dico = self.getDico(dicoName)
        if dico is None:
            return None
        return dico.getSymbol(code)

    def add(self, dicoName, symbol):
        """Add ``symbol`` to ``dicoName`` and return its code; None if the dico is unknown."""
        dico = self.getDico(dicoName)
        if dico is None:
            return None
        return dico.add(symbol)
import numpy as np
class FeatModel:
    """Feature model: an ordered list of (container, position, wordFeature) triples.

    The model is read from a text file holding one whitespace-separated triple
    per line, e.g. ``B -1 POS``.  ``container`` is 'B' or 'S' and
    ``wordFeature`` is one of POS, LEMMA or FORM.
    """

    # Class-level defaults kept for backward compatibility; __init__ rebinds
    # all three on the instance.
    array = []
    nbFeat = 0
    inputVectorSize = None

    def __init__(self, featModFilename, dicos):
        """Read ``featModFilename`` and compute the input size from ``dicos``.

        Prints a message and exits with status 1 when the file cannot be
        opened or contains an unknown container or word feature.
        """
        # A fresh list per instance: appending through self.array used to
        # mutate the list shared by the whole class, so a second FeatModel
        # inherited the features of the first one.
        self.array = []
        self.nbFeat = 0
        try:
            featModFile = open(featModFilename, encoding='utf-8')
        except IOError:
            print(featModFilename, " : ce fichier n'existe pas")
            exit(1)
        with featModFile:
            for ligne in featModFile:
                if not ligne.strip():
                    # tolerate blank lines instead of failing on the unpack
                    continue
                (container, position, wordFeature) = ligne.split()
                if container != "B" and container != "S":
                    print("error while reading featMod file : ", featModFilename, "container :", container, "undefined")
                    exit(1)
                if wordFeature not in ('POS', 'LEMMA', 'FORM'):
                    print("error while reading featMod file : ", featModFilename, "wordFeature :", wordFeature, "undefined")
                    exit(1)
                self.array.append((container, int(position), wordFeature))
                self.nbFeat += 1
        self.inputVectorSize = self.computeInputSize(dicos)

    def computeInputSize(self, dicos):
        """Return the summed dico sizes of all features (one one-hot slot each)."""
        return sum(dicos.getDico(featTuple[2]).getSize() for featTuple in self.getArray())

    def getInputSize(self):
        """Return the length of the vectors produced by buildInputVector."""
        return self.inputVectorSize

    def getNbFeat(self):
        """Return the number of features in the model."""
        return self.nbFeat

    def getArray(self):
        """Return the list of (container, position, wordFeature) triples."""
        return self.array

    def getFeatContainer(self, featIndex):
        """Return the container ('B' or 'S') of feature ``featIndex``."""
        return self.array[featIndex][0]

    def getFeatPosition(self, featIndex):
        """Return the integer position of feature ``featIndex``."""
        return self.array[featIndex][1]

    def getFeatWordFeature(self, featIndex):
        """Return the word feature name of feature ``featIndex``."""
        return self.array[featIndex][2]

    def buildInputVector(self, featVec, dicos):
        """Return the concatenation of one one-hot segment per feature value.

        ``featVec[i]`` is the symbol observed for feature ``i``; its code in
        the matching dico selects the position set to 1 inside that segment.
        NOTE(review): a symbol missing from its dico yields a None code and a
        TypeError on the index computation below - confirm whether unseen
        symbols can reach this point.
        """
        inputVector = np.zeros(self.inputVectorSize, dtype="int32")
        origin = 0
        for i in range(self.getNbFeat()):
            featureName = self.getFeatWordFeature(i)
            size = dicos.getDico(featureName).getSize()
            position = dicos.getCode(featureName, featVec[i])
            inputVector[origin + position] = 1
            origin += size
        return inputVector
class Mcd:
    """Column description of an mcf file.

    The description is read from a text file with one line per column:
    ``<column number> <name> <type> <status>`` where type is INT or SYM and
    status is KEEP or IGNORE.
    """

    # Class-level defaults kept for backward compatibility; __init__ rebinds
    # both on the instance.
    array = []
    nbCol = 0

    def __init__(self, mcdFilename):
        """Read ``mcdFilename``.

        Prints a message and exits with status 1 when the file cannot be
        opened or holds an unknown type or status.
        """
        # A fresh list per instance: appending through self.array used to
        # extend the list shared by the whole class, so two Mcd objects
        # (as created by the evaluation script) ended up with each other's columns.
        self.array = []
        # Stays 0 for an empty file; the previous code failed on an unbound name.
        self.nbCol = 0
        try:
            mcdFile = open(mcdFilename, encoding='utf-8')
        except IOError:
            print(mcdFilename, " : ce fichier n'existe pas")
            exit(1)
        with mcdFile:
            for ligne in mcdFile:
                if not ligne.strip():
                    # tolerate blank lines instead of failing on the unpack
                    continue
                (col, name, colType, status) = ligne.split()
                if status != "KEEP" and status != "IGNORE":
                    print("error while reading mcd file : ", mcdFilename, "status :", status, "undefined")
                    exit(1)
                if colType != "INT" and colType != "SYM":
                    print("error while reading mcd file : ", mcdFilename, "type :", colType, "undefined")
                    exit(1)
                self.array.append((int(col), name, colType, status))
                # As before, the column count is derived from the number of
                # the last line read.
                self.nbCol = int(col) + 1

    def getNbCol(self):
        """Return the number of columns."""
        return self.nbCol

    def getArray(self):
        """Return the list of (column number, name, type, status) tuples."""
        return self.array

    def getColName(self, colIndex):
        """Return the name of the column stored at position ``colIndex``."""
        return self.array[colIndex][1]

    def getColType(self, colIndex):
        """Return the type ('INT' or 'SYM') of the column at position ``colIndex``."""
        return self.array[colIndex][2]

    def getColStatus(self, colIndex):
        """Return the status ('KEEP' or 'IGNORE') of the column at position ``colIndex``."""
        return self.array[colIndex][3]

    def locateCol(self, name):
        """Return the position of the column called ``name``, or None."""
        for colIndex, column in enumerate(self.array):
            if column[1] == name:
                return colIndex
        return None
import numpy as np
class Moves:
    """Numbering of the movements of the machine.

    Codes: 0 SHIFT, 1 REDUCE, 2 ROOT, then for every label code ``l`` of the
    LABEL dico, ``3 + 2*l`` for RIGHT and ``3 + 2*l + 1`` for LEFT.
    """

    nb = 0

    def __init__(self, dicos):
        """Fetch the LABEL dico from ``dicos``; exit with status 1 if it is missing."""
        self.dicoLabels = dicos.getDico('LABEL')
        if self.dicoLabels is None:
            print("cannot find LABEL in dicos")
            exit(1)
        self.nb = 2 * self.dicoLabels.getSize() + 3

    def getNb(self):
        """Return the total number of movement codes."""
        return self.nb

    def mvtCode(self, mvt):
        """Return the integer code of the movement ``(type, label)``.

        Exits with status 1 when the label of a RIGHT/LEFT movement is not in
        the LABEL dico or when the movement type is unknown.
        """
        mvtType = mvt[0]
        mvtLabel = mvt[1]
        if mvtType == 'SHIFT':
            return 0
        if mvtType == 'REDUCE':
            return 1
        if mvtType == 'ROOT':
            return 2
        labelCode = self.dicoLabels.getCode(mvtLabel)
        # Compare with None: 0 is a valid label code and must not be
        # mistaken for "unknown".
        if labelCode is None:
            print("cannot compute code of movement ", mvt, "label ", mvtLabel, "unknown")
            exit(1)
        if mvtType == 'RIGHT':
            return 3 + 2 * labelCode
        if mvtType == 'LEFT':
            return 3 + 2 * labelCode + 1
        # Falling through used to return None, which buildOutputVector would
        # then use as an array index.
        print("cannot compute code of movement ", mvt, "type ", mvtType, "unknown")
        exit(1)

    def mvtDecode(self, mvt_Code):
        """Return the ``(type, label)`` movement whose code is ``mvt_Code``."""
        if mvt_Code == 0:
            return ('SHIFT', 'NULL')
        if mvt_Code == 1:
            return ('REDUCE', 'NULL')
        if mvt_Code == 2:
            return ('ROOT', 'NULL')
        if mvt_Code % 2 == 0:
            # even codes from 4 upwards are LEFT movements
            return ('LEFT', self.dicoLabels.getSymbol((mvt_Code - 4) // 2))
        return ('RIGHT', self.dicoLabels.getSymbol((mvt_Code - 3) // 2))

    def buildOutputVector(self, mvt):
        """Return a one-hot int32 vector of length ``nb`` selecting ``mvt``."""
        outputVector = np.zeros(self.nb, dtype="int32")
        outputVector[self.mvtCode(mvt)] = 1
        return outputVector
from Config import Config
from Dicos import Dicos
from Word import Word
def check_all_dependents_of_word_in_ref_are_in_hyp(c, wordIndex):
    """Tell whether every reference dependent of a word is already attached to it.

    For each other word of the buffer whose reference governor (GOVREF) is
    ``wordIndex``, the hypothesis governor (GOV) must designate the same
    word.  GOVREF and GOV hold offsets relative to the dependent's own index.

    The oracle calls this before predicting a REDUCE.

    Returns False as soon as one dependent is missing, True otherwise.
    """
    buffer = c.getBuffer()
    # Same visiting order as before: leftwards from the word, then rightwards.
    leftSide = range(wordIndex - 1, -1, -1)
    rightSide = range(wordIndex + 1, buffer.getLength())
    for side in (leftSide, rightSide):
        for depIndex in side:
            word = buffer.getWord(depIndex)
            govRefIndex = int(word.getFeat('GOVREF')) + depIndex
            if govRefIndex != wordIndex:
                continue
            # depIndex depends on the word in the reference: it must have the
            # same governor in the hypothesis.
            govHypIndex = int(word.getFeat('GOV')) + depIndex
            if govHypIndex != govRefIndex:
                return False
    return True
def oracle(c):
    """Return the movement ``(type, label)`` to apply in configuration ``c``.

    The decision uses the reference annotation (GOVREF / LABELREF) of the
    word on top of the stack (s0) and of the current buffer word (b0):

    * empty stack                        -> SHIFT
    * b0 is the reference governor of s0 -> LEFT with the label of s0
    * s0 is the reference governor of b0 -> RIGHT with the label of b0
    * more than one word on the stack, s0 has all its reference dependents
      and already has a governor         -> REDUCE
    * buffer not exhausted               -> SHIFT

    When none applies, a message is written to stderr and the process exits
    with status 1.
    """
    import sys  # local import: this module does not import sys at file level

    stack = c.getStack()
    buffer = c.getBuffer()
    if stack.isEmpty():
        return ('SHIFT', '')

    s0_index = stack.top()
    b0_index = buffer.getCurrentIndex()
    s0_word = buffer.getWord(s0_index)

    # GOVREF is an offset relative to the word's own index.
    s0_gov_index = int(s0_word.getFeat('GOVREF')) + s0_index
    if s0_gov_index == b0_index:
        return ('LEFT', s0_word.getFeat('LABELREF'))

    if b0_index < buffer.getLength():
        b0_word = buffer.getWord(b0_index)
        b0_gov_index = int(b0_word.getFeat('GOVREF')) + b0_index
        if b0_gov_index == s0_index:
            return ('RIGHT', b0_word.getFeat('LABELREF'))

    if (stack.getLength() > 1
            and check_all_dependents_of_word_in_ref_are_in_hyp(c, s0_index)
            and int(s0_word.getFeat('GOV')) != Word.invalidGov()):
        return ('REDUCE', '')

    if not buffer.endReached():
        return ('SHIFT', '')

    print("The machine is stucked", file=sys.stderr)
    sys.exit(1)
class Stack:
    """LIFO container backed by the list ``self.array`` (top is the last item)."""

    def __init__(self):
        self.array = []

    def isEmpty(self):
        """Return True when the stack holds no element."""
        return len(self.array) == 0

    def empty(self):
        """Drop every element."""
        self.array = []

    def push(self, elt):
        """Put ``elt`` on top of the stack."""
        self.array.append(elt)

    def pop(self):
        """Remove and return the top element.

        On an empty stack a message is printed and None is returned.
        """
        if not self.isEmpty():
            return self.array.pop()
        print("cannot pop an empty stack")

    def top(self):
        """Return the top element without removing it (None on an empty stack)."""
        return None if self.isEmpty() else self.array[len(self.array) - 1]

    def getLength(self):
        """Return the number of stacked elements."""
        return len(self.array)

    def affiche(self):
        """Print the elements from bottom to top, one per line."""
        print("---- bottom ----")
        for item in self.array:
            print(item)
        print("---- top ----")
class Word:
    """A word: a dictionary of named features plus its left and right daughters."""

    def __init__(self):
        self.featDic = {}
        self.leftDaughters = []
        self.rightDaughters = []

    def getFeat(self, featName):
        """Return the value of ``featName``, or None (with a warning) when absent."""
        if featName not in self.featDic:
            # The warning goes to stderr: several scripts emit their data on
            # standard output, where a warning would end up inside the data.
            import sys
            print('WARNING : feat', featName, 'does not exist', file=sys.stderr)
            return None
        return self.featDic[featName]

    def setFeat(self, featName, featValue):
        """Set ``featName`` to ``featValue``."""
        self.featDic[featName] = featValue

    def addLeftDaughter(self, index):
        """Record ``index`` as a left daughter."""
        self.leftDaughters.append(index)

    def addRightDaughter(self, index):
        """Record ``index`` as a right daughter."""
        self.rightDaughters.append(index)

    def affiche(self, mcd):
        """Print the KEEP columns of ``mcd`` separated by tabs, without a final newline."""
        first = True
        for columnNb in range(mcd.getNbCol()):
            if mcd.getColStatus(columnNb) == 'KEEP':
                if first:
                    first = False
                else:
                    print('\t', end='')
                print(self.getFeat(mcd.getColName(columnNb)), end='')

    @staticmethod
    def fakeWordConll():
        """Return a word without any feature."""
        return Word()

    @staticmethod
    def fakeWord(mcd):
        """Return a word whose ``mcd`` columns are all 'ROOT' and whose GOV is '0'."""
        w = Word()
        for elt in mcd.getArray():
            # elt is (column number, name, type, status); only the name is needed
            w.setFeat(elt[1], 'ROOT')
        w.setFeat('GOV', '0')
        return w

    @staticmethod
    def invalidGov():
        """Return the value marking a word that has no governor yet."""
        return 123456789

    @staticmethod
    def invalidLabel():
        """Return the value marking a word that has no label yet."""
        return ''
from Word import Word
class WordBuffer:
    """Sequence of Word objects with a current position, optionally fed from an mcf file."""

    def __init__(self, mcfFileName=None, mcd=None):
        """Create an empty buffer.

        When ``mcfFileName`` is given the file is opened for reading; words
        are then read with readNextWord, readNextSentence or readAllMcfFile,
        using ``mcd`` to name the columns.  Exits with status 1 if the file
        cannot be opened.
        """
        self.currentIndex = 0
        self.array = []
        self.mcd = mcd
        self.mcfFile = None
        if mcfFileName:
            try:
                self.mcfFile = open(mcfFileName, encoding='utf-8')
            except IOError:
                print(mcfFileName, " : ce fichier n'existe pas")
                exit(1)

    def empty(self):
        """Remove every word and reset the current position."""
        self.currentIndex = 0
        self.array = []

    def initConll(self):
        """Reset the buffer so that it only holds a feature-less fake word."""
        self.empty()
        self.addWord(Word.fakeWordConll())

    def init(self, mcd):
        """Reset the buffer so that it only holds the fake ROOT word built from ``mcd``."""
        self.empty()
        # fakeWord requires the mcd; it used to be called without argument,
        # which raised TypeError.
        self.addWord(Word.fakeWord(mcd))

    def addWord(self, w):
        """Append ``w`` at the end of the buffer."""
        self.array.append(w)

    def affiche(self, mcd):
        """Print every word on its own line, using the columns of ``mcd``."""
        for w in self.array:
            w.affiche(mcd)
            print('')

    def getLength(self):
        """Return the number of words in the buffer."""
        return len(self.array)

    def getCurrentIndex(self):
        """Return the current position."""
        return self.currentIndex

    def getWord(self, index):
        """Return the word at ``index``, or None when ``index`` is past the end.

        Negative indices are passed to the underlying list unchanged.
        """
        if index >= len(self.array):
            return None
        return self.array[index]

    def getCurrentWord(self):
        """Return the word at the current position (None at the end)."""
        return self.getWord(self.currentIndex)

    def _addWordFromLine(self, line):
        """Build a word from one tab-separated mcf line, append it and return it."""
        tokens = line.rstrip().split("\t")
        w = Word()
        for i, token in enumerate(tokens):
            w.setFeat(self.mcd.getColName(i), token)
        self.addWord(w)
        return w

    def readNextWord(self):
        """Read one line of the mcf file into a new word; return None at end of file."""
        line = self.mcfFile.readline()
        if line == "":
            return None
        return self._addWordFromLine(line)

    def readNextSentence(self):
        """Replace the content by the fake ROOT word followed by the next sentence.

        Words are read up to and including the first one whose EOS feature is
        '1'.  Returns True when such a word was found and False when the end
        of the file was reached first.
        """
        self.currentIndex = 0
        self.array = []
        self.addWord(Word.fakeWord(self.mcd))
        while True:
            w = self.readNextWord()
            if w is None:
                return False
            if w.getFeat('EOS') == '1':
                return True

    def endReached(self):
        """Return True when the current position is past the last word."""
        return self.getCurrentIndex() >= self.getLength()

    def readAllMcfFile(self):
        """Append every remaining line of the mcf file as a word, then close the file."""
        try:
            for ligne in self.mcfFile:
                self._addWordFromLine(ligne)
        finally:
            # close even when a line cannot be parsed
            self.mcfFile.close()
import sys
from WordBuffer import WordBuffer
from Word import Word
from Mcd import Mcd
# Convert a CoNLL-U file to mcf: one word per line on standard output, with
# the columns selected by the mcd file.
# Diagnostics go to stderr because standard output carries the converted data.
if len(sys.argv) < 3:
    print('usage:', sys.argv[0], 'conllFile mcdFile', file=sys.stderr)
    exit(1)

conlluFilename = sys.argv[1]
mcdFilename = sys.argv[2]

mcd = Mcd(mcdFilename)

try:
    conlluFile = open(conlluFilename, encoding='utf-8')
except IOError:
    print(conlluFilename, " : ce fichier n'existe pas", file=sys.stderr)
    exit(1)

tokens = []
wordBuffer = WordBuffer()
with conlluFile:
    for ligne in conlluFile:
        if not ligne.strip():
            # A blank line closes the sentence: flag the last word read.
            # The length test protects against a blank line before any word.
            if wordBuffer.getLength() > 0:
                wordBuffer.getWord(wordBuffer.getLength() - 1).setFeat('EOS', '1')
            continue
        if ligne[0] == '#':
            # comment line
            continue
        # 1 Je il PRON _ Number=Sing|Person=1|PronType=Prs 2 nsubj _ _
        tokens = ligne.rstrip().split("\t")
        # Skip multiword-token ranges (e.g. "1-2") and empty nodes (e.g. "1.1"):
        # neither is a syntactic word with an integer index.
        if '-' in tokens[0] or '.' in tokens[0]:
            continue
        w = Word()
        index = int(tokens[0])
        w.setFeat('INDEX', tokens[0])
        w.setFeat('FORM', tokens[1])
        w.setFeat('LEMMA', tokens[2])
        w.setFeat('POS', tokens[3])
        w.setFeat('X1', tokens[4])
        w.setFeat('MORPHO', tokens[5])
        # the governor is stored as an offset relative to the word itself
        w.setFeat('GOV', int(tokens[6]) - index)
        w.setFeat('LABEL', tokens[7])
        w.setFeat('X2', tokens[8])
        w.setFeat('X3', tokens[9])
        w.setFeat('EOS', '0')
        wordBuffer.addWord(w)

# A file that does not end with a blank line would otherwise leave its last
# sentence without an end-of-sentence flag.
if wordBuffer.getLength() > 0:
    wordBuffer.getWord(wordBuffer.getLength() - 1).setFeat('EOS', '1')

wordBuffer.affiche(mcd)
import sys
from Dicos import Dicos
from Mcd import Mcd
# Build the dictionaries of an mcf file and save them.
# Arguments: mcf file, mcd file, output dictionary file.
if len(sys.argv) <= 3:
    print('usage:', sys.argv[0], 'mcf_file mcd_file dico_file')
    exit(1)

mcfFileName, mcdFileName, dicoFileName = sys.argv[1:4]

mcd = Mcd(mcdFileName)

# One dictionary per column selected by the mcd, filled from the mcf file.
print('populating dicos from file ', mcfFileName)
dicos = Dicos(mcd)
dicos.populateFromMcfFile(mcfFileName, mcd, verbose=False)

print('saving dicos in file ', dicoFileName)
dicos.printToFile(dicoFileName)
import sys
from Mcd import Mcd
from WordBuffer import WordBuffer
from Word import Word
# Compare a hypothesis mcf file with a reference mcf file and print
# "<lang> <LAS> <UAS>" on standard output.
# Diagnostics go to stderr so that standard output only carries the scores.
if len(sys.argv) < 6:
    print('usage:', sys.argv[0], 'ref_mcf hyp_mcf ref_mcd hyp_mcd lang', file=sys.stderr)
    exit(1)

refFileName = sys.argv[1]
hypFileName = sys.argv[2]
refMcdFileName = sys.argv[3]
hypMcdFileName = sys.argv[4]
lang = sys.argv[5]

refMcd = Mcd(refMcdFileName)
hypMcd = Mcd(hypMcdFileName)

# Both descriptions must provide GOV and LABEL.  A missing column used to be
# reported and then ignored, after which every comparison was None == None
# and counted as correct.
for (checkedMcd, checkedName) in ((refMcd, refMcdFileName), (hypMcd, hypMcdFileName)):
    for colName in ('GOV', 'LABEL'):
        if checkedMcd.locateCol(colName) is None:
            print("cannot locate column", colName, "in mcd :", checkedName, file=sys.stderr)
            exit(1)
GovColIndex = hypMcd.locateCol('GOV')
LabelColIndex = hypMcd.locateCol('LABEL')

refWordBuffer = WordBuffer(refFileName, refMcd)
refWordBuffer.readAllMcfFile()

hypWordBuffer = WordBuffer(hypFileName, hypMcd)
hypWordBuffer.readAllMcfFile()

govCorrect = 0
labelCorrect = 0

# Scores are computed over the hypothesis words; the hypothesis may be
# shorter than the reference but not longer.
hypSize = hypWordBuffer.getLength()
if hypSize == 0:
    print("hypothesis file", hypFileName, "is empty", file=sys.stderr)
    exit(1)
if refWordBuffer.getLength() < hypSize:
    print("reference file", refFileName, "has fewer words than hypothesis file", hypFileName, file=sys.stderr)
    exit(1)

for index in range(hypSize):
    refWord = refWordBuffer.getWord(index)
    hypWord = hypWordBuffer.getWord(index)
    refGov = refWord.getFeat("GOV")
    hypGov = hypWord.getFeat("GOV")
    refLabel = refWord.getFeat("LABEL")
    hypLabel = hypWord.getFeat("LABEL")
    if refGov == hypGov:
        govCorrect += 1
        # A labelled attachment is correct only when the governor AND the
        # label match; the label alone used to be counted.
        if refLabel == hypLabel:
            labelCorrect += 1

LAS = labelCorrect / hypSize
UAS = govCorrect / hypSize
print(lang, LAS, UAS)
import sys
import Oracle
from Moves import Moves
from Mcd import Mcd
from FeatModel import FeatModel
from Dicos import Dicos
from Config import Config
from Word import Word
import numpy as np
def prepareWordBufferForTrain(buffer):
    """Move the annotation of every word of ``buffer`` to reference features.

    For each word, GOVREF receives the current GOV and LABELREF the current
    LABEL; GOV and LABEL are then reset to their "not yet set" values
    (``Word.invalidGov()`` as a string, and ``Word.invalidLabel()``).
    """
    for w in buffer.array:
        referenceGov = w.getFeat('GOV')
        referenceLabel = w.getFeat('LABEL')
        w.setFeat('GOVREF', referenceGov)
        w.setFeat('GOV', str(Word.invalidGov()))
        w.setFeat('LABELREF', referenceLabel)
        w.setFeat('LABEL', Word.invalidLabel())
def prepareData(mcd, mcfFile, featModel, moves, filename, wordsLimit) :
    """Write the training examples extracted from ``mcfFile`` to ``filename``.

    The output starts with the input size and the output size, one per line.
    Then, for every configuration visited while replaying the oracle on each
    sentence, one line holds the input vector and the next one the output
    vector, values separated by spaces.

    Sentences are read until the file is exhausted or the number of words
    read reaches ``wordsLimit``.

    Relies on the module-level names ``inputSize``, ``outputSize``, ``dicos``
    and ``verbose``.  Exits with status 1 if ``filename`` cannot be opened.
    """
    try:
        dataFile = open(filename, 'w', encoding='utf-8')
    except IOError:
        print('cannot open', filename)
        exit(1)

    # The file used to be left open; the with block closes (and flushes) it,
    # including when an exception interrupts the extraction.
    with dataFile:
        dataFile.write(str(inputSize))
        dataFile.write("\n")
        dataFile.write(str(outputSize))
        dataFile.write("\n")

        c = Config(mcfFile, mcd, dicos)
        numSent = 0
        numWords = 0
        while c.getBuffer().readNextSentence() and numWords < wordsLimit:
            numWords += c.getBuffer().getLength()
            numSent += 1
            prepareWordBufferForTrain(c.getBuffer())
            while True:
                mvt = Oracle.oracle(c)
                outputVector = moves.buildOutputVector(mvt)
                featVec = c.extractFeatVec(featModel)
                inputVector = featModel.buildInputVector(featVec, dicos)
                np.savetxt(dataFile, inputVector, fmt="%s", delimiter=' ', newline=' ')
                dataFile.write('\n')
                np.savetxt(dataFile, outputVector, fmt="%s", delimiter=' ', newline=' ')
                dataFile.write('\n')

                if verbose == True:
                    print("------------------------------------------")
                    c.affiche()
                    print('oracle says', mvt[0], mvt[1])
                    print(mvt, featVec)

                if not c.applyMvt(mvt):
                    print("cannot apply movement")
                    # The configuration did not change, so the oracle would
                    # return the same movement forever: give up on this sentence.
                    break
                if c.isFinal():
                    break
# Command line: mcf_file feat_model_file mcd_file dicos_file data_file words_limit
# All six arguments are read below, so argv needs seven entries; the previous
# test (< 5) let an incomplete command line through to an IndexError.
if len(sys.argv) < 7 :
    print('usage:', sys.argv[0], 'mcf_file feat_model_file mcd_file dicos_file data_file words_limit')
    exit(1)

mcfFileName = sys.argv[1]
featModelFileName = sys.argv[2]
mcdFileName = sys.argv[3]
dicosFileName = sys.argv[4]
dataFileName = sys.argv[5]
wordsLimit = int(sys.argv[6])
# read by prepareData as a module-level name
verbose = False

print('reading mcd from file :', mcdFileName)
mcd = Mcd(mcdFileName)

print('reading dicos from file :', dicosFileName)
dicos = Dicos(mcd = mcd, fileName = dicosFileName, verbose=False)

moves = Moves(dicos)

print('reading feature model from file :', featModelFileName)
featModel = FeatModel(featModelFileName, dicos)

# inputSize, outputSize and dicos are read by prepareData as module-level names
inputSize = featModel.getInputSize()
outputSize = moves.getNb()
print('input size = ', inputSize, 'outputSize =' , outputSize)

print('preparing training data')
prepareData(mcd, mcfFileName, featModel, moves, dataFileName, wordsLimit)
import sys
from WordBuffer import WordBuffer
from Word import Word
# Only the CoNLL-U file is read by this script; the usage text no longer
# mentions an mcd file.  It is printed on stderr because standard output
# carries the filtered sentences.
if len(sys.argv) < 2 :
    print('usage:', sys.argv[0], 'conllFile', file=sys.stderr)
    exit(1)
conlluFilename = sys.argv[1]
def isDepProj(wordBuffer, depIndex) :
govIndex = wordBuffer.getWord(depIndex).getFeat('GOV')
# print("dep Index = ", depIndex, "gov Index =", govIndex)
if depIndex < govIndex :
for currentIndex in range(depIndex + 1, govIndex):
currentGovIndex = wordBuffer.getWord(currentIndex).getFeat('GOV')
if currentGovIndex < depIndex or currentGovIndex > govIndex :
# print("word not projective :", currentIndex)
return False
if govIndex < depIndex :
for currentIndex in range(govIndex + 1, depIndex):
currentGovIndex = wordBuffer.getWord(currentIndex).getFeat('GOV')
if currentGovIndex < govIndex or currentGovIndex > depIndex :
# print("word not projective :", currentIndex)
return False
return True
def isSentProj(wordBuffer) :
    """Return True when every word from index 1 on passes isDepProj.

    Index 0 is not examined.  The scan stops at the first failing word.
    """
    return all(isDepProj(wordBuffer, wordIndex)
               for wordIndex in range(1, wordBuffer.getLength()))
def printConllSentence(wordBuffer):
    """Print the words of ``wordBuffer`` from index 1 on, then a blank line.

    Each word gives one line of ten tab-separated fields.
    """
    tabTerminated = ('INDEX', 'FORM', 'LEMMA', 'POS', 'X1', 'MORPHO', 'GOV', 'LABEL', 'X2')
    for position in range(1, wordBuffer.getLength()):
        w = wordBuffer.getWord(position)
        for featName in tabTerminated:
            print(w.getFeat(featName), end = '\t')
        # the last field ends the line
        print(w.getFeat('X3'))
    print()
# Copy to standard output the sentences of the CoNLL-U file that pass the
# projectivity test; the other sentences and the comment lines are dropped.
try:
    conlluFile = open(conlluFilename, encoding='utf-8')
except IOError:
    # stderr: standard output carries the filtered sentences
    print(conlluFilename, " : ce fichier n'existe pas", file=sys.stderr)
    exit(1)

tokens = []
wordBuffer = WordBuffer()
wordBuffer.initConll()
sentNb = 1

with conlluFile:
    for ligne in conlluFile:
        if not ligne.strip():
            # A blank line ends the current sentence.  The buffer always holds
            # the fake word at index 0, so a real sentence has length > 1.
            if wordBuffer.getLength() > 1:
                sentNb += 1
                if isSentProj(wordBuffer):
                    printConllSentence(wordBuffer)
            wordBuffer.initConll()
            continue
        if ligne[0] == '#':
            # comment line
            continue
        # 1 Je il PRON _ Number=Sing|Person=1|PronType=Prs 2 nsubj _ _
        tokens = ligne.rstrip().split("\t")
        # Skip multiword-token ranges (e.g. "1-2") and empty nodes (e.g. "1.1"):
        # neither is a syntactic word with an integer governor.
        if '-' in tokens[0] or '.' in tokens[0]:
            continue
        w = Word()
        w.setFeat('INDEX', tokens[0])
        w.setFeat('FORM', tokens[1])
        w.setFeat('LEMMA', tokens[2])
        w.setFeat('POS', tokens[3])
        w.setFeat('X1', tokens[4])
        w.setFeat('MORPHO', tokens[5])
        # kept as the governor's index: isDepProj compares it with word indices
        w.setFeat('GOV', int(tokens[6]))
        w.setFeat('LABEL', tokens[7])
        w.setFeat('X2', tokens[8])
        w.setFeat('X3', tokens[9])
        w.setFeat('EOS', '0')
        wordBuffer.addWord(w)

# A file that does not end with a blank line used to lose its last sentence.
if wordBuffer.getLength() > 1 and isSentProj(wordBuffer):
    printConllSentence(wordBuffer)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment