# Experiment pipeline for a single language.
# Usage: make lang=<code>        (e.g. make lang=fr)
#
# Data flow:
#   CoNLL-U -> projective CoNLL-U -> MCF -> dictionaries -> CFF -> model
#   -> decoded MCF -> scores

# Every path below is derived from $(lang); refuse to run with empty names.
ifeq ($(strip $(lang)),)
$(error 'lang' is not set; run e.g. "make lang=fr")
endif

train_conll=../data/train_$(lang).conllu
train_proj_conll=./out/train_$(lang)_proj.conllu
train_mcf=./out/train_$(lang)_pgle.mcf
train_cff=./out/train_$(lang).cff
train_word_limit=40000

dev_conll=../data/dev_$(lang).conllu
dev_proj_conll=./out/dev_$(lang)_proj.conllu
dev_mcf=./out/dev_$(lang)_pgle.mcf
dev_cff=./out/dev_$(lang).cff
dev_word_limit=5000

test_conll=../data/test_$(lang).conllu
test_mcf=./out/test_$(lang)_pgle.mcf
test_mcf_hyp=./out/test_$(lang)_hyp.mcf
test_word_limit=700

feat_model=basic.fm

dicos=./out/$(lang)_train.dic
model=./out/$(lang).keras
results = ./out/$(lang).res

mcd_pgle=PGLE.mcd

# eval and clean never create a file named after themselves.
.PHONY: eval clean

# Most recipes write their target through "> target"; without this, a failed
# command would leave a truncated file that make considers up to date.
.DELETE_ON_ERROR:

# Default goal: score the decoded test set against the reference.
eval: $(test_mcf_hyp)
	python3 ../src/eval_mcf.py $(test_mcf) $(test_mcf_hyp) $(mcd_pgle) $(mcd_pgle) $(lang) > $(results)


$(test_mcf_hyp): $(test_mcf) $(model)
	python3 ../src/tbp_decode.py $(test_mcf) $(model) $(dicos) $(feat_model) $(mcd_pgle) $(test_word_limit) > $(test_mcf_hyp)

$(model): $(train_cff) $(dev_cff)
	python3 ../src/tbp_train.py $(train_cff) $(dev_cff) $(model)


$(train_cff): $(train_mcf) $(dicos)
	python3 ../src/mcf2cff.py $(train_mcf) $(feat_model) $(mcd_pgle) $(dicos) $(train_cff) $(train_word_limit)

$(dev_cff): $(dev_mcf) $(dicos)
	python3 ../src/mcf2cff.py $(dev_mcf) $(feat_model) $(mcd_pgle) $(dicos) $(dev_cff) $(dev_word_limit)


$(train_mcf): $(train_proj_conll)
	python3 ../src/conll2mcf.py $(train_proj_conll) $(mcd_pgle) > $(train_mcf)

$(dev_mcf): $(dev_proj_conll)
	python3 ../src/conll2mcf.py $(dev_proj_conll) $(mcd_pgle) > $(dev_mcf)

# The three rules below read the source treebank; listing it as a prerequisite
# makes the derived files rebuild when the data changes.
$(test_mcf): $(test_conll)
	python3 ../src/conll2mcf.py $(test_conll) $(mcd_pgle) > $(test_mcf)


$(train_proj_conll): $(train_conll)
	python3 ../src/remove_non_projective_sentences_from_conll.py $(train_conll) > $(train_proj_conll)

$(dev_proj_conll): $(dev_conll)
	python3 ../src/remove_non_projective_sentences_from_conll.py $(dev_conll) > $(dev_proj_conll)

$(dicos): $(train_mcf)
	python3 ../src/create_dicos.py $(train_mcf) $(mcd_pgle) $(dicos)

# rm -f keeps the original "ignore missing files" behaviour without error noise.
clean:
	rm -f $(train_proj_conll) $(train_mcf) $(train_cff)
	rm -f $(dev_proj_conll) $(dev_mcf) $(dev_cff)
	rm -f $(test_mcf) $(test_mcf_hyp)
	rm -f $(dicos)
	rm -rf $(model)
class Config:
    """State of the transition-based parser: a stack of word indices and a word buffer.

    The stack stores indices into the buffer; the buffer's current index
    designates the next word to process.  The GOV values written by the arc
    transitions are relative offsets (governor index minus dependent index),
    stored as strings.
    """

    def __init__(self, filename, mcd, dicos):
        """Open *filename* as the word buffer (columns described by *mcd*) with an empty stack.

        *dicos* is part of the signature callers use but is not used here.
        """
        self.wb = WordBuffer(filename, mcd)
        self.st = Stack()

    @staticmethod
    def _refuse(message):
        """Report an inapplicable transition on stderr and return False.

        stderr is used for every refusal so that diagnostics never end up in
        a redirected stdout.
        """
        sys.stderr.write(message + "\n")
        return False

    def isFinal(self):
        """Return True when only index 0 is left on the stack and the buffer is exhausted."""
        stack = self.getStack()
        buf = self.getBuffer()
        return (stack.getLength() == 1
                and stack.top() == 0
                and buf.getCurrentIndex() >= buf.getLength())

    def getStack(self):
        """Return the stack of word indices."""
        return self.st

    def getBuffer(self):
        """Return the word buffer."""
        return self.wb

    def fwd(self):
        """Advance the buffer by one word; return False if its end was already reached."""
        buf = self.getBuffer()
        if buf.endReached():
            return False
        buf.currentIndex += 1
        return True

    def shift(self):
        """Push the current buffer index on the stack and advance the buffer."""
        buf = self.getBuffer()
        if buf.endReached():
            return self._refuse("cannot shift : end of buffer reached")
        self.getStack().push(buf.currentIndex)
        self.fwd()
        return True

    def red(self):
        """Pop the stack top, provided that word already has a governor."""
        stack = self.getStack()
        if stack.isEmpty():
            return self._refuse("cannot reduce an empty stack !")

        topWord = self.getBuffer().getWord(stack.top())
        if int(topWord.getFeat('GOV')) == Word.invalidGov():
            return self._refuse("cannot reduce the stack if top element does not have a governor !")

        stack.pop()
        return True

    def right(self, label):
        """Attach the current buffer word to the stack top with *label*, then shift it.

        Returns False (leaving the configuration untouched) when the stack is
        empty or the buffer is exhausted.
        """
        stack = self.getStack()
        buf = self.getBuffer()
        if stack.isEmpty():
            return self._refuse("cannot make a right move, the stack is empty!")
        if buf.endReached():
            # Without this guard getCurrentWord() is None and setFeat would raise.
            return self._refuse("cannot make a right move, end of buffer reached!")

        govIndex = stack.top()
        depIndex = buf.currentIndex
        dependent = buf.getCurrentWord()
        dependent.setFeat('LABEL', label)
        dependent.setFeat('GOV', str(govIndex - depIndex))
        buf.getWord(govIndex).addRightDaughter(depIndex)
        stack.push(depIndex)
        return self.fwd()

    def left(self, label):
        """Attach the stack top to the current buffer word with *label*, then pop it.

        Returns False (leaving the configuration untouched) when the stack is
        empty or the buffer is exhausted.
        """
        stack = self.getStack()
        buf = self.getBuffer()
        if stack.isEmpty():
            return self._refuse("cannot make a left move, the stack is empty!")
        if buf.endReached():
            # Without this guard the dependent would be modified before the
            # missing governor raises, leaving a half-applied transition.
            return self._refuse("cannot make a left move, end of buffer reached!")

        govIndex = buf.currentIndex
        depIndex = stack.top()
        dependent = buf.getWord(depIndex)
        dependent.setFeat('LABEL', label)
        dependent.setFeat('GOV', str(govIndex - depIndex))
        buf.getCurrentWord().addLeftDaughter(depIndex)
        stack.pop()
        return True

    def applyMvt(self, mvt):
        """Apply the movement *mvt* = (type, label); return False if it is unknown or refused."""
        mvt_type = mvt[0]
        mvt_label = mvt[1]
        if mvt_type == 'RIGHT':
            return self.right(mvt_label)
        if mvt_type == 'LEFT':
            return self.left(mvt_label)
        if mvt_type == 'SHIFT':
            return self.shift()
        if mvt_type == 'REDUCE':
            return self.red()
        return False

    def getWordFeat(self, featTuple):
        """Return the value of the feature described by (container, index, tape).

        Container 'B' addresses the buffer relative to the current index; any
        other container addresses the stack, 0 being its top.  'NULL' is
        returned when the addressed position does not exist.
        """
        container = featTuple[0]
        index = featTuple[1]
        tape = featTuple[2]
        buf = self.getBuffer()

        if container == 'B':
            absoluteIndex = buf.getCurrentIndex() + index
            if not 0 <= absoluteIndex < buf.getLength():
                return 'NULL'
            w = buf.getWord(absoluteIndex)
        else:
            stack = self.getStack()
            # A negative position would index past the end of the stack array.
            if not 0 <= index < stack.getLength():
                return 'NULL'
            w = buf.getWord(stack.array[stack.getLength() - index - 1])
            if w is None:
                return 'NULL'

        return w.getFeat(tape)

    def affiche(self):
        """Print a window of the buffer around the current index, then the stack content."""
        buf = self.getBuffer()
        currentIndex = buf.getCurrentIndex()
        print('BUFFER = ', end = '')
        for i in range(currentIndex - 2, currentIndex + 2):
            if 0 <= i < len(buf.array):
                pos = buf.getWord(i).getFeat('POS')
                if i == currentIndex:
                    print('[[', i, ':', pos, ']] ', end = ' ')
                else:
                    print('[', i, ':', pos, '] ', end = ' ')

        print('\nSTACK = [', end = '')
        for elt in self.getStack().array:
            print(elt, ' ', end = '')
        print(']')

    def extractFeatVec(self, FeatModel):
        """Return the list of feature values, one per entry of the feature model."""
        return [self.getWordFeat(f) for f in FeatModel.getArray()]
def populateFromMcfFile(self, mcfFilename, mcd, verbose=False):
    """Add every symbol found in the SYM/KEEP columns of an MCF file to the dictionaries.

    mcfFilename: path of a tab-separated MCF file.
    mcd: column description; only its getNbCol, getColType, getColStatus
        and getColName accessors are used.
    verbose: accepted for interface compatibility; not used.

    Prints the size of each dictionary once the file has been read and exits
    the process with status 1 when the file cannot be opened.
    """
    try:
        mcfFile = open(mcfFilename, encoding='utf-8')
    except IOError:
        print('cannot open', mcfFilename)
        exit(1)
    nbCol = mcd.getNbCol()
    # The with-block closes the file even when a row raises.
    with mcfFile:
        for ligne in mcfFile:
            tokens = ligne.rstrip('\n\r').split("\t")
            # Fields beyond the columns declared in the mcd have no description
            # and are ignored instead of raising an IndexError.
            for i, token in enumerate(tokens[:nbCol]):
                if mcd.getColType(i) == 'SYM' and mcd.getColStatus(i) == 'KEEP':
                    self.add(mcd.getColName(i), token)
    for e in self.content:
        print('DICO', e, ':\t', self.content[e].getSize(), 'entries')
class FeatModel:
    """Feature model: an ordered list of (container, position, wordFeature) triples.

    The model is read from a text file with one feature per line, e.g.
    ``B -1 POS``.  Container is 'B' or 'S'; wordFeature is POS, LEMMA or
    FORM.  The input vector is the concatenation of one one-hot slice per
    feature, each slice being as long as the dictionary of its wordFeature.
    """

    def __init__(self, featModFilename, dicos):
        """Read the feature model from *featModFilename* and size the input vector with *dicos*.

        Exits the process with status 1 when the file cannot be opened or
        contains an unknown container or word feature.
        """
        # Per-instance state: a class-level list would be shared by every
        # FeatModel and grow each time a model is loaded.
        self.array = []
        self.nbFeat = 0
        self.inputVectorSize = None
        try:
            featModFile = open(featModFilename, encoding='utf-8')
        except IOError:
            print(featModFilename, " : ce fichier n'existe pas")
            exit(1)
        with featModFile:
            for ligne in featModFile:
                if not ligne.strip():
                    # Blank lines carry no feature and would break the unpacking below.
                    continue
                (container, position, wordFeature) = ligne.split()
                if container != "B" and container != "S":
                    print("error while reading featMod file : ", featModFilename, "container :", container, "undefined")
                    exit(1)
                if wordFeature not in ('POS', 'LEMMA', 'FORM'):
                    print("error while reading featMod file : ", featModFilename, "wordFeature :", wordFeature, "undefined")
                    exit(1)
                self.array.append((container, int(position), wordFeature))
                self.nbFeat += 1
        self.inputVectorSize = self.computeInputSize(dicos)

    def computeInputSize(self, dicos):
        """Return the sum of the dictionary sizes of every feature of the model."""
        inputVectorSize = 0
        for featTuple in self.getArray():
            feat = featTuple[2]
            dico = dicos.getDico(feat)
            if dico is None:
                print("cannot find", feat, "in dicos")
                exit(1)
            inputVectorSize += dico.getSize()
        return inputVectorSize

    def getInputSize(self):
        """Return the length of the input vector."""
        return self.inputVectorSize

    def getNbFeat(self):
        """Return the number of features of the model."""
        return self.nbFeat

    def getArray(self):
        """Return the list of (container, position, wordFeature) triples."""
        return self.array

    def getFeatContainer(self, featIndex):
        """Return the container ('B' or 'S') of feature *featIndex*."""
        return self.array[featIndex][0]

    def getFeatPosition(self, featIndex):
        """Return the integer position of feature *featIndex*."""
        return self.array[featIndex][1]

    def getFeatWordFeature(self, featIndex):
        """Return the word feature name of feature *featIndex*."""
        return self.array[featIndex][2]

    def buildInputVector(self, featVec, dicos):
        """Return the int32 one-hot encoding of *featVec* (one value per feature).

        A value that has no code in its dictionary leaves its slice all-zero
        instead of raising on a None position.
        """
        inputVector = np.zeros(self.inputVectorSize, dtype="int32")
        origin = 0
        for i in range(self.getNbFeat()):
            featureName = self.getFeatWordFeature(i)
            size = dicos.getDico(featureName).getSize()
            position = dicos.getCode(featureName, featVec[i])
            if position is not None:
                inputVector[origin + position] = 1
            origin += size
        return inputVector
class Moves:
    """Bijection between parser movements and integer codes.

    Codes 0, 1 and 2 are SHIFT, REDUCE and ROOT.  For a label of code c in
    the LABEL dictionary, RIGHT is coded 3 + 2*c and LEFT 3 + 2*c + 1.
    """
    nb = 0

    def __init__(self, dicos):
        """Bind to the LABEL dictionary of *dicos*; exit with status 1 if it is missing."""
        self.dicoLabels = dicos.getDico('LABEL')
        if self.dicoLabels is None:
            print("cannot find LABEL in dicos")
            exit(1)
        self.nb = 2 * self.dicoLabels.getSize() + 3

    def getNb(self):
        """Return the number of distinct movement codes."""
        return self.nb

    def mvtCode(self, mvt):
        """Return the integer code of the movement *mvt* = (type, label).

        Exits with status 1 when the label or the movement type is unknown.
        """
        mvtType = mvt[0]
        mvtLabel = mvt[1]
        if mvtType == 'SHIFT':
            return 0
        if mvtType == 'REDUCE':
            return 1
        if mvtType == 'ROOT':
            return 2
        labelCode = self.dicoLabels.getCode(mvtLabel)
        # Compare with None explicitly: 0 is a valid label code.
        if labelCode is None:
            print("cannot compute code of movement ", mvt, "label ", mvtLabel, "unknown")
            exit(1)
        if mvtType == 'RIGHT':
            return 3 + 2 * labelCode
        if mvtType == 'LEFT':
            return 3 + 2 * labelCode + 1
        # Returning None here would be used as an array index by
        # buildOutputVector, which sets every entry of the vector to 1.
        print("cannot compute code of movement ", mvt, "type ", mvtType, "unknown")
        exit(1)

    def mvtDecode(self, mvt_Code):
        """Return the (type, label) movement coded by *mvt_Code*."""
        if mvt_Code == 0:
            return ('SHIFT', 'NULL')
        if mvt_Code == 1:
            return ('REDUCE', 'NULL')
        if mvt_Code == 2:
            return ('ROOT', 'NULL')
        if mvt_Code % 2 == 0:
            # even codes above 2 are LEFT movements
            labelCode = (mvt_Code - 4) // 2
            return ('LEFT', self.dicoLabels.getSymbol(labelCode))
        labelCode = (mvt_Code - 3) // 2
        return ('RIGHT', self.dicoLabels.getSymbol(labelCode))

    def buildOutputVector(self, mvt):
        """Return the int32 one-hot vector of length getNb() for movement *mvt*."""
        outputVector = np.zeros(self.nb, dtype="int32")
        codeMvt = self.mvtCode(mvt)
        outputVector[codeMvt] = 1
        return outputVector
class Stack:
    """Last-in first-out container backed by a list; the top is the last element."""

    def __init__(self):
        self.array = []

    def isEmpty(self):
        """Return True when the stack holds no element."""
        return len(self.array) == 0

    def empty(self):
        """Discard every element by binding a fresh list."""
        self.array = []

    def push(self, elt):
        """Put *elt* on top of the stack."""
        self.array.append(elt)

    def pop(self):
        """Remove and return the top element; print a message and return None when empty."""
        if not self.isEmpty():
            return self.array.pop()
        print("cannot pop an empty stack")

    def top(self):
        """Return the top element without removing it, or None when empty."""
        return None if self.isEmpty() else self.array[-1]

    def getLength(self):
        """Return the number of stacked elements."""
        return len(self.array)

    def affiche(self):
        """Print the elements from bottom to top, one per line, between two markers."""
        print("---- bottom ----")
        for item in self.array:
            print(item)
        print("---- top ----")
class WordBuffer:
    """List of Word objects read from an MCF file, with a cursor (currentIndex)."""

    def __init__(self, mcfFileName=None, mcd=None):
        """Create an empty buffer; when *mcfFileName* is given, open it for reading.

        *mcd* supplies the column names used to name the features of the
        words read from the file.  Exits with status 1 when the file cannot
        be opened.
        """
        self.currentIndex = 0
        self.array = []
        self.mcd = mcd
        self.mcfFile = None
        if mcfFileName:
            try:
                self.mcfFile = open(mcfFileName, encoding='utf-8')
            except IOError:
                print(mcfFileName, " : ce fichier n'existe pas")
                exit(1)

    def empty(self):
        """Remove every word and reset the cursor."""
        self.currentIndex = 0
        self.array = []

    def initConll(self):
        """Reset the buffer so that it only holds the feature-less fake word."""
        self.empty()
        self.addWord(Word.fakeWordConll())

    def init(self, mcd):
        """Reset the buffer so that it only holds the fake word built from *mcd*."""
        self.empty()
        # Word.fakeWord requires the mcd; calling it without argument raises TypeError.
        self.addWord(Word.fakeWord(mcd))

    def addWord(self, w):
        """Append *w* at the end of the buffer."""
        self.array.append(w)

    def affiche(self, mcd):
        """Print every word, one per line, using the columns of *mcd*."""
        for w in self.array:
            w.affiche(mcd)
            print('')

    def getLength(self):
        """Return the number of words in the buffer."""
        return len(self.array)

    def getCurrentIndex(self):
        """Return the position of the cursor."""
        return self.currentIndex

    def getWord(self, index):
        """Return the word at *index*, or None when *index* is past the end.

        Negative indices are not rejected: they address the buffer from its
        end, as Python list indexing does.
        """
        if index >= len(self.array):
            return None
        return self.array[index]

    def getCurrentWord(self):
        """Return the word under the cursor, or None when the end is reached."""
        return self.getWord(self.currentIndex)

    def _wordFromLine(self, line):
        """Build a Word from one tab-separated MCF line, naming features after the mcd columns."""
        w = Word()
        for i, token in enumerate(line.rstrip().split("\t")):
            w.setFeat(self.mcd.getColName(i), token)
        return w

    def readNextWord(self):
        """Read one line of the MCF file, append the word to the buffer and return it.

        Returns None at end of file.
        """
        line = self.mcfFile.readline()
        if line == "":
            return None
        w = self._wordFromLine(line)
        self.addWord(w)
        return w

    def readNextSentence(self):
        """Replace the buffer content by the fake word followed by the next sentence.

        Returns True when a word whose EOS feature is '1' was read, False
        when the end of the file was reached first.
        """
        self.currentIndex = 0
        self.array = []
        self.addWord(Word.fakeWord(self.mcd))
        while True:
            w = self.readNextWord()
            if w is None:
                return False
            if w.getFeat('EOS') == '1':
                return True

    def endReached(self):
        """Return True when the cursor is at or past the end of the buffer."""
        return self.getCurrentIndex() >= self.getLength()

    def readAllMcfFile(self):
        """Append every remaining word of the MCF file to the buffer, then close the file."""
        try:
            for ligne in self.mcfFile:
                self.addWord(self._wordFromLine(ligne))
        finally:
            # Close the file even when a malformed line raises.
            self.mcfFile.close()
conlluFile: + if ligne[0] == '\n' : + wordBuffer.getWord(wordBuffer.currentIndex - 1).setFeat('EOS', '1') + next + elif ligne[0] == '#' : + #print("commentaire") + next + else : + ligne = ligne.rstrip() +# 1 Je il PRON _ Number=Sing|Person=1|PronType=Prs 2 nsubj _ _ + tokens = ligne.split("\t") + if '-' not in tokens[0]: + w = Word() + index = int(tokens[0]) + w.setFeat('INDEX', tokens[0]) + w.setFeat('FORM', tokens[1]) + w.setFeat('LEMMA', tokens[2]) + w.setFeat('POS', tokens[3]) + w.setFeat('X1', tokens[4]) + w.setFeat('MORPHO', tokens[5]) + w.setFeat('GOV', int(tokens[6]) - index) + w.setFeat('LABEL', tokens[7]) + w.setFeat('X2', tokens[8]) + w.setFeat('X3', tokens[9]) + w.setFeat('EOS', '0') + wordBuffer.addWord(w) + +conlluFile.close(); + +wordBuffer.affiche(mcd) + + + + + diff --git a/src/create_dicos.py b/src/create_dicos.py new file mode 100644 index 0000000000000000000000000000000000000000..260483829cfea0afe4626805bd773b45776c6e42 --- /dev/null +++ b/src/create_dicos.py @@ -0,0 +1,22 @@ +import sys +from Dicos import Dicos +from Mcd import Mcd + + +if len(sys.argv) < 4 : + print('usage:', sys.argv[0], 'mcf_file mcd_file dico_file') + exit(1) + +mcfFileName = sys.argv[1] +mcdFileName = sys.argv[2] +dicoFileName = sys.argv[3] + +mcd = Mcd(mcdFileName) + +print('populating dicos from file ', mcfFileName) +dicos = Dicos(mcd) +dicos.populateFromMcfFile(mcfFileName, mcd, verbose=False) +print('saving dicos in file ', dicoFileName) +dicos.printToFile(dicoFileName) + + diff --git a/src/eval_mcf.py b/src/eval_mcf.py new file mode 100644 index 0000000000000000000000000000000000000000..0cdf9cae1256664a40c4a8f2ac319f9737b7c9fd --- /dev/null +++ b/src/eval_mcf.py @@ -0,0 +1,67 @@ +import sys +from Mcd import Mcd +from WordBuffer import WordBuffer +from Word import Word + +if len(sys.argv) < 6 : + print('usage:', sys.argv[0], 'ref_mcf hyp_mcf ref_mcd hyp_mcd lang') + exit(1) + +refFileName = sys.argv[1] +hypFileName = sys.argv[2] +refMcdFileName = sys.argv[3] 
+hypMcdFileName = sys.argv[4] +lang = sys.argv[5] + +#print('reading mcd from file :', refMcdFileName) +refMcd = Mcd(refMcdFileName) + +#print('reading mcd from file :', hypMcdFileName) +hypMcd = Mcd(hypMcdFileName) + +GovColIndex = refMcd.locateCol('GOV') +if(GovColIndex == None): + print("cannot locate column GOV in mcd :", refMcdFileName) + +LabelColIndex = refMcd.locateCol('LABEL') +if(LabelColIndex == None): + print("cannot locate column LABEL in mcd :", refMcdFileName) + +GovColIndex = hypMcd.locateCol('GOV') +if(GovColIndex == None): + print("cannot locate column GOV in mcd :", hypMcdFileName) + +LabelColIndex = hypMcd.locateCol('LABEL') +if(LabelColIndex == None): + print("cannot locate column LABEL in mcd :", hypMcdFileName) + +refWordBuffer = WordBuffer(refFileName, refMcd) +refWordBuffer.readAllMcfFile() + +hypWordBuffer = WordBuffer(hypFileName, hypMcd) +hypWordBuffer.readAllMcfFile() + +govCorrect = 0 +labelCorrect = 0 + +hypSize = hypWordBuffer.getLength() +for index in range(hypSize): + refWord = refWordBuffer.getWord(index) + hypWord = hypWordBuffer.getWord(index) + refGov = refWord.getFeat("GOV") + hypGov = hypWord.getFeat("GOV") + refLabel = refWord.getFeat("LABEL") + hypLabel = hypWord.getFeat("LABEL") + if refGov == hypGov : + govCorrect += 1 + if refLabel == hypLabel : + labelCorrect += 1 + +LAS = labelCorrect / hypSize +UAS = govCorrect / hypSize + +print(lang, LAS, UAS) + + + +# print("REF GOV = ", refGov, "HYP GOV = ", hypGov, "REF LABEL = ", refLabel, "HYP LABEL = ", hypLabel) diff --git a/src/mcf2cff.py b/src/mcf2cff.py new file mode 100644 index 0000000000000000000000000000000000000000..96706abf55a267df90e98fa0a5f583c32f51f18c --- /dev/null +++ b/src/mcf2cff.py @@ -0,0 +1,102 @@ +import sys +import Oracle +from Moves import Moves +from Mcd import Mcd +from FeatModel import FeatModel +from Dicos import Dicos +from Config import Config +from Word import Word +import numpy as np + +def prepareWordBufferForTrain(buffer): + """Add to every 
def prepareData(mcd, mcfFile, featModel, moves, filename, wordsLimit) :
    """Write the training examples derived from an MCF file to *filename*.

    The output starts with the input size and the output size, one per line,
    followed, for every configuration visited while replaying the oracle, by
    one line holding the input vector and one line holding the output vector.

    mcd: column description of the MCF file.
    mcfFile: path of the MCF file.
    featModel: feature model used to extract and encode the features.
    moves: movement coder used to encode the oracle decision.
    filename: path of the file to write.
    wordsLimit: no further sentence is processed once this many words
        (as counted by the buffer length of each sentence) have been read.

    Relies on the module-level names inputSize, outputSize, dicos and
    verbose.  Exits with status 1 when *filename* cannot be opened.
    """
    try:
        dataFile = open(filename, 'w', encoding='utf-8')
    except IOError:
        print('cannot open', filename)
        exit(1)

    # The with-block guarantees that the data is flushed and the file closed.
    with dataFile:
        dataFile.write(str(inputSize))
        dataFile.write("\n")
        dataFile.write(str(outputSize))
        dataFile.write("\n")
        c = Config(mcfFile, mcd, dicos)
        numSent = 0
        numWords = 0
        while c.getBuffer().readNextSentence() and numWords < wordsLimit:
            numWords += c.getBuffer().getLength()
            numSent += 1
            prepareWordBufferForTrain(c.getBuffer())
            while True :
                mvt = Oracle.oracle(c)
                outputVector = moves.buildOutputVector(mvt)
                featVec = c.extractFeatVec(featModel)
                inputVector = featModel.buildInputVector(featVec, dicos)
                np.savetxt(dataFile, inputVector, fmt="%s", delimiter=' ', newline=' ')
                dataFile.write('\n')
                np.savetxt(dataFile, outputVector, fmt="%s", delimiter=' ', newline=' ')
                dataFile.write('\n')

                if verbose :
                    print("------------------------------------------")
                    c.affiche()
                    print('oracle says', mvt[0], mvt[1])
                    print(mvt, featVec)

                if not c.applyMvt(mvt):
                    # A refused movement leaves the configuration unchanged, so
                    # the oracle would answer the same thing forever: give up
                    # on this sentence.
                    print("cannot apply movement")
                    break
                if c.isFinal():
                    break
# Word features printed by printConllSentence, in output column order.
_CONLL_COLUMNS = ('INDEX', 'FORM', 'LEMMA', 'POS', 'X1',
                  'MORPHO', 'GOV', 'LABEL', 'X2', 'X3')


def isDepProj(wordBuffer, depIndex) :
    """Tell whether the dependency arriving on word ``depIndex`` is projective.

    The dependency links the word at ``depIndex`` to its governor (feature
    'GOV'). It is considered projective when every word located strictly
    between the two has its own governor inside the span they delimit
    (bounds included).

    Args:
        wordBuffer: object providing ``getWord(index)``; the words provide
            ``getFeat('GOV')`` returning an integer index.
        depIndex: index of the dependent word in ``wordBuffer``.

    Returns:
        True if the dependency is projective, False otherwise.
    """
    govIndex = wordBuffer.getWord(depIndex).getFeat('GOV')
    # The test is symmetric, so the arc direction does not matter: only the
    # left and right ends of the span do.
    left = min(depIndex, govIndex)
    right = max(depIndex, govIndex)
    for currentIndex in range(left + 1, right):
        currentGovIndex = wordBuffer.getWord(currentIndex).getFeat('GOV')
        if currentGovIndex < left or currentGovIndex > right :
            return False
    return True


def isSentProj(wordBuffer) :
    """Tell whether every dependency of the sentence is projective.

    Index 0 is skipped: the checks start at word 1 and stop at
    ``wordBuffer.getLength() - 1``.
    """
    return all(isDepProj(wordBuffer, currentIndex)
               for currentIndex in range(1, wordBuffer.getLength()))


def printConllSentence(wordBuffer):
    """Print the sentence on standard output, one word per line.

    Each line holds the ten features listed in ``_CONLL_COLUMNS`` separated
    by tabulations. Index 0 is skipped and an empty line is printed after
    the last word.
    """
    for currentIndex in range(1, wordBuffer.getLength()):
        w = wordBuffer.getWord(currentIndex)
        # str() is needed because some features (e.g. GOV) may be integers.
        print('\t'.join(str(w.getFeat(column)) for column in _CONLL_COLUMNS))
    print()
def prepareWordBufferForDecode(buffer):
    """Reset the GOV and LABEL features of every word of the buffer.

    GOV is set to the string form of ``Word.invalidGov()`` and LABEL to
    ``Word.invalidLabel()``. No reference copy of the previous values is
    kept: the values present beforehand are overwritten.

    Args:
        buffer: word buffer whose ``array`` attribute holds the words to
            reset; the words are modified in place.
    """
    for word in buffer.array:
        word.setFeat('GOV', str(Word.invalidGov()))
        word.setFeat('LABEL', Word.invalidLabel())
def readData(dataFilename) :
    """Read a data file made of a two-line header and alternating vectors.

    The first line holds the input size and the second the output size.
    The remaining lines alternate between an input vector and the matching
    output vector, each one being a whitespace-separated list of integers.
    Blank lines are ignored.

    Args:
        dataFilename: name of the file to read.

    Returns:
        A tuple ``(inputSize, outputSize, x, y)`` where ``x`` and ``y`` are
        NumPy arrays built from the input and output vectors respectively.

    The process exits with status 1 when the file cannot be opened.
    """
    allX = []
    allY = []
    try:
        dataFile = open(dataFilename)
    except IOError:
        print(dataFilename, " : ce fichier n'existe pas")
        sys.exit(1)

    # The context manager closes the file once everything has been read.
    with dataFile:
        inputSize = int(dataFile.readline())
        print("input size = ", inputSize)
        outputSize = int(dataFile.readline())
        print("output size = ", outputSize)

        # Blank lines are dropped before pairing so that they neither add an
        # empty vector nor shift the input/output alternation.
        vectorLines = (ligne for ligne in dataFile if ligne.strip())
        for rank, ligne in enumerate(vectorLines):
            vector = [int(value) for value in ligne.split()]
            if rank % 2 == 0:
                allX.append(vector)
            else:
                allY.append(vector)

    x_train = np.array(allX)
    y_train = np.array(allY)
    return (inputSize, outputSize, x_train, y_train)