diff --git a/.gitignore b/.gitignore index c42a158d8042b612560c1cc05459b15f5aaa6290..31c613330bb03f283588c47272b48a5f2a55c51f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ bin/* .idea total_test_gold.conllu total_test_predicted.conllu -venv/* \ No newline at end of file +venv diff --git a/Config.py b/Config.py index 1cbf4621663770d895a4554cea109ea9835fc750..9c68edfca46b91a1046202e56b1225f70b32168b 100644 --- a/Config.py +++ b/Config.py @@ -11,6 +11,7 @@ class Config : self.index2col = index2col self.predicted = set({"HEAD", "DEPREL"}) self.wordIndex = 0 + self.maxWordIndex = 0 #To keep a track of the max value, in case of backtrack self.stack = [] self.comments = [] self.history = [] @@ -64,10 +65,12 @@ class Config : if self.wordIndex+relMov in range(0, len((self.lines))) : self.wordIndex += relMov else : + self.maxWordIndex = max(self.maxWordIndex, self.wordIndex) return False if self.isMultiword(self.wordIndex) : self.wordIndex += relMov done += 1 + self.maxWordIndex = max(self.maxWordIndex, self.wordIndex) return True def isMultiword(self, index) : diff --git a/Decode.py b/Decode.py index 11060f22ecd1e643499fbbe4fcdfc728b5d0005e..720e158a4a33d349c5086273264dbc6a51a091d6 100644 --- a/Decode.py +++ b/Decode.py @@ -76,7 +76,7 @@ def decodeModel(ts, strat, config, network, dicts, debug) : ################################################################################ ################################################################################ -def decodeMode(debug, filename, type, transitionSet, strategy, modelDir = None, network=None, dicts=None, output=sys.stdout) : +def decodeMode(debug, filename, type, transitionSet, strategy, modelDir=None, network=None, dicts=None, output=sys.stdout) : sentences = Config.readConllu(filename) diff --git a/Dicts.py b/Dicts.py index 8c6129e510b6d56f77645b76562e8a57fed8d724..f6787ec1ab945ed1066c04f0790960e2b8f6bab2 100644 --- a/Dicts.py +++ b/Dicts.py @@ -12,6 +12,7 @@ class Dicts : self.noDepLeft = "__nodepleft__" self.noDepRight = "__nodepright__" self.noGov = "__nogov__" + self.notSeen = "__notseen__" def addDict(self, name, d) : if name in self.dicts : @@ -38,7 +39,7 @@ class Dicts : targetColumns = list(col2index.keys()) else : targetColumns = list(colsSet) - self.dicts = {col : {self.unkToken : (0,minCount), self.nullToken : (1,minCount), self.noStackToken : (2,minCount), self.oobToken : (3,minCount), self.noDepLeft : (4,minCount), self.noDepRight : (5,minCount), self.noGov : (6,minCount)} for col in targetColumns} + self.dicts = {col : {self.unkToken : (0,minCount), self.nullToken : (1,minCount), self.noStackToken : (2,minCount), self.oobToken : (3,minCount), self.noDepLeft : (4,minCount), self.noDepRight : (5,minCount), self.noGov : (6,minCount), self.notSeen : (7,minCount)} for col in targetColumns} splited = line.split('\t') for col in targetColumns : diff --git a/Features.py b/Features.py index 3f86e276880e3eef0561358d2f250d8c229c3cc5..b86e454696c07f0d5ee15ab5db8c9816e648bd70 100644 --- a/Features.py +++ b/Features.py @@ -11,7 +11,10 @@ from Util import isEmpty # -3 : No dependent left # -4 : No dependent right # -5 : No gov -def extractIndexes(config, featureFunction) : +# -6 : Not seen +# If incremental is true, only words that have been 'seen' (at wordIndex) can be used +# others will be marked as not seen. +def extractIndexes(config, featureFunction, incremental) : features = featureFunction.split() res = [] for feature in features : @@ -27,6 +30,8 @@ def extractIndexes(config, featureFunction) : index = -2 else : index = config.stack[-1-index] + if incremental and index > config.maxWordIndex : + index = -6 for depIndex in map(int,splited[2:]) : if index < 0 : break @@ -56,10 +61,10 @@ def extractIndexes(config, featureFunction) : ################################################################################ # For each element of the feature function and for each column, concatenante the dict index -def extractColsFeatures(dicts, config, featureFunction, cols) : - specialValues = {-1 : dicts.oobToken, -2 : dicts.noStackToken, -3 : dicts.noDepLeft, -4 : dicts.noDepRight, -5 : dicts.noGov} +def extractColsFeatures(dicts, config, featureFunction, cols, incremental) : + specialValues = {-1 : dicts.oobToken, -2 : dicts.noStackToken, -3 : dicts.noDepLeft, -4 : dicts.noDepRight, -5 : dicts.noGov, -6 : dicts.notSeen} - indexes = extractIndexes(config, featureFunction) + indexes = extractIndexes(config, featureFunction, incremental) totalSize = len(cols)*len(indexes) result = torch.zeros(totalSize, dtype=torch.int) diff --git a/Networks.py b/Networks.py index 80cc90afe55b990f3bf3f2739191fa5f33f1a91b..3f4518f16cfd6b0eee17c3d7baadd873afe0b411 100644 --- a/Networks.py +++ b/Networks.py @@ -5,10 +5,11 @@ import Features ################################################################################ class BaseNet(nn.Module): - def __init__(self, dicts, outputSize) : + def __init__(self, dicts, outputSize, incremental) : super().__init__() self.dummyParam = nn.Parameter(torch.empty(0), requires_grad=False) + self.incremental = incremental self.featureFunction = "b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.0.1 s.1.0 s.1.-1 s.1.1 s.2.0 s.2.-1 s.2.1" self.historyNb = 5 self.columns = ["UPOS", "FORM"] @@ -47,7 +48,7 @@ class BaseNet(nn.Module): m.bias.data.fill_(0.01) def extractFeatures(self, dicts, config) : - colsValues = Features.extractColsFeatures(dicts, config, self.featureFunction, self.columns) + colsValues = Features.extractColsFeatures(dicts, config, self.featureFunction, self.columns, self.incremental) historyValues = Features.extractHistoryFeatures(dicts, config, self.historyNb) return torch.cat([colsValues, historyValues]) diff --git a/Train.py b/Train.py index a03d3079199ba3376f0a11b06f8a1e881a6f6698..2e7d280044a79e62ef0c21ab035c44bef30443cd 100644 --- a/Train.py +++ b/Train.py @@ -16,15 +16,15 @@ import Config from conll18_ud_eval import load_conllu, evaluate ################################################################################ -def trainMode(debug, filename, type, transitionSet, strategy, modelDir, nbIter, batchSize, devFile, bootstrapInterval, silent=False) : +def trainMode(debug, filename, type, transitionSet, strategy, modelDir, nbIter, batchSize, devFile, bootstrapInterval, incremental, silent=False) : sentences = Config.readConllu(filename) if type == "oracle" : - trainModelOracle(debug, modelDir, filename, nbIter, batchSize, devFile, transitionSet, strategy, sentences, bootstrapInterval, silent) + trainModelOracle(debug, modelDir, filename, nbIter, batchSize, devFile, transitionSet, strategy, sentences, bootstrapInterval, incremental, silent) return if type == "rl": - trainModelRl(debug, modelDir, filename, nbIter, batchSize, devFile, transitionSet, strategy, sentences, silent) + trainModelRl(debug, modelDir, filename, nbIter, batchSize, devFile, transitionSet, strategy, sentences, incremental, silent) return print("ERROR : unknown type '%s'"%type, file=sys.stderr) @@ -70,7 +70,7 @@ def extractExamples(debug, ts, strat, config, dicts, network, dynamic) : ################################################################################ ################################################################################ -def evalModelAndSave(debug, model, ts, strat, dicts, modelDir, devFile, bestLoss, totalLoss, bestScore, epoch, nbIter) : +def evalModelAndSave(debug, model, ts, strat, dicts, modelDir, devFile, bestLoss, totalLoss, bestScore, epoch, nbIter, incremental) : devScore = "" saved = True if bestLoss is None else totalLoss < bestLoss bestLoss = totalLoss if bestLoss is None else min(bestLoss, totalLoss) @@ -92,12 +92,12 @@ def evalModelAndSave(debug, model, ts, strat, dicts, modelDir, devFile, bestLoss ################################################################################ ################################################################################ -def trainModelOracle(debug, modelDir, filename, nbEpochs, batchSize, devFile, transitionSet, strategy, sentencesOriginal, bootstrapInterval, silent=False) : +def trainModelOracle(debug, modelDir, filename, nbEpochs, batchSize, devFile, transitionSet, strategy, sentencesOriginal, bootstrapInterval, incremental, silent=False) : dicts = Dicts() dicts.readConllu(filename, ["FORM","UPOS"], 2) dicts.addDict("HISTORY", {**{t.name : (transitionSet.index(t),0) for t in transitionSet}, **{dicts.nullToken : (len(transitionSet),0)}}) dicts.save(modelDir+"/dicts.json") - network = Networks.BaseNet(dicts, len(transitionSet)).to(getDevice()) + network = Networks.BaseNet(dicts, len(transitionSet), incremental).to(getDevice()) examples = [] sentences = copy.deepcopy(sentencesOriginal) print("%s : Starting to extract examples..."%(timeStamp()), file=sys.stderr) @@ -143,11 +143,11 @@ def trainModelOracle(debug, modelDir, filename, nbEpochs, batchSize, devFile, tr optimizer.step() totalLoss += float(loss) - bestLoss, bestScore = evalModelAndSave(debug, network, transitionSet, strategy, dicts, modelDir, devFile, bestLoss, totalLoss, bestScore, epoch, nbEpochs) + bestLoss, bestScore = evalModelAndSave(debug, network, transitionSet, strategy, dicts, modelDir, devFile, bestLoss, totalLoss, bestScore, epoch, nbEpochs, incremental) ################################################################################ ################################################################################ -def trainModelRl(debug, modelDir, filename, nbIter, batchSize, devFile, transitionSet, strategy, sentencesOriginal, silent=False) : +def trainModelRl(debug, modelDir, filename, nbIter, batchSize, devFile, transitionSet, strategy, sentencesOriginal, incremental, silent=False) : memory = None dicts = Dicts() @@ -155,8 +155,8 @@ def trainModelRl(debug, modelDir, filename, nbIter, batchSize, devFile, transiti dicts.addDict("HISTORY", {**{t.name : (transitionSet.index(t),0) for t in transitionSet}, **{dicts.nullToken : (len(transitionSet),0)}}) dicts.save(modelDir + "/dicts.json") - policy_net = Networks.BaseNet(dicts, len(transitionSet)).to(getDevice()) - target_net = Networks.BaseNet(dicts, len(transitionSet)).to(getDevice()) + policy_net = Networks.BaseNet(dicts, len(transitionSet), incremental).to(getDevice()) + target_net = Networks.BaseNet(dicts, len(transitionSet), incremental).to(getDevice()) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() policy_net.train() @@ -226,6 +226,6 @@ def trainModelRl(debug, modelDir, filename, nbIter, batchSize, devFile, transiti if i >= nbExByEpoch : break sentIndex += 1 - bestLoss, bestScore = evalModelAndSave(debug, policy_net, transitionSet, strategy, dicts, modelDir, devFile, bestLoss, totalLoss, bestScore, epoch, nbIter) + bestLoss, bestScore = evalModelAndSave(debug, policy_net, transitionSet, strategy, dicts, modelDir, devFile, bestLoss, totalLoss, bestScore, epoch, nbIter, incremental) ################################################################################ diff --git a/Transition.py b/Transition.py index c82ca1c929756309aa96f13f5cc844308a60a052..582a425f6f3999b9c74f650c42bb53a7e7a0ccb6 100644 --- a/Transition.py +++ b/Transition.py @@ -4,7 +4,6 @@ from Util import isEmpty ################################################################################ class Transition : - available = lambda self,x: x in {"RIGHT", "LEFT", "SHIFT", "REDUCE", "EOS"} or ("BACK" in x and len(x.split()) == 2) def __init__(self, name) : if not self.available(name) : @@ -14,10 +13,11 @@ class Transition : def __lt__(self, other) : return self.name < other.name + def available(self, x) : + return x in {"RIGHT", "LEFT", "SHIFT", "REDUCE", "EOS"} or ("BACK" in x and len(x.split()) == 2) + def apply(self, config, strategy) : data = None - - config.historyHistory.add(str([t[0].name for t in config.historyPop])) if self.name == "RIGHT" : applyRight(config) @@ -30,6 +30,7 @@ class Transition : elif self.name == "EOS" : applyEOS(config) elif "BACK" in self.name : + config.historyHistory.add(str([t[0].name for t in config.historyPop])) size = int(self.name.split()[-1]) applyBack(config, strategy, size) else : diff --git a/main.py b/main.py index bcceaf374f1de756a876ba989f3942e885823b27..68e11688889f915f7bc195745a51bf0b24078d97 100755 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ import os import argparse import random import torch +import json import Util import Train @@ -32,6 +33,8 @@ if __name__ == "__main__" : help="If not none, extract examples in bootstrap mode (oracle train only).") parser.add_argument("--dev", default=None, help="Name of the CoNLL-U file of the dev corpus.") + parser.add_argument("--incr", "-i", default=False, action="store_true", + help="If true, the neural network will be 'incremenal' i.e. will not see right context words if they have never been the word index.") parser.add_argument("--debug", "-d", default=False, action="store_true", help="Print debug infos on stderr.") parser.add_argument("--silent", "-s", default=False, action="store_true", @@ -59,8 +62,15 @@ if __name__ == "__main__" : print("Transition Set :", [trans.name for trans in transitionSet]) if args.mode == "train" : - Train.trainMode(args.debug, args.corpus, args.type, transitionSet, strategy, args.model, int(args.iter), int(args.batchSize), args.dev, args.bootstrap, args.silent) + json.dump([t.name for t in transitionSet], open(args.model+"/transitions.json", "w")) + json.dump(strategy, open(args.model+"/strategy.json", "w")) + print("Transition Set :", [trans.name for trans in transitionSet], file=sys.stderr) + Train.trainMode(args.debug, args.corpus, args.type, transitionSet, strategy, args.model, int(args.iter), int(args.batchSize), args.dev, args.bootstrap, args.incr, args.silent) elif args.mode == "decode" : + transNames = json.load(open(args.model+"/transitions.json", "r")) + transitionSet = [Transition(elem) for elem in transNames] + strategy = json.load(open(args.model+"/strategy.json", "r")) + print("Transition Set :", [trans.name for trans in transitionSet], file=sys.stderr) Decode.decodeMode(args.debug, args.corpus, args.type, transitionSet, strategy, args.model) else : print("ERROR : unknown mode '%s'"%args.mode, file=sys.stderr)