diff --git a/Config.py b/Config.py index 880c39a18dd8146b635ee1381645afad44378e24..a675e4dd5c83605cf321e29b5b614a5f11b5b184 100644 --- a/Config.py +++ b/Config.py @@ -1,4 +1,5 @@ from readMCD import readMCD +import Dicts import sys ################################################################################ @@ -103,7 +104,7 @@ class Config : value = str(self.getAsFeature(lineIndex, self.index2col[colIndex])) if value == "" : value = "_" - elif self.index2col[colIndex] == "HEAD" and value != "-1": + elif self.index2col[colIndex] == "HEAD" and value != "-1" and value != Dicts.Dicts.erased : value = self.getAsFeature(int(value), "ID") elif self.index2col[colIndex] == "HEAD" and value == "-1": value = "0" @@ -126,9 +127,9 @@ class Config : value = str(self.getAsFeature(index, self.index2col[colIndex])) if value == "" or value == "_" : value = "_" - elif self.index2col[colIndex] == "HEAD" and value != "-1": + elif self.index2col[colIndex] == "HEAD" and value != "-1" and value != Dicts.Dicts.erased: value = self.getAsFeature(int(value), "ID") - elif self.index2col[colIndex] == "HEAD" and value == "-1": + elif self.index2col[colIndex] == "HEAD" and value == "-1" and value != Dicts.Dicts.erased: value = "0" toPrint.append(value) print("\t".join(toPrint), file=output) diff --git a/Dicts.py b/Dicts.py index 654a7eda1e8651245deffbfebe3cfcaaca45eb81..5acbf1953073cc73fdc558e0d91c7d0689188840 100644 --- a/Dicts.py +++ b/Dicts.py @@ -3,16 +3,18 @@ from readMCD import readMCD ################################################################################ class Dicts : + unkToken = "__unknown__" + nullToken = "__null__" + noStackToken = "__nostack__" + oobToken = "__oob__" + noDepLeft = "__nodepleft__" + noDepRight = "__nodepright__" + noGov = "__nogov__" + notSeen = "__notseen__" + erased = "__erased__" + def __init__(self) : self.dicts = {} - self.unkToken = "__unknown__" - self.nullToken = "__null__" - self.noStackToken = "__nostack__" - self.oobToken = "__oob__" - self.noDepLeft = "__nodepleft__" - self.noDepRight = "__nodepright__" - self.noGov = "__nogov__" - self.notSeen = "__notseen__" def addDict(self, name, d) : if name in self.dicts : @@ -39,7 +41,7 @@ class Dicts : targetColumns = list(col2index.keys()) else : targetColumns = list(colsSet) - self.dicts = {col : {self.unkToken : (0,minCount), self.nullToken : (1,minCount), self.noStackToken : (2,minCount), self.oobToken : (3,minCount), self.noDepLeft : (4,minCount), self.noDepRight : (5,minCount), self.noGov : (6,minCount), self.notSeen : (7,minCount)} for col in targetColumns} + self.dicts = {col : {self.unkToken : (0,minCount), self.nullToken : (1,minCount), self.noStackToken : (2,minCount), self.oobToken : (3,minCount), self.noDepLeft : (4,minCount), self.noDepRight : (5,minCount), self.noGov : (6,minCount), self.notSeen : (7,minCount), self.erased : (8,minCount)} for col in targetColumns} splited = line.split('\t') for col in targetColumns : diff --git a/Features.py b/Features.py index 518252ce8aac43bb8cd0ba7a3ae712d68b771df7..0662da411480722303bb0699b2da12f08eb17472 100644 --- a/Features.py +++ b/Features.py @@ -1,6 +1,7 @@ import torch import sys from Util import isEmpty +import Dicts ################################################################################ # Input : b=buffer s=stack .0=governor .x=rightChild#x+1 .-x=leftChild#-x-1 @@ -12,6 +13,7 @@ from Util import isEmpty # -4 : No dependent right # -5 : No gov # -6 : Not seen +# -7 : Erased # If incremental is true, only words that have been 'seen' (at wordIndex) can be used # others will be marked as not seen. def extractIndexes(config, featureFunction, incremental) : @@ -39,6 +41,8 @@ def extractIndexes(config, featureFunction, incremental) : head = config.getAsFeature(index, "HEAD") if isEmpty(head) : index = -5 + elif head == Dicts.Dicts.erased : + index = -7 else : index = int(head) continue @@ -62,7 +66,7 @@ def extractIndexes(config, featureFunction, incremental) : ################################################################################ # For each element of the feature function and for each column, concatenante the dict index def extractColsFeatures(dicts, config, featureFunction, cols, incremental) : - specialValues = {-1 : dicts.oobToken, -2 : dicts.noStackToken, -3 : dicts.noDepLeft, -4 : dicts.noDepRight, -5 : dicts.noGov, -6 : dicts.notSeen} + specialValues = {-1 : dicts.oobToken, -2 : dicts.noStackToken, -3 : dicts.noDepLeft, -4 : dicts.noDepRight, -5 : dicts.noGov, -6 : dicts.notSeen, -7 : dicts.erased} indexes = extractIndexes(config, featureFunction, incremental) totalSize = len(cols)*len(indexes) diff --git a/Transition.py b/Transition.py index 7af410cdd504b2d8a051394b10ce535ea05f2169..e4864a16705cd9d04ad055c639950f8daeceee37 100644 --- a/Transition.py +++ b/Transition.py @@ -1,5 +1,6 @@ import sys import Config +import Dicts from Util import isEmpty ################################################################################ @@ -53,31 +54,31 @@ class Transition : def appliable(self, config) : if self.name == "RIGHT" : for colName in config.predicted : - if colName not in ["HEAD","DEPREL"] and isEmpty(config.getAsFeature(config.wordIndex, colName)) : + if colName not in ["HEAD","DEPREL"] and (isEmpty(config.getAsFeature(config.wordIndex, colName)) or config.getAsFeature(config.wordIndex, colName) == Dicts.Dicts.erased) : return False - if not (len(config.stack) >= self.size and isEmpty(config.getAsFeature(config.wordIndex, "HEAD")) and not linkCauseCycle(config, config.stack[-self.size], config.wordIndex)) : + if not (len(config.stack) >= self.size and (isEmpty(config.getAsFeature(config.wordIndex, "HEAD")) or config.getAsFeature(config.wordIndex, "HEAD") == Dicts.Dicts.erased) and not linkCauseCycle(config, config.stack[-self.size], config.wordIndex)) : return False - orphansInStack = [s for s in config.stack[-self.size+1:] if isEmpty(config.getAsFeature(s, "HEAD"))] if self.size > 1 else [] + orphansInStack = [s for s in config.stack[-self.size+1:] if isEmpty(config.getAsFeature(s, "HEAD")) or config.getAsFeature(s, "HEAD") == Dicts.Dicts.erased] if self.size > 1 else [] return len(orphansInStack) == 0 if self.name == "LEFT" : for colName in config.predicted : - if colName not in ["HEAD","DEPREL"] and isEmpty(config.getAsFeature(config.wordIndex, colName)) : + if colName not in ["HEAD","DEPREL"] and (isEmpty(config.getAsFeature(config.wordIndex, colName)) or config.getAsFeature(config.wordIndex, colName) == Dicts.Dicts.erased) : return False - if not (len(config.stack) >= self.size and isEmpty(config.getAsFeature(config.stack[-self.size], "HEAD")) and not linkCauseCycle(config, config.wordIndex, config.stack[-self.size])) : + if not (len(config.stack) >= self.size and (isEmpty(config.getAsFeature(config.stack[-self.size], "HEAD"))or config.getAsFeature(config.stack[-self.size], "HEAD") == Dicts.Dicts.erased) and not linkCauseCycle(config, config.wordIndex, config.stack[-self.size])) : return False - orphansInStack = [s for s in config.stack[-self.size+1:] if isEmpty(config.getAsFeature(s, "HEAD"))] if self.size > 1 else [] + orphansInStack = [s for s in config.stack[-self.size+1:] if (isEmpty(config.getAsFeature(s, "HEAD")) or config.getAsFeature(s, "HEAD") == Dicts.Dicts.erased)] if self.size > 1 else [] return len(orphansInStack) == 0 if self.name == "SHIFT" : for colName in config.predicted : - if colName not in ["HEAD","DEPREL"] and isEmpty(config.getAsFeature(config.wordIndex, colName)) : + if colName not in ["HEAD","DEPREL"] and (isEmpty(config.getAsFeature(config.wordIndex, colName))or config.getAsFeature(config.wordIndex, colName) == Dicts.Dicts.erased) : return False return config.wordIndex < len(config.lines) - 1 if self.name == "REDUCE" : - return len(config.stack) > 0 and not isEmpty(config.getAsFeature(config.stack[-1], "HEAD")) + return len(config.stack) > 0 and not (isEmpty(config.getAsFeature(config.stack[-1], "HEAD")) or config.getAsFeature(config.stack[-1], "HEAD") == Dicts.Dicts.erased) if self.name == "EOS" : return config.wordIndex == len(config.lines) - 1 if self.name == "TAG" : - return isEmpty(config.getAsFeature(config.wordIndex, self.colName)) + return isEmpty(config.getAsFeature(config.wordIndex, self.colName)) or config.getAsFeature(config.wordIndex, self.colName) == Dicts.Dicts.erased if self.name == "NOBACK" : return True if "BACK" in self.name : @@ -149,7 +150,7 @@ def nbLinksBufferStack(config) : ################################################################################ # Return True if link between from and to would cause a cycle def linkCauseCycle(config, fromIndex, toIndex) : - while not isEmpty(config.getAsFeature(fromIndex, "HEAD")) : + while not isEmpty(config.getAsFeature(fromIndex, "HEAD")) and not config.getAsFeature(fromIndex, "HEAD") == Dicts.Dicts.erased : fromIndex = int(config.getAsFeature(fromIndex, "HEAD")) if fromIndex == toIndex : return True @@ -208,7 +209,7 @@ def applyBackRight(config, data, size) : config.stack.pop() while len(data) > 0 : config.stack.append(data.pop()) - config.set(config.wordIndex, "HEAD", "") + config.set(config.wordIndex, "HEAD", Dicts.Dicts.erased) config.predChilds[config.stack[-size]].pop() ################################################################################ @@ -217,7 +218,7 @@ def applyBackLeft(config, data, size) : config.stack.append(data.pop()) while len(data) > 0 : config.stack.append(data.pop()) - config.set(config.stack[-size], "HEAD", "") + config.set(config.stack[-size], "HEAD", Dicts.Dicts.erased) config.predChilds[config.wordIndex].pop() ################################################################################ @@ -233,7 +234,7 @@ def applyBackReduce(config, data) : ################################################################################ def applyBackTag(config, colName) : - config.set(config.wordIndex, colName, "") + config.set(config.wordIndex, colName, Dicts.Dicts.erased) ################################################################################ ################################################################################ @@ -273,9 +274,9 @@ def applyEOS(config) : if not config.hasCol("HEAD") or not config.isPredicted("HEAD") : return - rootCandidates = [index for index in config.stack if not config.isMultiword(index) and isEmpty(config.getAsFeature(index, "HEAD"))] + rootCandidates = [index for index in config.stack if not config.isMultiword(index) and (isEmpty(config.getAsFeature(index, "HEAD")) or config.getAsFeature(index, "HEAD") == Dicts.Dicts.erased)] if len(rootCandidates) == 0 : - rootCandidates = [index for index in range(len(config.lines)) if not config.isMultiword(index) and isEmpty(config.getAsFeature(index, "HEAD"))] + rootCandidates = [index for index in range(len(config.lines)) if not config.isMultiword(index) and (isEmpty(config.getAsFeature(index, "HEAD")) or config.getAsFeature(index, "HEAD") == Dicts.Dicts.erased)] if len(rootCandidates) == 0 : print("ERROR : no candidates for root", file=sys.stderr) @@ -287,7 +288,7 @@ def applyEOS(config) : config.set(rootIndex, "DEPREL", "root") for index in range(len(config.lines)) : - if config.isMultiword(index) or not isEmpty(config.getAsFeature(index, "HEAD")) : + if config.isMultiword(index) or not (isEmpty(config.getAsFeature(index, "HEAD")) or config.getAsFeature(index, "HEAD") == Dicts.Dicts.erased) : continue config.set(index, "HEAD", str(rootIndex)) config.predChilds[rootIndex].append(index)