# Config: per-sentence table of gold/predicted column values, plus a CoNLL-U reader.
from readMCD import readMCD
import sys
################################################################################
class Config:
    """Sentence stored as a table of words (lines) and columns.

    Every cell is a two-element list ``[gold, predicted]``: the gold value
    comes from the input columns given to ``addLine`` and the predicted value
    starts out as the empty string.

    Attributes:
        lines: one entry per word; each entry is a list of ``[gold, predicted]``
            cells ordered by column position.
        childs: one list per line, created empty by ``addLine``. readConllu
            appends the line indexes of a word's dependents to it.
        col2index: mapping from column name to column position.
        index2col: mapping from column position to column name.
        predicted: names of the columns whose predicted value (rather than the
            gold one) is returned by ``getAsFeature``.
        wordIndex: index of the current line.
        stack: list of line indexes.
        comments: comment lines printed before the sentence by ``print``.
    """

    def __init__(self, col2index, index2col):
        """Create an empty configuration using the given column mappings."""
        self.lines = []
        self.childs = []
        self.col2index = col2index
        self.index2col = index2col
        self.predicted = {"HEAD", "DEPREL"}
        self.wordIndex = 0
        self.stack = []
        self.comments = []

    def addLine(self, cols):
        """Append a word whose gold values are ``cols``, in column order."""
        self.lines.append([[val, ""] for val in cols])
        self.childs.append([])

    def _checkAccess(self, lineIndex, colname):
        """Terminate the program (exit status 1) on an invalid line or column."""
        if lineIndex not in range(len(self.lines)):
            print("Line index %d is out of range (0,%d)"%(lineIndex, len(self.lines)), file=sys.stderr)
            # sys.exit is always available, unlike the builtin exit() which is
            # only injected by the site module.
            sys.exit(1)
        if colname not in self.col2index:
            print("Unknown colname '%s'"%(colname), file=sys.stderr)
            sys.exit(1)

    def get(self, lineIndex, colname, predicted):
        """Return the predicted (if ``predicted``) or gold value of a cell.

        Prints an error on stderr and exits with status 1 when ``lineIndex``
        is out of range or ``colname`` is unknown.
        """
        self._checkAccess(lineIndex, colname)
        index = 1 if predicted else 0
        return self.lines[lineIndex][self.col2index[colname]][index]

    def set(self, lineIndex, colname, value, predicted=True):
        """Store ``value`` in the predicted (default) or gold slot of a cell.

        Prints an error on stderr and exits with status 1 when ``lineIndex``
        is out of range or ``colname`` is unknown.
        """
        self._checkAccess(lineIndex, colname)
        index = 1 if predicted else 0
        self.lines[lineIndex][self.col2index[colname]][index] = value

    def getAsFeature(self, lineIndex, colname):
        """Return the predicted value for predicted columns, else the gold one."""
        return self.get(lineIndex, colname, colname in self.predicted)

    def getGold(self, lineIndex, colname):
        """Return the gold value of a cell."""
        return self.get(lineIndex, colname, False)

    def addWordIndexToStack(self):
        """Push the current word index on the stack."""
        self.stack.append(self.wordIndex)

    def popStack(self):
        """Remove the top element of the stack."""
        self.stack.pop()

    # Move wordIndex by a relative forward movement if possible. Ignore multiwords.
    # Don't go out of bounds, but don't fail either.
    # Return true if movement was completed.
    def moveWordIndex(self, movement):
        done = 0
        # Never start counting from a multiword line.
        if self.isMultiword(self.wordIndex):
            self.wordIndex += 1
        while done != movement:
            if self.wordIndex < len(self.lines) - 1:
                self.wordIndex += 1
            else:
                # Last line reached: wordIndex keeps the progress made so far.
                return False
            # Skipping a multiword line does not count as a step.
            if self.isMultiword(self.wordIndex):
                self.wordIndex += 1
            done += 1
        return True

    def isMultiword(self, index):
        """Return True when the ID of line ``index`` contains a '-'."""
        return "-" in self.getAsFeature(index, "ID")

    def __len__(self):
        """Return the number of lines."""
        return len(self.lines)

    def _formatLine(self, lineIndex):
        """Return the printable feature values of one line, in column order.

        Empty values become "_". A HEAD of "-1" is printed as "0".
        """
        toPrint = []
        for colIndex in range(len(self.lines[lineIndex])):
            colname = self.index2col[colIndex]
            value = str(self.getAsFeature(lineIndex, colname))
            if value == "":
                value = "_"
            elif colname == "HEAD" and value != "-1":
                # NOTE(review): the body of this branch was missing from the
                # source. It is reconstructed as the inverse of readConllu,
                # which stores heads as line indexes: the index is mapped back
                # to the ID of the line it designates. Confirm against history.
                value = self.getAsFeature(int(value), "ID")
            elif colname == "HEAD" and value == "-1":
                value = "0"
            toPrint.append(value)
        return toPrint

    def printForDebug(self, output):
        """Write the stack and every line to ``output``, marking the current word."""
        print("stack :",[self.getAsFeature(ind, "ID") for ind in self.stack], file=output)
        for lineIndex in range(len(self.lines)):
            print("%s"%("=>" if lineIndex == self.wordIndex else " "), end="", file=output)
            print("\t".join(self._formatLine(lineIndex)), file=output)

    def print(self, output, header=False):
        """Write the comments and the tab-separated lines to ``output``.

        When ``header`` is true, a ``# global.columns`` line listing the
        column names is written first.
        """
        if header:
            print("# global.columns = %s"%(" ".join(self.col2index.keys())), file=output)
        if len(self.comments) > 0:
            print("\n".join(self.comments), file=output)
        for index in range(len(self.lines)):
            print("\t".join(self._formatLine(index)), file=output)
################################################################################
################################################################################
def _finalizeSentence(config, id2index, comments):
    """Turn gold HEAD ids into line indexes, fill childs and attach comments."""
    for index in range(len(config)):
        head = config.getGold(index, "HEAD")
        # "_" (no head given) and "0" (root) do not designate a line.
        if head in ("_", "0"):
            continue
        headIndex = id2index[head]
        config.set(index, "HEAD", headIndex, False)
        config.childs[headIndex].append(index)
    config.comments = comments


def readConllu(filename):
    """Read a CoNLL-U style file and return one Config per sentence.

    Columns default to the ten standard CoNLL-U ones and are replaced when a
    ``# global.columns =`` line is met. Sentences are separated by blank
    lines; other lines starting with '#' are kept as the comments of the
    sentence that follows them. Lines whose ID contains a '.' are skipped.
    For every word, the gold HEAD is rewritten from an ID to the index of the
    corresponding line, except for "_" and "0" which are left untouched.

    Args:
        filename: path of the file to read.

    Returns:
        The list of non-empty Config objects, in file order. The list is
        empty when the file contains no word line.
    """
    configs = []
    defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    col2index, index2col = readMCD(defaultMCD)
    current = None  # sentence being read, created on its first word line
    id2index = {}
    comments = []

    # CoNLL-U files are UTF-8 encoded; the context manager closes the file.
    with open(filename, "r", encoding="utf-8") as conlluFile:
        for line in conlluFile:
            line = line.strip()
            if "# global.columns =" in line:
                mcd = line.split('=')[-1].strip()
                col2index, index2col = readMCD(mcd)
                # Config.print regenerates this header, so it is not kept as
                # an ordinary comment.
                continue
            if len(line) == 0:
                # End of sentence. Nothing to close on leading or repeated
                # blank lines; pending comments then carry over to the next
                # sentence.
                if current is not None:
                    _finalizeSentence(current, id2index, comments)
                    configs.append(current)
                    current = None
                    id2index = {}
                    comments = []
                continue
            if line[0] == '#':
                comments.append(line)
                continue

            splited = line.split('\t')
            ID = splited[col2index["ID"]]
            if '.' in ID:
                continue
            if current is None:
                current = Config(col2index, index2col)
            id2index[ID] = len(current)
            current.addLine(splited)

    # The last sentence may not be followed by a blank line.
    if current is not None:
        _finalizeSentence(current, id2index, comments)
        configs.append(current)

    return configs
################################################################################