# Sentence configuration container (Config) and CoNLL-U style reader (readConllu).
from readMCD import readMCD
import sys
################################################################################
class Config:
    """Tabular view of one sentence plus a word cursor and a stack.

    ``lines`` holds one entry per input line; each entry is a list of cells
    and each cell is a two-element list ``[gold, predicted]``.  Columns are
    addressed by name through ``col2index`` / ``index2col``.  Columns listed
    in ``predicted`` are read from the predicted slot by ``getAsFeature``;
    every other column is read from the gold slot.
    """

    def __init__(self, col2index, index2col):
        """Create an empty configuration.

        Args:
            col2index: mapping from column name to column position.
            index2col: mapping from column position to column name.
        """
        self.lines = []
        self.col2index = col2index
        self.index2col = index2col
        # Columns whose value is read from the predicted slot.
        self.predicted = {"HEAD", "DEPREL"}
        # Index (into ``lines``) of the line the cursor currently points at.
        self.wordIndex = 0
        # Line indices pushed by ``addWordIndexToStack``.
        self.stack = []
        # Comment lines printed before the sentence by ``print``.
        self.comments = []

    def addLine(self, cols):
        """Append a line whose gold values are ``cols`` and whose predicted values are empty."""
        self.lines.append([[val, ""] for val in cols])

    def _checkAccess(self, lineIndex, colname):
        """Report the problem on stderr and exit with status 1 if the cell does not exist."""
        if lineIndex not in range(len(self.lines)):
            print("Line index %d is out of range (0,%d)"%(lineIndex, len(self.lines)), file=sys.stderr)
            sys.exit(1)
        if colname not in self.col2index:
            print("Unknown colname '%s'"%(colname), file=sys.stderr)
            sys.exit(1)

    def get(self, lineIndex, colname, predicted):
        """Return the predicted (if ``predicted`` is true) or gold value of a cell.

        Exits the process with status 1 when ``lineIndex`` or ``colname`` is invalid.
        """
        self._checkAccess(lineIndex, colname)
        index = 1 if predicted else 0
        return self.lines[lineIndex][self.col2index[colname]][index]

    def set(self, lineIndex, colname, value, predicted=True):
        """Store ``value`` in the predicted (default) or gold slot of a cell.

        Exits the process with status 1 when ``lineIndex`` or ``colname`` is invalid.
        """
        self._checkAccess(lineIndex, colname)
        index = 1 if predicted else 0
        self.lines[lineIndex][self.col2index[colname]][index] = value

    def getAsFeature(self, lineIndex, colname):
        """Return the predicted value for columns in ``self.predicted``, the gold value otherwise."""
        return self.get(lineIndex, colname, colname in self.predicted)

    def getGold(self, lineIndex, colname):
        """Return the gold value of a cell."""
        return self.get(lineIndex, colname, False)

    def addWordIndexToStack(self):
        """Push the current word index onto the stack."""
        self.stack.append(self.wordIndex)

    def popStack(self):
        """Discard the top of the stack (raises IndexError when the stack is empty)."""
        self.stack.pop()

    def moveWordIndex(self, movement):
        """Advance the cursor by ``movement`` lines, not counting multiword lines.

        ``movement`` is expected to be non-negative; the cursor only moves
        forward.  Moving past the last line makes the underlying ``get``
        report an out-of-range index and exit.
        """
        done = 0
        # If the cursor sits on a multiword line, step onto the line after it first.
        if self.isMultiword(self.wordIndex):
            self.wordIndex += 1
        while done != movement:
            self.wordIndex += 1
            if self.isMultiword(self.wordIndex):
                self.wordIndex += 1
            done += 1

    def isMultiword(self, index):
        """Return True when the ID of line ``index`` contains a '-'."""
        return "-" in self.getAsFeature(index, "ID")

    def __len__(self):
        """Return the number of lines."""
        return len(self.lines)

    def _formatLine(self, lineIndex):
        """Return the printable cell values of one line, in column order.

        Empty values become "_".  A HEAD value of -1 is rendered as "0";
        any other HEAD value is treated as a line index and replaced by the
        ID of that line (readConllu stores HEAD as a line index, so it has to
        be mapped back to an ID on output).
        """
        cells = []
        for colIndex in range(len(self.lines[lineIndex])):
            colname = self.index2col[colIndex]
            value = str(self.getAsFeature(lineIndex, colname))
            if value == "":
                value = "_"
            elif colname == "HEAD" and value != "-1":
                value = self.getAsFeature(int(value), "ID")
            elif colname == "HEAD" and value == "-1":
                value = "0"
            cells.append(value)
        return cells

    def printForDebug(self, output):
        """Write the stack and every line to ``output``, marking the cursor line with "=>"."""
        print("stack :",[self.getAsFeature(ind, "ID") for ind in self.stack], file=output)
        for lineIndex in range(len(self.lines)):
            print("%s"%("=>" if lineIndex == self.wordIndex else " "), end="", file=output)
            print("\t".join(self._formatLine(lineIndex)), file=output)
        print("", file=output)

    def print(self, output, header=False):
        """Write the sentence to ``output`` as tab-separated lines followed by a blank line.

        Args:
            output: writable text stream receiving everything this method prints.
            header: when true, first write the "# global.columns" line.
        """
        if header:
            print("# global.columns = %s"%(" ".join(self.col2index.keys())), file=output)
        # Skip the comment block when empty: a bare newline here would look
        # like a sentence separator.
        if self.comments:
            print("\n".join(self.comments), file=output)
        for index in range(len(self.lines)):
            print("\t".join(self._formatLine(index)), file=output)
        print("", file=output)
################################################################################
################################################################################
def _finalizeSentence(config, id2index, comments):
    """Replace gold HEAD ids by line indices and attach the sentence comments."""
    for index in range(len(config)):
        head = config.getGold(index, "HEAD")
        # "_" (no head) and "0" (root) are kept verbatim.
        if head == "_" or head == "0":
            continue
        config.set(index, "HEAD", id2index[head], False)
    config.comments = comments


def readConllu(filename):
    """Read a CoNLL-U style file and return one Config per sentence.

    Columns default to the ten standard CoNLL-U columns; a
    "# global.columns = ..." line replaces them for the sentences that
    follow.  Other lines starting with '#' are kept as the comments of the
    sentence they precede.  Lines whose ID contains '.' are skipped.  In each
    returned Config the gold HEAD of every line (other than "_" and "0") is
    the index of the head's line instead of its ID.

    Args:
        filename: path of the file to read.

    Returns:
        The list of non-empty Config objects, in file order.

    Raises:
        KeyError: if a HEAD refers to an ID that is absent from its sentence.
    """
    configs = []
    defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    col2index, index2col = readMCD(defaultMCD)

    current = None   # Config being filled, or None between sentences
    id2index = {}    # ID string -> line index within the current sentence
    comments = []    # comment lines seen since the last finished sentence

    with open(filename, "r", encoding="utf-8") as stream:
        for line in stream:
            line = line.strip()
            if "# global.columns =" in line:
                mcd = line.split('=')[-1].strip()
                col2index, index2col = readMCD(mcd)
                continue
            if len(line) == 0:
                # A blank line ends the sentence; repeated or leading blank
                # lines have nothing to close.
                if current is not None:
                    _finalizeSentence(current, id2index, comments)
                    configs.append(current)
                    current = None
                    id2index = {}
                    comments = []
                continue
            if line[0] == '#':
                comments.append(line)
                continue

            splited = line.split('\t')
            ID = splited[col2index["ID"]]
            if '.' in ID:
                continue
            if current is None:
                current = Config(col2index, index2col)
            id2index[ID] = len(current)
            current.addLine(splited)

    # The last sentence may not be followed by a blank line.
    if current is not None:
        _finalizeSentence(current, id2index, comments)
        configs.append(current)

    return configs
################################################################################