Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from readMCD import readMCD
import sys
################################################################################
class Config :
    """Table of the lines of one sentence, with a word cursor and a stack.

    Every cell of ``lines`` is a two-element list ``[gold, predicted]``:
    ``addLine`` stores the value it is given in slot 0 and an empty string in
    slot 1. Columns whose name is in ``predicted`` ("HEAD" and "DEPREL") are
    read from slot 1 by ``getAsFeature``; every other column is read from
    slot 0.

    Attributes:
        lines: list of lines, each a list of ``[gold, predicted]`` cells.
        col2index: mapping from column name to column position.
        index2col: mapping from column position to column name.
        predicted: names of the columns read from the predicted slot.
        wordIndex: index (into ``lines``) of the current word.
        stack: list of line indices pushed by ``addWordIndexToStack``.
    """

    def __init__(self, col2index, index2col) :
        self.lines = []
        self.col2index = col2index
        self.index2col = index2col
        self.predicted = {"HEAD", "DEPREL"}
        self.wordIndex = 0
        self.stack = []

    def addLine(self, cols) :
        """Append a line built from ``cols``; predicted slots start empty."""
        self.lines.append([[val, ""] for val in cols])

    def _checkAccess(self, lineIndex, colname) :
        """Terminate the program (status 1) on an invalid line index or column name."""
        if lineIndex not in range(len(self.lines)) :
            print("Line index %d is out of range (0,%d)"%(lineIndex, len(self.lines)), file=sys.stderr)
            sys.exit(1)
        if colname not in self.col2index :
            print("Unknown colname '%s'"%(colname), file=sys.stderr)
            sys.exit(1)

    def get(self, lineIndex, colname, predicted) :
        """Return the predicted (if ``predicted``) or gold value of a cell.

        Prints a message on stderr and exits with status 1 when ``lineIndex``
        or ``colname`` is invalid.
        """
        self._checkAccess(lineIndex, colname)
        slot = 1 if predicted else 0
        return self.lines[lineIndex][self.col2index[colname]][slot]

    def set(self, lineIndex, colname, value, predicted=True) :
        """Store ``value`` in the predicted (default) or gold slot of a cell.

        Prints a message on stderr and exits with status 1 when ``lineIndex``
        or ``colname`` is invalid.
        """
        self._checkAccess(lineIndex, colname)
        slot = 1 if predicted else 0
        self.lines[lineIndex][self.col2index[colname]][slot] = value

    def getAsFeature(self, lineIndex, colname) :
        """Return the predicted value for predicted columns, the gold value otherwise."""
        return self.get(lineIndex, colname, colname in self.predicted)

    def getGold(self, lineIndex, colname) :
        """Return the gold value of a cell."""
        return self.get(lineIndex, colname, False)

    def addWordIndexToStack(self) :
        """Push the current word index on the stack."""
        self.stack.append(self.wordIndex)

    def popStack(self) :
        """Remove the top of the stack (the removed value is not returned)."""
        self.stack.pop()

    def moveWordIndex(self, movement) :
        """Advance the word index by ``movement`` words, skipping multiword lines.

        ``movement`` is expected to be non-negative: the loop only ever moves
        forward. Stepping past the last line terminates the program, because
        ``isMultiword`` goes through the range check of ``get``.
        """
        done = 0
        # A multiword line is not a word of its own: step over it first.
        if self.isMultiword(self.wordIndex) :
            self.wordIndex += 1
        while done != movement :
            self.wordIndex += 1
            if self.isMultiword(self.wordIndex) :
                self.wordIndex += 1
            done += 1

    def isMultiword(self, index) :
        """Return True when the ID of line ``index`` contains a "-" (e.g. "1-2")."""
        return "-" in self.getAsFeature(index, "ID")

    def __len__(self) :
        """Return the number of lines."""
        return len(self.lines)

    def _formatLine(self, lineIndex) :
        """Return the printable column values of one line.

        Empty values become "_". A HEAD value other than "0" is taken to be a
        line index and is replaced by the ID of the line it points to.
        """
        values = []
        for colIndex in range(len(self.lines[lineIndex])) :
            colname = self.index2col[colIndex]
            value = str(self.getAsFeature(lineIndex, colname))
            if value == "" :
                value = "_"
            elif colname == "HEAD" and value != "0" :
                value = self.getAsFeature(int(value), "ID")
            values.append(value)
        return values

    def printForDebug(self, output) :
        """Write the stack and every line to ``output``, marking the current word with "=>"."""
        print("stack :",[self.getAsFeature(ind, "ID") for ind in self.stack], file=output)
        for lineIndex in range(len(self.lines)) :
            marker = "=>" if lineIndex == self.wordIndex else " "
            print(marker + "\t".join(self._formatLine(lineIndex)), file=output)
        print("", file=output)

    def print(self, output) :
        """Write the column header, every line and a final blank line to ``output``."""
        print("# global.columns = %s"%(" ".join(self.col2index.keys())), file=output)
        for lineIndex in range(len(self.lines)) :
            print("\t".join(self._formatLine(lineIndex)), file=output)
        # The blank separator line belongs to the same stream as the lines above,
        # otherwise it is lost whenever ``output`` is not stdout.
        print("", file=output)
################################################################################
################################################################################
def _resolveGoldHeads(config, id2index) :
    """Replace the gold HEAD IDs of ``config`` by the index of the line they designate.

    HEAD values "_" and "0" are left untouched. Nothing is done when the
    column layout of ``config`` has no HEAD column.
    """
    if "HEAD" not in config.col2index :
        return
    for index in range(len(config)) :
        head = config.getGold(index, "HEAD")
        if head in ("_", "0") :
            continue
        config.set(index, "HEAD", id2index[head], False)


def readConllu(filename) :
    """Read a CoNLL-U style file and return one Config per sentence.

    Sentences are separated by blank lines. A "# global.columns =" comment
    changes the column layout used for the sentences that follow; until one is
    seen, the default ten-column layout is used. Other lines starting with "#"
    are ignored, as are lines whose ID contains a "." Once a sentence is
    complete, its gold HEAD values are converted from IDs to line indices.

    Args:
        filename: path of the file to read.

    Returns:
        The list of non-empty Config objects, in file order.

    Raises:
        KeyError: if a HEAD value refers to an ID absent from its sentence.
    """
    configs = []
    defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    col2index, index2col = readMCD(defaultMCD)
    current = None  # Config of the sentence being read, None between sentences
    id2index = {}
    # CoNLL-U files are UTF-8; the context manager closes the file in every case.
    with open(filename, "r", encoding="utf-8") as conllu :
        for line in conllu :
            line = line.strip()
            if "# global.columns =" in line :
                mcd = line.split('=')[-1].strip()
                col2index, index2col = readMCD(mcd)
            if len(line) == 0 :
                # End of sentence. Leading or repeated blank lines find no open
                # sentence and are simply skipped.
                if current is not None :
                    _resolveGoldHeads(current, id2index)
                    current = None
                continue
            if line[0] == '#' :
                continue
            splited = line.split('\t')
            ID = splited[col2index["ID"]]
            if '.' in ID :
                continue
            if current is None :
                # Created lazily so that it uses the column layout in effect
                # when the first line of the sentence is read.
                current = Config(col2index, index2col)
                configs.append(current)
                id2index = {}
            id2index[ID] = len(current)
            current.addLine(splited)
    # The last sentence may not be followed by a blank line.
    if current is not None :
        _resolveGoldHeads(current, id2index)
    return configs
################################################################################