import sys

from readMCD import readMCD

################################################################################
class Config:
    """One sentence read from a CoNLL-U-like file, plus word-index/stack state.

    Every cell of ``lines`` is a two-element list ``[gold, predicted]``:
    ``addLine`` fills the gold slot with the value read from the file and the
    predicted slot with the empty string.

    Attributes set by ``__init__``:
      lines      -- list of rows, each row a list of ``[gold, predicted]`` cells.
      col2index  -- mapping from column name to column position.
      index2col  -- inverse lookup, indexed by column position.
      predicted  -- names of the columns that ``getAsFeature`` reads from the
                    predicted slot (initially HEAD and DEPREL).
      wordIndex  -- index (into ``lines``) of the current word.
      stack      -- list of line indexes pushed by ``addWordIndexToStack``.
      comments   -- comment lines written before the sentence by ``print``.
    """

    def __init__(self, col2index, index2col):
        self.lines = []
        self.col2index = col2index
        self.index2col = index2col
        self.predicted = {"HEAD", "DEPREL"}
        self.wordIndex = 0
        self.stack = []
        self.comments = []

    def addLine(self, cols):
        """Append a row; each value of `cols` becomes the gold slot of a cell."""
        self.lines.append([[val, ""] for val in cols])

    def _checkAccess(self, lineIndex, colname):
        """Abort the program (exit status 1) on an invalid line index or column name."""
        if lineIndex not in range(len(self.lines)):
            print("Line index %d is out of range (0,%d)"%(lineIndex, len(self.lines)), file=sys.stderr)
            sys.exit(1)
        if colname not in self.col2index:
            print("Unknown colname '%s'"%(colname), file=sys.stderr)
            sys.exit(1)

    def get(self, lineIndex, colname, predicted):
        """Return the predicted (if `predicted` is truthy) or gold value of a cell.

        Prints a message on stderr and exits with status 1 when `lineIndex` is
        out of range or `colname` is unknown.
        """
        self._checkAccess(lineIndex, colname)
        slot = 1 if predicted else 0
        return self.lines[lineIndex][self.col2index[colname]][slot]

    def set(self, lineIndex, colname, value, predicted=True):
        """Store `value` in the predicted (default) or gold slot of a cell.

        Prints a message on stderr and exits with status 1 when `lineIndex` is
        out of range or `colname` is unknown.
        """
        self._checkAccess(lineIndex, colname)
        slot = 1 if predicted else 0
        self.lines[lineIndex][self.col2index[colname]][slot] = value

    def getAsFeature(self, lineIndex, colname):
        """Return the predicted value for columns in `self.predicted`, else the gold one."""
        return self.get(lineIndex, colname, colname in self.predicted)

    def getGold(self, lineIndex, colname):
        """Return the gold value of a cell."""
        return self.get(lineIndex, colname, False)

    def addWordIndexToStack(self):
        """Push the current word index on the stack."""
        self.stack.append(self.wordIndex)

    def popStack(self):
        """Discard the top of the stack (raises IndexError if the stack is empty)."""
        self.stack.pop()

    def moveWordIndex(self, movement):
        """Advance `wordIndex` by `movement` words, skipping multiword lines.

        NOTE: only non-negative `movement` is supported. With a negative value
        the loop condition is never met and the index keeps growing until
        `get` aborts the program. Likewise, stepping past the last line aborts,
        because the new index is immediately checked with `isMultiword`.
        """
        done = 0
        if self.isMultiword(self.wordIndex):
            self.wordIndex += 1
        while done != movement:
            self.wordIndex += 1
            if self.isMultiword(self.wordIndex):
                self.wordIndex += 1
            done += 1

    def isMultiword(self, index):
        """Return True when the ID of line `index` contains '-' (an ID range)."""
        return "-" in self.getAsFeature(index, "ID")

    def __len__(self):
        return len(self.lines)

    def _formatLine(self, lineIndex):
        """Return line `lineIndex` as a tab-separated string of feature values.

        Empty values are rendered as "_". A HEAD of "-1" is rendered as "0";
        any other HEAD is treated as a line index and replaced by that line's ID.
        """
        cells = []
        for colIndex in range(len(self.lines[lineIndex])):
            colname = self.index2col[colIndex]
            value = str(self.getAsFeature(lineIndex, colname))
            if value == "":
                value = "_"
            elif colname == "HEAD":
                if value == "-1":
                    value = "0"
                else:
                    value = self.getAsFeature(int(value), "ID")
            cells.append(value)
        return "\t".join(cells)

    def printForDebug(self, output):
        """Write the stack and every line to `output`, marking the current word with "=>"."""
        print("stack :",[self.getAsFeature(ind, "ID") for ind in self.stack], file=output)
        for lineIndex in range(len(self.lines)):
            print("%s"%("=>" if lineIndex == self.wordIndex else " "), end="", file=output)
            print(self._formatLine(lineIndex), file=output)
        print("", file=output)

    def print(self, output, header=False):
        """Write the sentence to `output`: optional header, comments, lines, blank line.

        Everything goes to `output`, including the comments and the blank
        line that terminates the sentence. Nothing is written for the comment
        block when there are no comments, so no stray blank line precedes the
        first token line.
        """
        if header:
            print("# global.columns = %s"%(" ".join(self.col2index.keys())), file=output)
        if self.comments:
            print("\n".join(self.comments), file=output)
        for lineIndex in range(len(self.lines)):
            print(self._formatLine(lineIndex), file=output)
        print("", file=output)
################################################################################

################################################################################
def _finalizeSentence(config, id2index, comments):
    """Rewrite gold HEAD values of `config` as line indexes and attach `comments`.

    HEAD values "_" and "0" are left untouched; any other value is looked up in
    `id2index` (ID string -> line index).
    """
    for index in range(len(config)):
        head = config.getGold(index, "HEAD")
        if head == "_" or head == "0":
            continue
        config.set(index, "HEAD", id2index[head], False)
    config.comments = comments


def readConllu(filename):
    """Read `filename` and return a list with one `Config` per sentence.

    * A line containing "# global.columns =" replaces the column layout
      (default: the ten standard CoNLL-U columns) and is not kept as a comment.
    * Other lines starting with '#' are collected as comments of the sentence
      being read.
    * Lines whose ID contains '.' are skipped.
    * A blank line ends the current sentence; a sentence is also ended by the
      end of the file, so a missing trailing blank line loses nothing.
    * Gold HEAD values are converted from ID strings to line indexes, except
      "_" and "0".

    Blank lines that do not follow any token line are ignored, so the result
    never contains empty `Config` objects and an empty file yields ``[]``.
    """
    configs = []
    defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    col2index, index2col = readMCD(defaultMCD)

    current = None   # Config being filled, created on its first token line
    id2index = {}
    comments = []

    with open(filename, "r") as stream:
        for line in stream:
            line = line.strip()
            if "# global.columns =" in line:
                mcd = line.split('=')[-1].strip()
                col2index, index2col = readMCD(mcd)
                continue
            if len(line) == 0:
                if current is not None:
                    _finalizeSentence(current, id2index, comments)
                    configs.append(current)
                    current = None
                    id2index = {}
                    comments = []
                continue
            if line[0] == '#':
                comments.append(line)
                continue

            splited = line.split('\t')
            ID = splited[col2index["ID"]]
            if '.' in ID:
                continue

            if current is None:
                # Created lazily so it uses the column layout in force for
                # the lines that actually populate it.
                current = Config(col2index, index2col)
            id2index[ID] = len(current)
            current.addLine(splited)

    # The last sentence may not be followed by a blank line.
    if current is not None:
        _finalizeSentence(current, id2index, comments)
        configs.append(current)

    return configs
################################################################################