import json
from readMCD import readMCD

################################################################################
class Dicts:
    """Per-column vocabularies mapping string values to integer indexes.

    ``self.dicts`` has the shape ``{column name: {value: index}}``. Every
    column dictionary starts with the same seven reserved tokens (indexes 0
    to 6, see ``__init__``); values read from a corpus receive the following
    indexes in order of first appearance.
    """

    def __init__(self):
        """Create an empty set of dictionaries and define the reserved tokens."""
        self.dicts = {}
        # Reserved tokens. Their indexes (0..6, in this order) are assigned
        # when readConllu initialises the per-column dictionaries.
        self.unkToken = "__unknown__"
        self.nullToken = "__null__"
        self.noStackToken = "__nostack__"
        self.oobToken = "__oob__"
        self.noDepLeft = "__nodepleft__"
        self.noDepRight = "__nodepright__"
        self.noGov = "__nogov__"

    def readConllu(self, filename, colsSet=None):
        """Build the dictionaries from a tab-separated CoNLL-U style file.

        A line containing ``# global.columns =`` redefines the column layout
        (the text after the last ``=`` is handed to ``readMCD``); until such
        a line is met the default ten-column layout is used. Empty lines and
        other ``#`` comment lines are skipped.

        The target columns are fixed when the first data line is met, and
        ``self.dicts`` is replaced at that moment, so a file without any data
        line leaves ``self.dicts`` untouched.

        Args:
            filename: Path of the file to read (decoded as UTF-8).
            colsSet: Iterable of column names to index. ``None`` means every
                column known to the layout in effect at the first data line.

        Raises:
            KeyError: If a requested column is absent from the layout
                (assuming ``readMCD`` returns a plain mapping).
            IndexError: If a data line has fewer fields than a target column
                requires.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        # readMCD is defined outside this file; from its use here the first
        # returned item maps a column name to its position in a line.
        col2index, _ = readMCD(defaultMCD)

        # Order matters: the position of each token is its reserved index.
        specialTokens = (
            self.unkToken,
            self.nullToken,
            self.noStackToken,
            self.oobToken,
            self.noDepLeft,
            self.noDepRight,
            self.noGov,
        )

        targetColumns = []
        # Explicit encoding so the result does not depend on the platform
        # locale; the context manager guarantees the handle is closed.
        with open(filename, "r", encoding="utf-8") as conlluFile:
            for line in conlluFile:
                line = line.strip()
                if "# global.columns =" in line:
                    mcd = line.split('=')[-1].strip()
                    col2index, _ = readMCD(mcd)
                    continue
                if len(line) == 0 or line[0] == '#':
                    continue

                if len(targetColumns) == 0:
                    # First data line: freeze the columns and reset the dicts.
                    if colsSet is None:
                        targetColumns = list(col2index.keys())
                    else:
                        targetColumns = list(colsSet)
                    self.dicts = {
                        col: {token: index for index, token in enumerate(specialTokens)}
                        for col in targetColumns
                    }

                fields = line.split('\t')
                for col in targetColumns:
                    value = fields[col2index[col]]
                    colDict = self.dicts[col]
                    if value not in colDict:
                        # Next free index == current size of the dictionary.
                        colDict[value] = len(colDict)

    def get(self, col, value):
        """Return the index of ``value`` in the dictionary of column ``col``.

        The exact value is looked up first, then its lowercased form; if
        neither is known, the index of the unknown token is returned.

        Raises:
            KeyError: If ``col`` has no dictionary.
        """
        colDict = self.dicts[col]
        if value in colDict:
            return colDict[value]
        lowered = value.lower()
        if lowered in colDict:
            return colDict[lowered]
        return colDict[self.unkToken]

    def save(self, target):
        """Write ``self.dicts`` as JSON to the file ``target``."""
        # The context manager flushes and closes the file even on error.
        with open(target, "w", encoding="utf-8") as jsonFile:
            json.dump(self.dicts, jsonFile)

    def load(self, target):
        """Replace ``self.dicts`` with the JSON content of the file ``target``."""
        with open(target, "r", encoding="utf-8") as jsonFile:
            self.dicts = json.load(jsonFile)
################################################################################