import json

from readMCD import readMCD

################################################################################
class Dicts:
    """Collection of named vocabularies built from CoNLL-U style files.

    ``self.dicts`` maps a dict name (a column name when built by
    ``readConllu``) to a mapping ``value -> (index, count)``, where ``index``
    is the integer id returned by ``get`` and ``count`` is the number of
    occurrences recorded for ``value``.
    """

    def __init__(self):
        self.dicts = {}
        # Reserved token strings. Every dict built by readConllu contains them,
        # in this order, at indexes 0..7. unkToken is the fallback used by get().
        self.unkToken = "__unknown__"
        self.nullToken = "__null__"
        self.noStackToken = "__nostack__"
        self.oobToken = "__oob__"
        self.noDepLeft = "__nodepleft__"
        self.noDepRight = "__nodepright__"
        self.noGov = "__nogov__"
        self.notSeen = "__notseen__"

    def addDict(self, name, d):
        """Register the dict ``d`` under ``name``.

        Raises:
            Exception: if ``name`` is already registered.
        """
        if name in self.dicts:
            raise Exception(name + " already in dicts")
        self.dicts[name] = d

    def _newColumnDict(self, minCount):
        """Return a fresh dict holding only the reserved tokens.

        Each reserved token starts with a count of ``minCount`` so that it
        always passes the ``count >= minCount`` filter applied at the end of
        ``readConllu``.
        """
        reserved = (
            self.unkToken,
            self.nullToken,
            self.noStackToken,
            self.oobToken,
            self.noDepLeft,
            self.noDepRight,
            self.noGov,
            self.notSeen,
        )
        return {token: (index, minCount) for index, token in enumerate(reserved)}

    def readConllu(self, filename, colsSet=None, minCount=0):
        """Build one dict per target column from the file ``filename``.

        The column layout defaults to the ten standard CoNLL-U columns and is
        replaced whenever a ``# global.columns =`` line is met. Empty lines and
        other ``#`` comment lines are skipped. Reaching the first data line
        replaces ``self.dicts`` with fresh dicts (reserved tokens only) for
        every target column.

        Args:
            filename: path of the tab-separated file to read (opened as UTF-8).
            colsSet: iterable of column names to build dicts for; ``None``
                means every column of the layout in effect at the first data
                line. The pseudo-column ``"LETTER"`` collects the individual
                characters of the ``FORM`` column.
            minCount: values seen fewer than ``minCount`` times are dropped,
                and the remaining values are re-indexed contiguously.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        col2index, _ = readMCD(defaultMCD)

        targetColumns = []
        # The context manager guarantees the file is closed, even on error.
        with open(filename, "r", encoding="utf-8") as conlluFile:
            for line in conlluFile:
                line = line.strip()
                if "# global.columns =" in line:
                    mcd = line.split('=')[-1].strip()
                    col2index, _ = readMCD(mcd)
                    continue
                if len(line) == 0 or line[0] == '#':
                    continue

                # Target columns are resolved lazily, at the first data line,
                # so that a preceding global.columns header is honoured.
                if len(targetColumns) == 0:
                    if colsSet is None:
                        targetColumns = list(col2index.keys())
                    else:
                        targetColumns = list(colsSet)
                    self.dicts = {col: self._newColumnDict(minCount) for col in targetColumns}

                splited = line.split('\t')
                for col in targetColumns:
                    if col == "LETTER":
                        values = list(splited[col2index["FORM"]])
                    else:
                        values = [splited[col2index[col]]]
                    colDict = self.dicts[col]
                    for value in values:
                        if value not in colDict:
                            colDict[value] = (len(colDict), 1)
                        else:
                            index, count = colDict[value]
                            colDict[value] = (index, count + 1)

        # Drop rare values and re-index. Dicts keep insertion order, so the
        # reserved tokens (inserted first, count == minCount) keep indexes 0..7.
        for name in self.dicts:
            newDict = {}
            for value, entry in self.dicts[name].items():
                if entry[1] >= minCount:
                    newDict[value] = (len(newDict), entry[1])
            self.dicts[name] = newDict

    def get(self, col, value):
        """Return the index of ``value`` in the dict named ``col``.

        Lookup order: ``value`` itself, then ``value.lower()``, then the
        unknown token.

        Raises:
            Exception: if no dict is registered under ``col``.
        """
        if col not in self.dicts:
            raise Exception("Unknown dict name '%s' among %s" % (col, str(list(self.dicts.keys()))))
        colDict = self.dicts[col]
        if value in colDict:
            return colDict[value][0]
        lowered = value.lower()
        if lowered in colDict:
            return colDict[lowered][0]
        return colDict[self.unkToken][0]

    def getElementsOf(self, col):
        """Return the keys view of the dict named ``col``.

        Raises:
            Exception: if no dict is registered under ``col``.
        """
        if col not in self.dicts:
            raise Exception("Unknown dict name %s" % col)
        return self.dicts[col].keys()

    def save(self, target):
        """Write ``self.dicts`` as JSON to the file ``target``."""
        with open(target, "w", encoding="utf-8") as out:
            json.dump(self.dicts, out)

    def load(self, target):
        """Replace ``self.dicts`` with the JSON content of the file ``target``.

        JSON has no tuple type, so ``(index, count)`` entries written by
        ``save`` come back as two-element lists; ``get`` indexes them the same
        way.
        """
        with open(target, "r", encoding="utf-8") as src:
            self.dicts = json.load(src)

################################################################################