Newer
Older
import json
from readMCD import readMCD
################################################################################
class Dicts :
def __init__(self):
    """Create an empty dictionary collection and define the special tokens."""
    # Maps a dictionary name (a column name when filled by readConllu)
    # to that dictionary.
    self.dicts = {}

    # Reserved sentinel entries. readConllu seeds every dictionary it
    # builds with these eight tokens, in this order, so they receive the
    # indices 0..7 before any value read from the file.
    self.unkToken = "__unknown__"
    self.nullToken = "__null__"
    self.noStackToken = "__nostack__"
    self.oobToken = "__oob__"
    self.noDepLeft = "__nodepleft__"
    self.noDepRight = "__nodepright__"
    self.noGov = "__nogov__"
    self.notSeen = "__notseen__"
def addDict(self, name, d):
    """Register dictionary ``d`` under ``name``.

    Raises:
        Exception: if a dictionary is already registered under ``name``.
    """
    registry = self.dicts
    if name not in registry:
        registry[name] = d
        return
    # Refuse to silently overwrite an existing dictionary.
    raise Exception(name+" already in dicts")
Franck Dary
committed
def readConllu(self, filename, colsSet=None, minCount=0):
    """Build one dictionary per column from a CoNLL-U style file.

    Each non-empty, non-comment line is split on tabs. For every target
    column the value found on the line is added to that column's
    dictionary, or its occurrence count is incremented. Every dictionary
    maps a value to an ``(index, count)`` tuple. Once the whole file has
    been read, values whose count is below ``minCount`` are dropped and
    the survivors are re-indexed contiguously in insertion order.

    A line containing ``# global.columns =`` replaces the column layout
    (parsed by ``readMCD``) for the lines that follow it; otherwise the
    ten-column default layout is used.

    The column name ``LETTER`` is special: its values are the individual
    characters of the ``FORM`` column.

    Args:
        filename: path of the file to read (decoded as UTF-8).
        colsSet: iterable of column names to index. ``None`` selects every
            column of the layout in effect at the first data line.
        minCount: minimum number of occurrences for a value to be kept.

    Note:
        ``self.dicts`` is replaced when the first data line is reached.
        If the file has no data line, the existing dictionaries are kept
        and only the ``minCount`` filter is applied to them.
    """
    defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    col2index, _ = readMCD(defaultMCD)
    targetColumns = []

    # The context manager guarantees the handle is closed, including when
    # a malformed line raises part-way through the file.
    with open(filename, "r", encoding="utf-8") as conllu:
        for line in conllu:
            line = line.strip()
            if "# global.columns =" in line:
                col2index, _ = readMCD(line.split('=')[-1].strip())
                continue
            if not line or line[0] == '#':
                continue

            if not targetColumns:
                if colsSet is None:
                    targetColumns = list(col2index.keys())
                else:
                    targetColumns = list(colsSet)
                # Special tokens are seeded with a count of minCount so
                # that they always survive the filtering step below.
                specialTokens = (
                    self.unkToken,
                    self.nullToken,
                    self.noStackToken,
                    self.oobToken,
                    self.noDepLeft,
                    self.noDepRight,
                    self.noGov,
                    self.notSeen,
                )
                self.dicts = {
                    col: {
                        token: (index, minCount)
                        for index, token in enumerate(specialTokens)
                    }
                    for col in targetColumns
                }

            splited = line.split('\t')
            for col in targetColumns:
                if col == "LETTER":
                    values = list(splited[col2index["FORM"]])
                else:
                    values = [splited[col2index[col]]]
                vocab = self.dicts[col]
                for value in values:
                    if value in vocab:
                        index, count = vocab[value]
                        vocab[value] = (index, count + 1)
                    else:
                        vocab[value] = (len(vocab), 1)

    # Drop rare values and renumber the remaining ones from 0.
    for name in self.dicts:
        kept = {}
        for value, entry in self.dicts[name].items():
            if entry[1] >= minCount:
                kept[value] = (len(kept), entry[1])
        self.dicts[name] = kept
if col not in self.dicts :
raise Exception("Unknown dict name '%s' among %s"%(col, str(list(self.dicts.keys()))))
Franck Dary
committed
return self.dicts[col][value][0]
if value.lower() in self.dicts[col] :
Franck Dary
committed
return self.dicts[col][value.lower()][0]
return self.dicts[col][self.unkToken][0]
def getElementsOf(self, col):
    """Return the keys view of the dictionary registered under ``col``.

    Raises:
        Exception: if no dictionary is registered under ``col``.
    """
    registry = self.dicts
    if col in registry:
        return registry[col].keys()
    raise Exception("Unknown dict name %s"%col)
def save(self, target):
    """Serialize all dictionaries to the file ``target`` as JSON.

    JSON has no tuple type, so ``(index, count)`` tuples are written as
    two-element arrays.

    Args:
        target: path of the file to write; an existing file is overwritten.
    """
    # Closing the file explicitly guarantees the data is flushed to disk
    # before returning, instead of relying on garbage collection.
    with open(target, "w", encoding="utf-8") as out:
        json.dump(self.dicts, out)
def load(self, target):
    """Replace all dictionaries with the JSON content of the file ``target``.

    Entries come back exactly as JSON decodes them: ``(index, count)``
    tuples written by ``save`` are read back as two-element lists.

    Args:
        target: path of a JSON file to read.
    """
    # The context manager closes the handle even if decoding fails.
    with open(target, "r", encoding="utf-8") as src:
        self.dicts = json.load(src)
################################################################################