# "Newer" / "Older": revision-navigation labels left over from the web blame view; not part of the module.
import json
from readMCD import readMCD
################################################################################
class Dicts :
    """Per-column vocabularies mapping string values to integer indexes.

    ``self.dicts`` maps a column name to a dictionary whose keys are the
    values seen in that column and whose entries are ``(index, count)``
    pairs. After ``load`` the pairs are two-element lists (JSON has no
    tuple type); ``get`` only indexes them, so both forms work.
    """

    def __init__(self) :
        """Create an empty collection and define the special token strings."""
        self.dicts = {}
        self.unkToken = "__unknown__"
        self.nullToken = "__null__"
        self.noStackToken = "__nostack__"
        self.oobToken = "__oob__"
        self.noDepLeft = "__nodepleft__"
        self.noDepRight = "__nodepright__"
        self.noGov = "__nogov__"

    def _specialTokens(self) :
        """Return the special tokens in the order of their reserved indexes (0..6)."""
        return [
            self.unkToken,
            self.nullToken,
            self.noStackToken,
            self.oobToken,
            self.noDepLeft,
            self.noDepRight,
            self.noGov,
        ]

    def addDict(self, name, d) :
        """Register dictionary ``d`` under ``name``.

        Raises:
            Exception: if ``name`` is already registered.
        """
        if name in self.dicts :
            raise(Exception(name+" already in dicts"))
        self.dicts[name] = d

    def readConllu(self, filename, colsSet=None, minCount=0) :
        """Build the dictionaries from a tab-separated CoNLL-U style file.

        Args:
            filename: path of the file to read (opened as UTF-8 text).
            colsSet: iterable of column names to index; when None, every
                column of the column description in effect at the first
                data line is used.
            minCount: values seen fewer than ``minCount`` times are dropped
                and the remaining entries are re-indexed contiguously.

        A line containing ``# global.columns =`` replaces the default column
        description with the text following the last ``=``. Empty lines and
        other lines starting with ``#`` are skipped. If the file holds no
        data line, ``self.dicts`` keeps its previous keys.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        col2index, _ = readMCD(defaultMCD)
        targetColumns = []

        # The context manager guarantees the file is closed, even on error.
        with open(filename, "r", encoding="utf-8") as conlluFile :
            for line in conlluFile :
                line = line.strip()
                if "# global.columns =" in line :
                    mcd = line.split('=')[-1].strip()
                    col2index, _ = readMCD(mcd)
                    continue
                if len(line) == 0 or line[0] == '#' :
                    continue

                # Columns are chosen on the first data line, once the column
                # description in effect is known; dictionaries are reset then.
                if len(targetColumns) == 0 :
                    if colsSet is None :
                        targetColumns = list(col2index.keys())
                    else :
                        targetColumns = list(colsSet)
                    specials = self._specialTokens()
                    # Special tokens start with count minCount so that they
                    # always pass the frequency filter below.
                    self.dicts = {
                        col : {token : (index, minCount) for index, token in enumerate(specials)}
                        for col in targetColumns
                    }

                splited = line.split('\t')
                for col in targetColumns :
                    value = splited[col2index[col]]
                    entries = self.dicts[col]
                    if value not in entries :
                        entries[value] = (len(entries), 1)
                    else :
                        entries[value] = (entries[value][0], entries[value][1]+1)

        # Drop rare values and renumber what is left, keeping insertion order.
        for name in self.dicts :
            newDict = {}
            for value, entry in self.dicts[name].items() :
                if entry[1] >= minCount :
                    newDict[value] = (len(newDict), entry[1])
            self.dicts[name] = newDict

    def get(self, col, value) :
        """Return the index of ``value`` in column ``col``.

        Falls back to the lowercased value, then to the unknown token's index.
        """
        entries = self.dicts[col]
        if value in entries :
            return entries[value][0]
        lowered = value.lower()
        if lowered in entries :
            return entries[lowered][0]
        return entries[self.unkToken][0]

    def save(self, target) :
        """Write all dictionaries as JSON to the file path ``target``."""
        # Closing the file explicitly makes sure the JSON is flushed to disk.
        with open(target, "w", encoding="utf-8") as out :
            json.dump(self.dicts, out)

    def load(self, target) :
        """Replace all dictionaries with the JSON content of file ``target``."""
        with open(target, "r", encoding="utf-8") as src :
            self.dicts = json.load(src)
################################################################################