# Dicts.py (1.59 KiB), authored by Franck Dary
import json
from readMCD import readMCD
################################################################################
class Dicts:
    """Per-column vocabularies mapping string values to integer indices.

    ``self.dicts`` maps a column name to a ``{value: index}`` dictionary.
    Every column dictionary built by ``readConllu`` reserves index 0 for
    ``unkToken`` and index 1 for ``nullToken``; other values are numbered in
    order of first appearance.
    """

    def __init__(self):
        # Column name -> {value: index}.
        self.dicts = {}
        # Entry whose index get() returns for values absent from a column.
        self.unkToken = "__unknown__"
        # Reserved entry; this class registers it but never looks it up.
        self.nullToken = "__null__"

    def readConllu(self, filename, colsSet=None):
        """Build ``self.dicts`` from a tab-separated CoNLL-U style file.

        The column layout starts as the default ten CoNLL-U columns and is
        replaced whenever a ``# global.columns =`` line is met. Blank lines
        and other ``#`` comment lines are skipped.

        Args:
            filename: Path of the file to read (decoded as UTF-8).
            colsSet: Iterable of column names to index. When ``None``, every
                column of the layout in effect at the first data line is used.

        Raises:
            OSError: If the file cannot be opened.
            KeyError: If a requested column is not part of the layout.
            IndexError: If a data line has fewer fields than the layout needs.

        Note:
            ``self.dicts`` is only replaced once the first data line is
            reached; a file without data lines leaves it untouched.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        # readMCD is a project helper; it is used here as returning a pair
        # whose first item maps a column name to its field position.
        col2index, _ = readMCD(defaultMCD)
        targetColumns = []

        # The context manager closes the file even if parsing raises.
        with open(filename, "r", encoding="utf-8") as conllu:
            for line in conllu:
                line = line.strip()

                if "# global.columns =" in line:
                    mcd = line.split("=")[-1].strip()
                    col2index, _ = readMCD(mcd)
                    continue

                if not line or line[0] == "#":
                    continue

                # The target columns are resolved lazily so that a
                # "global.columns" header preceding the data is honoured.
                if not targetColumns:
                    if colsSet is None:
                        targetColumns = list(col2index.keys())
                    else:
                        targetColumns = list(colsSet)
                    self.dicts = {
                        col: {self.unkToken: 0, self.nullToken: 1}
                        for col in targetColumns
                    }

                splited = line.split("\t")
                for col in targetColumns:
                    column = self.dicts[col]
                    value = splited[col2index[col]]
                    if value not in column:
                        # Next free index == current size of the vocabulary.
                        column[value] = len(column)

    def get(self, col, value):
        """Return the index of ``value`` in column ``col``.

        The exact value is tried first, then its lowercase form; if neither
        is known, the index of ``unkToken`` is returned.

        Raises:
            KeyError: If ``col`` has no dictionary, or if the column has no
                ``unkToken`` entry to fall back on.
        """
        column = self.dicts[col]
        if value in column:
            return column[value]

        lowered = value.lower()
        if lowered in column:
            return column[lowered]

        return column[self.unkToken]

    def save(self, target):
        """Write ``self.dicts`` as JSON to the path ``target``."""
        # Closing the file explicitly guarantees the JSON is fully flushed.
        with open(target, "w", encoding="utf-8") as out:
            json.dump(self.dicts, out)

    def load(self, target):
        """Replace ``self.dicts`` with the JSON content of the path ``target``."""
        with open(target, "r", encoding="utf-8") as src:
            self.dicts = json.load(src)
################################################################################