Newer
Older
import json
from readMCD import readMCD
################################################################################
class Dicts:
    """Per-column vocabularies mapping string values to integer indices.

    ``self.dicts`` maps a column name to a ``{value: index}`` dict. Each
    vocabulary built by :meth:`readConllu` starts with the unknown token at
    index 0 and the null token at index 1; other values are numbered in
    order of first appearance.
    """

    def __init__(self):
        """Create an empty set of vocabularies and define the special tokens."""
        self.dicts = {}
        # Only unkToken and nullToken are inserted into the vocabularies by
        # readConllu; noStackToken and oobToken are merely defined here.
        self.unkToken = "__unknown__"
        self.nullToken = "__null__"
        self.noStackToken = "__nostack__"
        self.oobToken = "__oob__"

    def readConllu(self, filename, colsSet=None):
        """Build the vocabularies from the tab-separated file *filename*.

        Args:
            filename: path of the file to read (decoded as UTF-8).
            colsSet: iterable of column names to index. When None, every
                column known to the active column description is indexed.

        A ``# global.columns = ...`` comment line replaces the default
        column layout. Empty lines and other ``#`` lines are skipped.
        ``self.dicts`` is (re)created when the first data line is met, so
        a file without any data line leaves it untouched.

        Raises:
            KeyError: if a name in *colsSet* is not a known column.
            IndexError: if a data line has fewer fields than required.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        col2index, _ = readMCD(defaultMCD)
        targetColumns = []

        # The context manager guarantees the handle is closed, even if a
        # malformed line raises while parsing.
        with open(filename, "r", encoding="utf-8") as conllu:
            for line in conllu:
                line = line.strip()
                if "# global.columns =" in line:
                    mcd = line.split('=')[-1].strip()
                    col2index, _ = readMCD(mcd)
                    continue
                if not line or line.startswith('#'):
                    continue

                # Columns are chosen lazily, on the first data line, so that
                # a preceding "# global.columns" header is taken into account.
                if not targetColumns:
                    if colsSet is None:
                        targetColumns = list(col2index.keys())
                    else:
                        targetColumns = list(colsSet)
                    self.dicts = {
                        col: {self.unkToken: 0, self.nullToken: 1}
                        for col in targetColumns
                    }

                splited = line.split('\t')
                for col in targetColumns:
                    vocab = self.dicts[col]
                    value = splited[col2index[col]]
                    if value not in vocab:
                        vocab[value] = len(vocab)

    def get(self, col, value):
        """Return the index of *value* in the vocabulary of column *col*.

        The exact string is looked up first, then its lowercased form; if
        neither is present the index of the unknown token is returned.

        Raises:
            KeyError: if *col* has no vocabulary.
        """
        vocab = self.dicts[col]
        if value in vocab:
            return vocab[value]
        lowered = value.lower()
        if lowered in vocab:
            return vocab[lowered]
        return vocab[self.unkToken]

    def save(self, target):
        """Write the vocabularies as JSON to the file path *target*."""
        # Closing the file explicitly ensures the JSON is flushed to disk.
        with open(target, "w", encoding="utf-8") as out:
            json.dump(self.dicts, out)

    def load(self, target):
        """Replace the vocabularies with the JSON content of file *target*."""
        with open(target, "r", encoding="utf-8") as src:
            self.dicts = json.load(src)
################################################################################