# Dicts.py

import json
from readMCD import readMCD

################################################################################
class Dicts :
  """Collection of named vocabularies mapping a value to an (index, count) pair.

  `self.dicts` maps a dict name (a column name when filled by `readConllu`)
  to a dictionary `{value : (index, count)}`. Dictionaries built by
  `readConllu` always start with the eight special tokens declared in
  `__init__`, at indices 0 to 7.
  """

  def __init__(self) :
    """Create an empty collection and declare the special token strings."""
    # name -> {value : (index, count)}
    self.dicts = {}
    self.unkToken = "__unknown__"
    self.nullToken = "__null__"
    self.noStackToken = "__nostack__"
    self.oobToken = "__oob__"
    self.noDepLeft = "__nodepleft__"
    self.noDepRight = "__nodepright__"
    self.noGov = "__nogov__"
    self.notSeen = "__notseen__"

  def _specialTokens(self) :
    """Return the special tokens, in the order of their reserved indices."""
    return [
      self.unkToken,
      self.nullToken,
      self.noStackToken,
      self.oobToken,
      self.noDepLeft,
      self.noDepRight,
      self.noGov,
      self.notSeen,
    ]

  def addDict(self, name, d) :
    """Register dictionary `d` under `name`.

    Raises:
      Exception: if a dictionary is already registered under `name`.
    """
    if name in self.dicts :
      raise Exception(name+" already in dicts")
    self.dicts[name] = d

  def readConllu(self, filename, colsSet=None, minCount=0) :
    """Rebuild `self.dicts` from the columns of a CoNLL-U file.

    Args:
      filename: path of the CoNLL-U file, read as UTF-8.
      colsSet: iterable of column names to build a dictionary for. When None,
        every column declared by the file's MCD is used. The pseudo column
        "LETTER" collects the individual characters of the FORM column.
      minCount: values seen fewer than `minCount` times are discarded.

    A "# global.columns =" comment line overrides the default column layout.
    Existing content of `self.dicts` is replaced as soon as the first data
    line is read.
    """
    defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    col2index, _ = readMCD(defaultMCD)

    targetColumns = []

    # The context manager guarantees the file is closed, even on error.
    with open(filename, "r", encoding="utf-8") as conlluFile :
      for line in conlluFile :
        line = line.strip()
        if "# global.columns =" in line :
          mcd = line.split('=')[-1].strip()
          col2index, _ = readMCD(mcd)
          continue
        if len(line) == 0 or line[0] == '#' :
          continue

        # The target columns are only known once the MCD is final, i.e. when
        # the first data line is reached.
        if not targetColumns :
          if colsSet is None :
            targetColumns = list(col2index.keys())
          else :
            targetColumns = list(colsSet)
          # Special tokens get a count of minCount so that they always survive
          # the frequency filtering done at the end.
          self.dicts = {
            col : {token : (index, minCount) for index, token in enumerate(self._specialTokens())}
            for col in targetColumns
          }

        splited = line.split('\t')
        for col in targetColumns :
          if col == "LETTER" :
            values = list(splited[col2index["FORM"]])
          else :
            values = [splited[col2index[col]]]
          entries = self.dicts[col]
          for value in values :
            if value in entries :
              index, count = entries[value]
              entries[value] = (index, count+1)
            else :
              entries[value] = (len(entries), 1)

    # Drop rare values and renumber the remaining ones contiguously, keeping
    # their relative (insertion) order.
    for name in self.dicts :
      newDict = {}
      for value in self.dicts[name] :
        count = self.dicts[name][value][1]
        if count >= minCount :
          newDict[value] = (len(newDict), count)
      self.dicts[name] = newDict

  def get(self, col, value) :
    """Return the index of `value` in dictionary `col`.

    Falls back to the lowercased value, then to the index of the unknown
    token when neither form is present.

    Raises:
      Exception: if no dictionary is registered under `col`.
    """
    if col not in self.dicts :
      raise Exception("Unknown dict name '%s' among %s"%(col, str(list(self.dicts.keys()))))
    entries = self.dicts[col]
    if value in entries :
      return entries[value][0]
    lowered = value.lower()
    if lowered in entries :
      return entries[lowered][0]
    return entries[self.unkToken][0]

  def getElementsOf(self, col) :
    """Return a view of the values stored in dictionary `col`.

    Raises:
      Exception: if no dictionary is registered under `col`.
    """
    if col not in self.dicts :
      raise Exception("Unknown dict name %s"%col)
    return self.dicts[col].keys()

  def save(self, target) :
    """Write all dictionaries to the file `target` in JSON format."""
    # Closing the file explicitly guarantees the JSON is flushed to disk.
    with open(target, "w", encoding="utf-8") as jsonFile :
      json.dump(self.dicts, jsonFile)

  def load(self, target) :
    """Replace all dictionaries with the content of the JSON file `target`.

    JSON has no tuple type, so (index, count) pairs come back as two-element
    lists; indexing them works the same way.
    """
    with open(target, "r", encoding="utf-8") as jsonFile :
      self.dicts = json.load(jsonFile)
################################################################################