Skip to content
Snippets Groups Projects
Dicts.py 2.3 KiB
Newer Older
  • Learn to ignore specific revisions
  • import json
    from readMCD import readMCD
    
    ################################################################################
    class Dicts :
      """Per-column vocabularies mapping string values to integer indexes.

      `self.dicts` maps a column name to a dict `value -> (index, count)`, where
      `count` is the number of occurrences seen while reading the training file.
      Every vocabulary built by `readConllu` starts with the seven special tokens
      declared in `__init__`, at indexes 0 to 6.
      """

      def __init__(self) :
        # column name -> {value : (index, count)}
        self.dicts = {}

        # Special tokens reserved at the start of every vocabulary.
        # unkToken is the fallback returned by `get` for values that are absent.
        self.unkToken = "__unknown__"
        self.nullToken = "__null__"

        self.noStackToken = "__nostack__"
        self.oobToken = "__oob__"

        self.noDepLeft = "__nodepleft__"
        self.noDepRight = "__nodepright__"
        self.noGov = "__nogov__"

      def _specialEntries(self, minCount) :
        """Return a fresh vocabulary containing only the special tokens.

        Each special token is seeded with a count of `minCount` so that it always
        survives the frequency filtering performed at the end of `readConllu`.
        """
        specialTokens = [
          self.unkToken, self.nullToken, self.noStackToken, self.oobToken,
          self.noDepLeft, self.noDepRight, self.noGov,
        ]
        return {token : (index, minCount) for index, token in enumerate(specialTokens)}

      def readConllu(self, filename, colsSet=None, minCount=0) :
        """Build the vocabularies from a tab-separated CoNLL-U style file.

        filename : path of the file to read (decoded as UTF-8).
        colsSet  : iterable of column names to build a vocabulary for. When None,
                   every column of the layout in effect at the first data line
                   is used.
        minCount : values seen fewer than `minCount` times are dropped, and the
                   remaining values are re-indexed densely.

        A line containing "# global.columns =" replaces the default column
        layout with the one given after the '='. Other lines that are empty or
        start with '#' are skipped.

        The vocabularies are (re)initialised when the first data line is met, so
        a file without any data line leaves the existing entries in place.

        Raises KeyError if a requested column is not part of the layout, and
        IndexError if a data line has fewer fields than the layout requires.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        # readMCD is defined elsewhere; it is used here as returning a pair whose
        # first element maps a column name to its position in a line.
        col2index, _ = readMCD(defaultMCD)

        targetColumns = []

        # The context manager guarantees the file is closed, even on error.
        with open(filename, "r", encoding="utf-8") as conlluFile :
          for line in conlluFile :
            line = line.strip()
            if "# global.columns =" in line :
              mcd = line.split('=')[-1].strip()
              col2index, _ = readMCD(mcd)
              continue
            if len(line) == 0 or line[0] == '#' :
              continue

            # Lazy initialisation : the column layout is only final once the
            # first data line is reached.
            if len(targetColumns) == 0 :
              if colsSet is None :
                targetColumns = list(col2index.keys())
              else :
                targetColumns = list(colsSet)
              self.dicts = {col : self._specialEntries(minCount) for col in targetColumns}

            splited = line.split('\t')
            for col in targetColumns :
              vocab = self.dicts[col]
              value = splited[col2index[col]]
              # A new value gets the next free index; a known one keeps its index.
              index, count = vocab.get(value, (len(vocab), 0))
              vocab[value] = (index, count+1)

        # Drop rare values and renumber the survivors in insertion order, so
        # that indexes stay contiguous.
        for name in self.dicts :
          newDict = {}
          for value, (_, count) in self.dicts[name].items() :
            if count >= minCount :
              newDict[value] = (len(newDict), count)
          self.dicts[name] = newDict

      def get(self, col, value) :
        """Return the index of `value` in the vocabulary of column `col`.

        The exact string is looked up first, then its lowercased form; when
        neither is present the index of the unknown token is returned.

        Raises KeyError if `col` has no vocabulary, or if the vocabulary does not
        contain the unknown token.
        """
        vocab = self.dicts[col]
        if value in vocab :
          return vocab[value][0]

        lowered = value.lower()
        if lowered in vocab :
          return vocab[lowered][0]

        return vocab[self.unkToken][0]

      def save(self, target) :
        """Write the vocabularies as JSON into the file `target`."""
        # Closing the file explicitly makes sure the JSON is flushed to disk.
        with open(target, "w", encoding="utf-8") as jsonFile :
          json.dump(self.dicts, jsonFile)

      def load(self, target) :
        """Replace the vocabularies with the JSON content of the file `target`.

        JSON has no tuple type, so entries come back as [index, count] lists;
        they are indexed the same way as the tuples built by `readConllu`.
        """
        with open(target, "r", encoding="utf-8") as jsonFile :
          self.dicts = json.load(jsonFile)
    ################################################################################