Skip to content
Snippets Groups Projects
Select Git revision
  • b70459881ee680ab5772871a5ceaa0db6366055e
  • master default protected
  • erased
  • states
  • negatives
  • temp
  • negativeExamples
  • Rl
8 results

Dicts.py

Blame
  • Dicts.py 2.91 KiB
    import json
    from readMCD import readMCD
    
    ################################################################################
    class Dicts :
      """Collection of named vocabularies.

      Each vocabulary maps a value (a string) to an ``(index, count)`` entry.
      Vocabularies can be registered by hand (addDict), built from a CoNLL-U
      file (readConllu), or restored from a JSON file (load).
      """

      def __init__(self) :
        # Maps a vocabulary name to {value : (index, count)}.
        self.dicts = {}
        # Special tokens that readConllu inserts, in this order, at the start
        # of every vocabulary it builds (indices 0 to 7).
        self.unkToken = "__unknown__"
        self.nullToken = "__null__"
        self.noStackToken = "__nostack__"
        self.oobToken = "__oob__"
        self.noDepLeft = "__nodepleft__"
        self.noDepRight = "__nodepright__"
        self.noGov = "__nogov__"
        self.notSeen = "__notseen__"

      def addDict(self, name, d) :
        """Register vocabulary `d` under `name`.

        Raises:
          Exception: if a vocabulary called `name` is already registered.
        """
        if name in self.dicts :
          raise Exception(name+" already in dicts")
        self.dicts[name] = d

      def readConllu(self, filename, colsSet=None, minCount=0) :
        """Build one vocabulary per column from the CoNLL-U file `filename`.

        Args:
          filename: path of the CoNLL-U file, read as UTF-8.
          colsSet: names of the columns to index, or None to index every column
            declared by the file. The pseudo column "LETTER" indexes each
            character of the FORM column.
          minCount: values seen fewer than `minCount` times are dropped. The
            special tokens are given a count of `minCount`, so they are kept.

        As soon as the first data line is met, self.dicts is replaced: any
        previously registered vocabulary is discarded.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        # NOTE(review): readMCD is defined elsewhere; its first result is used
        # here as a column-name -> column-index mapping, the second is unused.
        col2index, _ = readMCD(defaultMCD)

        specialTokens = (self.unkToken, self.nullToken, self.noStackToken, self.oobToken,
                         self.noDepLeft, self.noDepRight, self.noGov, self.notSeen)
        targetColumns = []

        # The context manager guarantees the file is closed, even on error.
        with open(filename, "r", encoding="utf-8") as conlluFile :
          for line in conlluFile :
            line = line.strip()
            if "# global.columns =" in line :
              # The file declares its own column layout: it overrides the default.
              mcd = line.split('=')[-1].strip()
              col2index, _ = readMCD(mcd)
              continue
            if not line or line[0] == '#' :
              continue

            # Vocabularies are created lazily, at the first data line, because
            # the column layout is only known once the header has been read.
            if not targetColumns :
              if colsSet is None :
                targetColumns = list(col2index.keys())
              else :
                targetColumns = list(colsSet)
              self.dicts = {col : {token : (index, minCount) for index, token in enumerate(specialTokens)}
                            for col in targetColumns}

            splited = line.split('\t')
            for col in targetColumns :
              if col == "LETTER" :
                values = list(splited[col2index["FORM"]])
              else :
                values = [splited[col2index[col]]]
              vocab = self.dicts[col]
              for value in values :
                # A new value takes the next free index and starts at zero.
                index, count = vocab.get(value, (len(vocab), 0))
                vocab[value] = (index, count+1)

        # Drop rare values, then renumber so that indices stay contiguous.
        for name in self.dicts :
          newDict = {}
          for value, entry in self.dicts[name].items() :
            if entry[1] >= minCount :
              newDict[value] = (len(newDict), entry[1])
          self.dicts[name] = newDict

      def get(self, col, value) :
        """Return the index of `value` in vocabulary `col`.

        The exact value is looked up first, then its lowercased form; when
        both are missing, the index of the unknown token is returned.

        Raises:
          Exception: if no vocabulary is called `col`.
        """
        if col not in self.dicts :
          raise Exception("Unknown dict name '%s' among %s"%(col, str(list(self.dicts.keys()))))
        vocab = self.dicts[col]
        if value in vocab :
          return vocab[value][0]
        lowered = value.lower()
        if lowered in vocab :
          return vocab[lowered][0]
        return vocab[self.unkToken][0]

      def getElementsOf(self, col) :
        """Return a view of the values stored in vocabulary `col`.

        Raises:
          Exception: if no vocabulary is called `col`.
        """
        if col not in self.dicts :
          raise Exception("Unknown dict name %s"%col)
        return self.dicts[col].keys()

      def save(self, target) :
        """Write all vocabularies to the file `target` as JSON."""
        with open(target, "w", encoding="utf-8") as outputFile :
          json.dump(self.dicts, outputFile)

      def load(self, target) :
        """Replace all vocabularies by the ones stored in the JSON file `target`."""
        with open(target, "r", encoding="utf-8") as inputFile :
          loaded = json.load(inputFile)
        # JSON has no tuple type: entries come back as lists. Turn them back
        # into tuples so that loaded and freshly built vocabularies agree.
        restored = {}
        for name, vocab in loaded.items() :
          if isinstance(vocab, dict) :
            vocab = {value : (tuple(entry) if isinstance(entry, list) else entry)
                     for value, entry in vocab.items()}
          restored[name] = vocab
        self.dicts = restored
    ################################################################################