Skip to content
Snippets Groups Projects
Dicts.py 1.66 KiB
Newer Older
  • Learn to ignore specific revisions
  • import json
    from readMCD import readMCD
    
    ################################################################################
    class Dicts :
      """Per-column vocabularies mapping string values to integer indexes.

      ``self.dicts`` has the shape ``{column name : {value : index}}``. Every
      vocabulary built by ``readConllu`` starts with the unknown token at
      index 0 and the null token at index 1; other values receive the next
      free index in order of first appearance.
      """

      def __init__(self) :
        """Create an empty set of vocabularies and define the special tokens."""
        self.dicts = {}
        # Reserved entries inserted in every vocabulary by readConllu.
        self.unkToken = "__unknown__"
        self.nullToken = "__null__"

        # Special token names exposed as attributes; this class does not
        # insert them in the vocabularies itself.
        self.noStackToken = "__nostack__"
        self.oobToken = "__oob__"


      def readConllu(self, filename, colsSet=None) :
        """Build the vocabularies from a tab-separated CoNLL-U style file.

        The column layout defaults to the ten standard CoNLL-U columns and is
        replaced whenever a line containing ``# global.columns =`` is met.
        Empty lines and other lines starting with ``#`` are skipped.

        ``self.dicts`` is overwritten when the first data line is reached, so
        a file without any data line leaves the current vocabularies as is.

        Args:
          filename: path of the file to read (decoded as UTF-8).
          colsSet: iterable of column names to index, or None to index every
            column of the layout in effect at the first data line.

        Raises:
          KeyError: if a requested column is not part of the layout.
          IndexError: if a data line has fewer fields than a requested
            column needs.
        """
        defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
        col2index, _ = readMCD(defaultMCD)

        targetColumns = []

        # The context manager guarantees the file is closed, even on error.
        with open(filename, "r", encoding="utf-8") as conlluFile :
          for line in conlluFile :
            line = line.strip()
            if "# global.columns =" in line :
              mcd = line.split('=')[-1].strip()
              col2index, _ = readMCD(mcd)
              continue
            if len(line) == 0 or line[0] == '#' :
              continue

            # Deferred until the first data line so that a preceding
            # "# global.columns" declaration is taken into account.
            if len(targetColumns) == 0 :
              if colsSet is None :
                targetColumns = list(col2index.keys())
              else :
                targetColumns = list(colsSet)
              self.dicts = {col : {self.unkToken : 0, self.nullToken : 1} for col in targetColumns}

            splited = line.split('\t')
            for col in targetColumns :
              colDict = self.dicts[col]
              value = splited[col2index[col]]
              if value not in colDict :
                colDict[value] = len(colDict)

      def get(self, col, value) :
        """Return the index of ``value`` in the vocabulary of column ``col``.

        The exact value is tried first, then its lowercased form; when both
        are missing the index of the unknown token is returned.

        Raises:
          KeyError: if ``col`` has no vocabulary, or if its vocabulary lacks
            the unknown token.
        """
        colDict = self.dicts[col]
        if value in colDict :
          return colDict[value]
        lowered = value.lower()
        if lowered in colDict :
          return colDict[lowered]
        return colDict[self.unkToken]

      def save(self, target) :
        """Write all the vocabularies as JSON into the file ``target``."""
        # Closing the file explicitly ensures the content is flushed to disk.
        with open(target, "w", encoding="utf-8") as out :
          json.dump(self.dicts, out)

      def load(self, target) :
        """Replace all the vocabularies by the JSON content of ``target``."""
        with open(target, "r", encoding="utf-8") as source :
          self.dicts = json.load(source)
    ################################################################################