# Config.py
# Committed by Franck Dary (web page navigation residue removed).
from readMCD import readMCD
import sys

################################################################################
class Config:
  """Analysis state for one sentence, stored as a table of lines and columns.

  Every cell of the table is a two-element list ``[gold, predicted]``.
  ``col2index`` maps a column name to its position in a line, ``index2col``
  is the reverse mapping, and ``predicted`` is the collection of column
  names whose predicted value (rather than gold value) is used as a feature.
  """

  def __init__(self, col2index, index2col, predicted):
    """Create an empty configuration using the given column layout."""
    self.lines = []
    self.goldChilds = []
    self.predChilds = []
    self.col2index = col2index
    self.index2col = index2col
    self.predicted = predicted
    self.wordIndex = 0
    self.maxWordIndex = 0  # To keep a track of the max value, in case of backtrack
    self.state = 0  # State of the analysis (e.g. 0=tagger, 1=parser)
    self.stack = []
    # Comment lines emitted by print(); must exist even when nothing fills it.
    self.comments = []
    self.history = []
    self.historyHistory = set()
    self.historyPop = []

  def hasCol(self, colname):
    """Return True if `colname` is one of the known columns."""
    return colname in self.col2index

  def addLine(self, cols):
    """Append a line whose gold values are `cols`; predicted values start empty."""
    self.lines.append([[val, ""] for val in cols])
    self.goldChilds.append([])
    self.predChilds.append([])

  def _cell(self, lineIndex, colname):
    """Return the ``[gold, predicted]`` cell, raising Exception on a bad address."""
    if lineIndex not in range(len(self.lines)):
      raise Exception("Line index %d is out of range (0,%d)"%(lineIndex, len(self.lines)))
    if colname not in self.col2index:
      raise Exception("Unknown colname '%s'"%(colname))
    return self.lines[lineIndex][self.col2index[colname]]

  def get(self, lineIndex, colname, predicted):
    """Return the predicted (if `predicted`) or gold value of a cell.

    Raises Exception if the line index is out of range or the column unknown.
    """
    return self._cell(lineIndex, colname)[1 if predicted else 0]

  def set(self, lineIndex, colname, value, predicted=True):
    """Write `value` in the predicted (default) or gold slot of a cell.

    Raises Exception if the line index is out of range or the column unknown.
    """
    self._cell(lineIndex, colname)[1 if predicted else 0] = value

  def getAsFeature(self, lineIndex, colname):
    """Return the predicted value for columns in `self.predicted`, else the gold one."""
    return self.get(lineIndex, colname, colname in self.predicted)

  def getGold(self, lineIndex, colname):
    """Return the gold value of a cell."""
    return self.get(lineIndex, colname, False)

  def addWordIndexToStack(self):
    """Push the current word index on the stack."""
    self.stack.append(self.wordIndex)

  def popStack(self):
    """Pop and return the top of the stack."""
    return self.stack.pop()

  def moveWordIndex(self, movement):
    """Move wordIndex by a relative movement, skipping multiword lines.

    The index never leaves ``[0, len(self.lines))``: steps that cannot be
    taken are dropped instead of raising. `maxWordIndex` is kept up to date.

    Returns True if every requested step was taken, False otherwise.
    """
    nbLines = len(self.lines)
    if nbLines == 0:
      # Nothing to move on; only a null movement can be called complete.
      return movement == 0

    step = -1 if movement < 0 else 1

    # Leave a multiword line before counting steps (only if there is room).
    if self.isMultiword(self.wordIndex) and 0 <= self.wordIndex + step < nbLines:
      self.wordIndex += step

    completed = True
    for _ in range(abs(movement)):
      target = self.wordIndex + step
      if not 0 <= target < nbLines:
        completed = False
        break
      # A multiword line is stepped over in the direction of travel.
      landing = target + step if self.isMultiword(target) else target
      if not 0 <= landing < nbLines:
        completed = False
        break
      self.wordIndex = landing
      self.maxWordIndex = max(self.maxWordIndex, target, landing)

    self.maxWordIndex = max(self.maxWordIndex, self.wordIndex)
    return completed

  def isMultiword(self, index):
    """Return True if the ID of line `index` contains a '-' (a range ID)."""
    return "-" in self.getAsFeature(index, "ID")

  def __len__(self):
    """Return the number of lines."""
    return len(self.lines)

  def _printableValue(self, lineIndex, colname):
    """Return the feature value of a cell as the text used for printing.

    Empty values become "_". For the HEAD column, "-1" becomes "0" and any
    other integer is taken as a line index and replaced by that line's ID;
    non-numeric HEAD values (e.g. "_") are printed unchanged.
    """
    value = str(self.getAsFeature(lineIndex, colname))
    if value == "":
      return "_"
    if colname != "HEAD":
      return value
    if value == "-1":
      return "0"
    try:
      headIndex = int(value)
    except ValueError:
      return value
    return str(self.getAsFeature(headIndex, "ID"))

  def printForDebug(self, output):
    """Write state, stack, recent history and a window of lines to `output`.

    The window spans 5 lines before the current word and 4 after it; the
    current line is flagged with "=>". Entries of `historyPop` are indexed
    from 0 to 4 here, so they are expected to hold at least five items.
    """
    printedCols = ["ID","FORM","UPOS","HEAD","DEPREL"]
    left = 5
    right = 5

    print("state :", self.state, file=output)
    print("stack :",[self.getAsFeature(ind, "ID") for ind in self.stack], file=output)
    print("history :",[str(trans) for trans in self.history[-10:]], file=output)
    print("historyPop :",[(str(c[0]),"dat:"+str(c[1]),"mvt:"+str(c[2]),"reward:"+str(c[3]),"state:"+str(c[4])) for c in self.historyPop[-10:]], file=output)

    rows = []
    firstLine = max(0, self.wordIndex-left)
    lastLine = min(len(self.lines), self.wordIndex+right)
    for lineIndex in range(firstLine, lastLine):
      row = ["=>" if lineIndex == self.wordIndex else "  "]
      for colIndex in range(len(self.lines[lineIndex])):
        colname = self.index2col[colIndex]
        if colname in printedCols:
          row.append(self._printableValue(lineIndex, colname))
      rows.append(row)

    if not rows:
      # No line in the window: there is no table to align or print.
      return

    # Width of each column, tolerant of rows with differing lengths.
    widths = {}
    for row in rows:
      for j, cell in enumerate(row):
        widths[j] = max(widths.get(j, 0), len(cell))

    rendered = []
    for row in rows:
      padded = ["{:{}}".format(cell, widths[j]) for j, cell in enumerate(row)]
      rendered.append(padded[0]+" ".join(padded[1:]))
    print("\n".join(rendered), file=output)

  def print(self, output, header=False):
    """Write the configuration to `output` as tab-separated lines.

    With `header`, a "# global.columns" line listing the columns comes first.
    Stored comments follow, then one line per entry, then an empty line.
    """
    if header:
      print("# global.columns = %s"%(" ".join(self.col2index.keys())), file=output)
    if len(self.comments) > 0:
      print("\n".join(self.comments), file=output)
    for index in range(len(self.lines)):
      cells = [self._printableValue(index, self.index2col[colIndex])
               for colIndex in range(len(self.lines[index]))]
      print("\t".join(cells), file=output)
    print("", file=output)
################################################################################
  
################################################################################
def _resolveGoldHeads(config, id2index):
  """Replace gold HEAD ids of `config` by line indexes and fill goldChilds."""
  if not config.hasCol("HEAD"):
    return
  for index in range(len(config)):
    head = config.getGold(index, "HEAD")
    # "_" (no head given) and "0" are left untouched.
    if head in ("_", "0"):
      continue
    headIndex = id2index[head]
    config.set(index, "HEAD", headIndex, False)
    config.goldChilds[headIndex].append(index)

def readConllu(filename, predicted):
  """Read a CoNLL-U style file and return one Config per non-empty sentence.

  Columns default to the ten listed in `defaultMCD` and are replaced by any
  "# global.columns =" line found in the file. Sentences are separated by
  empty lines; other lines starting with '#' are skipped, as are lines
  whose ID contains a '.'. Once a sentence is complete, each gold HEAD is
  converted from an ID to the index of the corresponding line, and the
  goldChilds lists are filled accordingly.

  The file is closed on return. An empty file yields an empty list,
  repeated blank lines do not create empty configurations, and the last
  sentence is finalised even without a trailing blank line.

  NOTE(review): a gold HEAD of "0" is kept as the string "0", while
  Config.print treats "-1" as the root marker and any other integer as a
  line index -- confirm which encoding downstream code expects.

  `predicted` is passed unchanged to every Config.
  """
  configs = []
  defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
  col2index, index2col = readMCD(defaultMCD)
  current = None  # Config being filled, created on its first kept line
  id2index = {}

  with open(filename, "r") as conlluFile:
    for line in conlluFile:
      line = line.strip()
      if "# global.columns =" in line:
        mcd = line.split('=')[-1].strip()
        col2index, index2col = readMCD(mcd)

      if len(line) == 0:
        # End of sentence: finalise it; the next one starts lazily.
        if current is not None:
          _resolveGoldHeads(current, id2index)
          current = None
          id2index = {}
        continue
      if line[0] == '#':
        continue

      splited = line.split('\t')
      if '.' in splited[col2index["ID"]]:
        continue

      if current is None:
        current = Config(col2index, index2col, predicted)
        configs.append(current)

      currentIndex = len(current)
      current.addLine(splited)
      id2index[current.getGold(currentIndex, "ID")] = currentIndex

  # The file may end without the blank line that closes the last sentence.
  if current is not None:
    _resolveGoldHeads(current, id2index)

  return configs
################################################################################