# Config.py
from readMCD import readMCD
import sys

################################################################################
class Config :
  """State of a parser over one sentence read from a column-based file.

  Every cell of every line is stored as a two-element list
  ``[gold, predicted]``: ``addLine`` fills the gold slot with the value read
  from the file and leaves the predicted slot as the empty string.

  Attributes:
    lines: list of lines, each a list of ``[gold, predicted]`` cells.
    goldChilds / predChilds: one (initially empty) list per line.
    col2index / index2col: mappings between column names and positions.
    predicted: names of the columns whose *predicted* slot is used as feature.
    wordIndex: index (into ``lines``) of the current word.
    stack: list of line indexes pushed by ``addWordIndexToStack``.
    comments: comment lines written back by ``print`` before the sentence.
  """

  def __init__(self, col2index, index2col) :
    self.lines = []
    self.goldChilds = []
    self.predChilds = []
    self.col2index = col2index
    self.index2col = index2col
    self.predicted = {"HEAD", "DEPREL"}
    self.wordIndex = 0
    self.stack = []
    # Must exist because print() reads it; it used to be missing, which made
    # print() raise AttributeError.
    self.comments = []

  def addLine(self, cols) :
    """Append a line whose gold values are `cols` and predicted values are empty."""
    self.lines.append([[val,""] for val in cols])
    self.goldChilds.append([])
    self.predChilds.append([])

  def _checkAccess(self, lineIndex, colname) :
    """Terminate the program (exit status 1) if the line or the column is unknown."""
    if lineIndex not in range(len(self.lines)) :
      print("Line index %d is out of range (0,%d)"%(lineIndex, len(self.lines)), file=sys.stderr)
      sys.exit(1)
    if colname not in self.col2index :
      print("Unknown colname '%s'"%(colname), file=sys.stderr)
      sys.exit(1)

  def get(self, lineIndex, colname, predicted) :
    """Return the predicted (if `predicted` is true) or gold value of a cell.

    Prints an error on stderr and raises SystemExit when `lineIndex` or
    `colname` is invalid.
    """
    self._checkAccess(lineIndex, colname)
    index = 1 if predicted else 0
    return self.lines[lineIndex][self.col2index[colname]][index]

  def set(self, lineIndex, colname, value, predicted=True) :
    """Store `value` in the predicted (default) or gold slot of a cell.

    Prints an error on stderr and raises SystemExit when `lineIndex` or
    `colname` is invalid.
    """
    self._checkAccess(lineIndex, colname)
    index = 1 if predicted else 0
    self.lines[lineIndex][self.col2index[colname]][index] = value

  def getAsFeature(self, lineIndex, colname) :
    """Return the predicted value for columns in `self.predicted`, else the gold one."""
    return self.get(lineIndex, colname, colname in self.predicted)

  def getGold(self, lineIndex, colname) :
    """Return the gold value of a cell."""
    return self.get(lineIndex, colname, False)

  def addWordIndexToStack(self) :
    """Push the current word index on the stack."""
    self.stack.append(self.wordIndex)

  def popStack(self) :
    """Remove the top element of the stack."""
    self.stack.pop()

  # Move wordIndex by a relative forward movement if possible. Ignore multiwords.
  # Don't go out of bounds, but don't fail either.
  # Return true if movement was completed.
  def moveWordIndex(self, movement) :
    """Advance `wordIndex` by `movement` words, stepping over multiword lines.

    Returns True when the whole movement was performed and False when the
    last line was reached first (wordIndex then stays where it stopped).
    """
    done = 0
    if self.isMultiword(self.wordIndex) :
      self.wordIndex += 1
    while done != movement :
      if self.wordIndex >= len(self.lines) - 1 :
        return False
      self.wordIndex += 1
      if self.isMultiword(self.wordIndex) :
        self.wordIndex += 1
      done += 1
    # The function previously fell through here and returned None.
    return True

  def isMultiword(self, index) :
    """Return True if the ID of line `index` contains a '-' (e.g. "2-3")."""
    return "-" in self.getAsFeature(index, "ID")

  def __len__(self) :
    """Number of lines in the configuration."""
    return len(self.lines)

  def _renderValue(self, lineIndex, colname) :
    """Return the printable form of a cell, as used by print and printForDebug.

    Empty values become "_". HEAD values are stored as line indexes: "-1" is
    written as "0", any other index is replaced by the ID of the line it
    points to.
    """
    value = str(self.getAsFeature(lineIndex, colname))
    if value == "" :
      return "_"
    if colname == "HEAD" :
      if value == "-1" :
        return "0"
      return self.getAsFeature(int(value), "ID")
    return value

  def printForDebug(self, output) :
    """Write the stack and a window of lines around `wordIndex` to `output`.

    Only the ID, FORM, UPOS, HEAD and DEPREL columns are shown, padded so
    that they line up; the current word is flagged with "=>".
    """
    printedCols = ["ID","FORM","UPOS","HEAD","DEPREL"]
    left = 5
    right = 5
    print("stack :",[self.getAsFeature(ind, "ID") for ind in self.stack], file=output)
    rows = []
    for lineIndex in range(self.wordIndex-left, self.wordIndex+right) :
      if lineIndex not in range(len(self.lines)) :
        continue
      row = ["=>" if lineIndex == self.wordIndex else "  "]
      for colIndex in range(len(self.lines[lineIndex])) :
        colname = self.index2col[colIndex]
        if colname in printedCols :
          row.append(self._renderValue(lineIndex, colname))
      rows.append(row)
    formatted = []
    # Guard: with no line in the window there is no rows[0] to size columns from.
    if rows :
      widths = [max(len(row[j]) for row in rows) for j in range(len(rows[0]))]
      for row in rows :
        cells = ["{:{}}".format(cell, widths[j]) for j, cell in enumerate(row)]
        formatted.append(cells[0]+" ".join(cells[1:]))
    print("\n".join(formatted), file=output)

  def print(self, output, header=False) :
    """Write the configuration to `output`, one tab-separated line per word.

    When `header` is true, a "# global.columns" line listing the column names
    is written first. Stored comments, if any, come next, and an empty line
    terminates the sentence.
    """
    if header :
      print("# global.columns = %s"%(" ".join(self.col2index.keys())), file=output)
    if len(self.comments) > 0 :
      print("\n".join(self.comments), file=output)
    for index in range(len(self.lines)) :
      cells = [self._renderValue(index, self.index2col[colIndex])
               for colIndex in range(len(self.lines[index]))]
      print("\t".join(cells), file=output)
    print("", file=output)
################################################################################
  
################################################################################
def _resolveGoldHeads(config, id2index) :
  """Turn the gold HEAD ids of `config` into line indexes and fill goldChilds.

  Lines whose gold HEAD is "_" or "0" are left untouched.
  """
  for index in range(len(config)) :
    head = config.getGold(index, "HEAD")
    if head == "_" or head == "0" :
      continue
    headIndex = id2index[head]
    config.set(index, "HEAD", headIndex, False)
    config.goldChilds[int(headIndex)].append(index)

def readConllu(filename) :
  """Read a CoNLL-U(-plus) file and return one Config per sentence.

  The column layout defaults to the ten standard CoNLL-U columns and is
  replaced whenever a "# global.columns =" line is met. Other comment lines
  and lines whose ID contains a '.' are skipped. At the end of each sentence
  the gold HEAD values are converted from ids to line indexes.

  Returns an empty list for a file that contains no sentence.
  """
  configs = []
  defaultMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
  col2index, index2col = readMCD(defaultMCD)
  currentIndex = 0
  id2index = {}

  # CoNLL-U files are UTF-8; the context manager guarantees the handle is closed.
  with open(filename, "r", encoding="utf-8") as conlluFile :
    for line in conlluFile :
      line = line.strip()
      if "# global.columns =" in line :
        mcd = line.split('=')[-1].strip()
        col2index, index2col = readMCD(mcd)

      if len(line) == 0 :
        # A blank line ends a sentence. Leading or repeated blank lines have
        # nothing to finalize: they used to crash or to leave empty configs.
        if len(configs) == 0 or len(configs[-1]) == 0 :
          continue
        _resolveGoldHeads(configs[-1], id2index)
        configs.append(Config(col2index, index2col))
        currentIndex = 0
        id2index = {}
        continue

      if line.startswith('#') :
        continue

      if len(configs) == 0 :
        configs.append(Config(col2index, index2col))
        currentIndex = 0

      splited = line.split('\t')

      ID = splited[col2index["ID"]]
      if '.' in ID :
        continue

      configs[-1].addLine(splited)
      id2index[ID] = currentIndex
      currentIndex += 1

  if len(configs) > 0 :
    if len(configs[-1]) == 0 :
      # Placeholder created after the last blank line.
      configs.pop()
    else :
      # The file did not end with a blank line: the last sentence still has
      # unresolved gold heads.
      _resolveGoldHeads(configs[-1], id2index)

  return configs
################################################################################