Skip to content
Snippets Groups Projects
conllu2latex.py 4.17 KiB
#! /usr/bin/env python3

import argparse
import sys
from readMCD import readMCD

################################################################################
if __name__ == "__main__" :
  parser = argparse.ArgumentParser()
  parser.add_argument("input", type=str,
    help="Input conllu file")
  parser.add_argument("id", type=str,
    help="sent_id of the target sentence in the conllu file.")
  parser.add_argument("--tapes", default="ID,FORM,UPOS,LEMMA,HEAD,DEPREL",
    help="Comma separated list of column names that will be the rows of the table. ID should be the first. FORM should be second.")

  args = parser.parse_args()

  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
  col2index, index2col = readMCD(baseMCD)

  columns = args.tapes.split(',')

  sentence = []
  text = ""

  reading = False
  for line in open(args.input, "r") :
    line = line.strip()
    if len(line) == 0 :
      if reading :
        break
      continue
    if "# global.columns =" in line :
      col2index, index2col = readMCD(line.split('=')[-1].strip())
      continue
    if "# text =" in line :
      text = line.split('=')[-1].strip()
    if "# sent_id =" in line :
      curSent = line.split('=')[-1].strip()
      if curSent == args.id :
        reading = True
    if line[0] == '#' :
      continue

    if not reading :
      continue

    splited = line.split('\t')
    sentence.append([splited[col2index[col]] for col in columns])

  ranges = [[-1,-1] for _ in sentence]

  curIndex = 0
  toIgnore = 0
  multis = []
  for i in range(len(sentence)) :
    if toIgnore > 0 :
      toIgnore -= 1
      continue
    if len(sentence[i][0].split('-')) != 1 :
      multis.append(i)
      toIgnore = int(sentence[i][0].split('-')[-1])-int(sentence[i][0].split('-')[0])+1
      continue
    word = sentence[i][1]
    begin = text[curIndex:].find(word)
    end = begin + len(word)-1
    ranges[i][0] = curIndex + begin
    ranges[i][1] = curIndex + end
    curIndex = end+1

  sentence = [sentence[i] for i in range(len(sentence)) if i not in multis]
  ranges = [ranges[i] for i in range(len(ranges)) if i not in multis]

  for i in range(len(ranges)) :
    if ranges[i][0] != -1 :
      continue
    start = 0
    if i > 0 :
      start = ranges[i-1][1]+1
    j = i
    while ranges[j][0] == -1 :
      j += 1
    end = ranges[j][0]-1
    size = end-start +1
    each = size // (j-i)
    for k in range(j-i) :
      ranges[i+k][0] = start + k*each
      ranges[i+k][1] = ranges[i+k][0]+each-1
    i = j

  for i in range(len(ranges)-1) :
    if ranges[i][1] != ranges[i+1][0]-1 :
      if ranges[i][1]-ranges[i][0] <= ranges[i+1][1]-ranges[i+1][0] :
        ranges[i][1] = ranges[i+1][0]-1
      else :
        ranges[i+1][0] = ranges[i][1]+1

  maxNbLetters = 45

  parts = [[]]
  first = 0
  for i in range(len(ranges)) :
    if ranges[i][1]-first > maxNbLetters :
      parts.append([])
      first = ranges[i][0]
    parts[-1].append(i)

  partSizes = [-ranges[parts[partId][0]][0]+ranges[parts[partId][-1]][1]+1 for partId in range(len(parts))]

  print("\\begin{figure}[t]")
  print("\centering")
  print("\\footnotesize")
  print("\\tabcolsep=0.40mm")
  print("\\begin{tabular}{|l|%s|}"%("|".join(["c"]*max(partSizes))))
  for partId in range(len(parts)) :
    if partId != 0 :
      print("\multicolumn{0}{c}{}\\\\")
    print("\cline{1-%d}\n"%(partSizes[partId]+1))
    for i in range(len(columns))[::-1] :
      print("\\texttt{\\textbf{%s}}"%columns[i].lower(), end=" &\n")
      for j in parts[partId] :
        value = sentence[j][i]
        if columns[i] not in ["FORM","LEMMA"] :
          value = "\\texttt{%s}"%(value.lower())
        else :
          value = "\\texttt{%s}"%(value)
        print("\multicolumn{%d}{c|}{%s}"%(ranges[j][1]-ranges[j][0]+1, value), end=" &\n" if j != parts[partId][-1] else "")
      print("\\\\ \cline{1-%d}\n"%(partSizes[partId]+1))
  
    print("\\texttt{\\textbf{input}} & %s\\\\ \cline{1-%d}"%(" & ".join(["\\texttt{%s}"%c for c in text[ranges[parts[partId][0]][0]:ranges[parts[partId][-1]][1]+1]]), partSizes[partId]+1))
  print("\end{tabular}")
  print("\label{fig:a}")
  print("\caption{``%s''}"%text)
  print("\end{figure}")
################################################################################