From 07738758e7967201e21e9ab2553edff39b7d7b9e Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 30 Jun 2021 17:16:05 +0200
Subject: [PATCH] Added script to convert conllu sentence to latex table

---
 scripts/conllu2latex.py | 119 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100755 scripts/conllu2latex.py

diff --git a/scripts/conllu2latex.py b/scripts/conllu2latex.py
new file mode 100755
index 0000000..e93398c
--- /dev/null
+++ b/scripts/conllu2latex.py
@@ -0,0 +1,119 @@
+#! /usr/bin/env python3
+
+import argparse
+import sys
+from readMCD import readMCD
+
+################################################################################
+if __name__ == "__main__" :
+  # Prints on stdout a LaTeX table aligning the chosen columns of one sentence with its text.
+  parser = argparse.ArgumentParser()
+  parser.add_argument("input", type=str,
+    help="Input conllu file")
+  parser.add_argument("id", type=str,
+    help="sent_id of the target sentence in the conllu file.")
+  parser.add_argument("--tapes", default="ID,FORM,UPOS,LEMMA,HEAD,DEPREL",
+    help="Comma separated list of column names that will be the rows of the table. ID should be the first. FORM should be second.")
+  args = parser.parse_args()
+  # Default column layout, replaced if the file declares '# global.columns ='.
+  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
+  col2index, index2col = readMCD(baseMCD)
+  columns = args.tapes.split(',')
+  sentence = []
+  text = ""
+  reading = False
+  # Stop at the first blank line after the target sentence ; 'with' closes the file.
+  with open(args.input, "r", encoding="utf-8") as conlluFile :
+    for line in conlluFile :
+      line = line.strip()
+      if len(line) == 0 :
+        if reading :
+          break
+        continue
+      if "# global.columns =" in line :
+        col2index, index2col = readMCD(line.split('=', 1)[1].strip())
+        continue
+      # Split on the first '=' only : the value itself may contain '='.
+      if "# text =" in line :
+        text = line.split('=', 1)[1].strip()
+      if "# sent_id =" in line :
+        curSent = line.split('=', 1)[1].strip()
+        if curSent == args.id :
+          reading = True
+      if line[0] == '#' :
+        continue
+      if not reading :
+        continue
+      splited = line.split('\t')
+      sentence.append([splited[col2index[col]] for col in columns])
+
+  if len(sentence) == 0 :
+    sys.exit("No token found for sent_id '%s' in %s"%(args.id, args.input))
+  # ranges[i] = [first, last] character indexes of token i in text, -1 = not aligned yet.
+  ranges = [[-1,-1] for _ in sentence]
+  curIndex = 0
+  toIgnore = 0
+  multis = []
+  for i in range(len(sentence)) :
+    # Rows following a range ID (e.g. '3-4') are skipped here and interpolated below.
+    if toIgnore > 0 :
+      toIgnore -= 1
+      continue
+    if len(sentence[i][0].split('-')) != 1 :
+      multis.append(i)
+      toIgnore = int(sentence[i][0].split('-')[-1])-int(sentence[i][0].split('-')[0])+1
+      continue
+    # Search from curIndex (absolute index) ; a form absent from text stays unaligned.
+    word = sentence[i][1]
+    begin = text.find(word, curIndex)
+    if begin == -1 :
+      continue
+    ranges[i] = [begin, begin+len(word)-1]
+    curIndex = ranges[i][1]+1
+  # Drop the rows with a range ID, only the rows that follow them are displayed.
+  sentence = [row for i, row in enumerate(sentence) if i not in multis]
+  ranges = [span for i, span in enumerate(ranges) if i not in multis]
+  # Share the gap up to the next aligned token (or the end of text) evenly among unaligned ones.
+  for i in range(len(ranges)) :
+    if ranges[i][0] != -1 :
+      continue
+    start = 0
+    if i > 0 :
+      start = ranges[i-1][1]+1
+    j = i
+    while j < len(ranges) and ranges[j][0] == -1 :
+      j += 1
+    end = ranges[j][0]-1 if j < len(ranges) else len(text)-1
+    size = end-start +1
+    each = size // (j-i)
+    for k in range(j-i) :
+      ranges[i+k][0] = start + k*each
+      ranges[i+k][1] = ranges[i+k][0]+each-1
+  # Extend every token up to the next one, so characters between two tokens go to the first.
+  for i in range(len(ranges)-1) :
+    ranges[i][1] = ranges[i+1][0]-1
+  # Debug view of the alignment, on stderr so that stdout only contains the LaTeX code.
+  print([text[r[0]:r[1]+1] for r in ranges], file=sys.stderr)
+  print("\\newcolumntype{x}[0]{>{\\centering\\arraybackslash}m{2.2mm}}")
+  print("\\begin{table}[t]")
+  print("\\centering")
+  print("\\footnotesize")
+  print("\\tabcolsep=0.45mm")
+  # One 'x' column per character of text, after the left column holding the row names.
+  print("\\begin{tabular}{|l|%s|}"%("|".join('x'*len(text))))
+  print("\\hline\n")
+  # Rows are printed in reverse order of --tapes ; the text itself is the last row.
+  for i in reversed(range(len(columns))) :
+    print("\\texttt{\\textsc{%s}}"%columns[i].lower(), end=" &\n")
+    for j in range(len(sentence)) :
+      value = sentence[j][i]
+      if columns[i] not in ["FORM","LEMMA"] :
+        value = "\\textsc{%s}"%(value.lower())
+      print("\\multicolumn{%d}{c|}{%s}"%(ranges[j][1]-ranges[j][0]+1, value), end=" &\n" if j != len(sentence)-1 else "")
+    print("\\\\ \\hline\n")
+
+  print("\\texttt{\\textsc{input}} & %s\\\\ \\hline"%" & ".join(text))
+  print("\\end{tabular}")
+  print("\\end{table}")
+################################################################################
+
-- 
GitLab