Something went wrong on our end
-
Franck Dary authored
conllu2latex.py 4.17 KiB
#! /usr/bin/env python3
import argparse
import sys
from readMCD import readMCD
################################################################################
def metadataValue(line):
    """Return the stripped value of a ``# key = value`` comment line.

    Only the first '=' separates key from value, so a value that itself
    contains '=' (for instance a sentence text) is returned whole.
    """
    return line.partition('=')[2].strip()


def readSentence(lines, sentId, columns, col2index):
    """Extract one sentence and its raw text from CoNLL-U lines.

    Args:
        lines: iterable of lines (an open file works).
        sentId: value of the ``# sent_id =`` comment of the wanted sentence.
        columns: names of the columns to keep, in output order.
        col2index: mapping from column name to field index. It is replaced
            by the result of ``readMCD`` when a ``# global.columns =``
            comment is met.

    Returns:
        ``(sentence, text)``: `sentence` holds one list of column values per
        token line, `text` is the value of the sentence's ``# text =``
        comment. `sentence` is empty when `sentId` was not found; `text` is
        empty when the sentence carries no text comment.
    """
    sentence = []
    text = ""
    reading = False
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line ends the current sentence.
            if reading:
                break
            # Forget the text of a sentence we are not interested in, so it
            # can never be mistaken for the text of the target sentence.
            text = ""
            continue
        if "# global.columns =" in line:
            col2index, _ = readMCD(metadataValue(line))
            continue
        if "# text =" in line:
            text = metadataValue(line)
        if "# sent_id =" in line and metadataValue(line) == sentId:
            reading = True
        if line.startswith('#') or not reading:
            continue
        fields = line.split('\t')
        sentence.append([fields[col2index[col]] for col in columns])
    return sentence, text


def alignTokens(sentence, text):
    """Locate the form of each token inside `text`, left to right.

    Each row of `sentence` must start with the token ID followed by its form
    (the script's ``--tapes`` help asks for ID first and FORM second).

    Returns:
        ``(ranges, multis)``: `ranges[i]` is the inclusive ``[begin, end]``
        character span of token `i` in `text`, or ``[-1, -1]`` when the token
        was not located. `multis` is the set of indices of multiword-token
        lines (IDs of the form ``a-b``).
    """
    ranges = [[-1, -1] for _ in sentence]
    multis = set()
    curIndex = 0   # Absolute position in text where the next search starts.
    toIgnore = 0   # Remaining words covered by the last multiword token.
    for i, token in enumerate(sentence):
        if toIgnore > 0:
            # Words of a multiword token do not appear as such in the text.
            toIgnore -= 1
            continue
        bounds = token[0].split('-')
        if len(bounds) != 1:
            multis.add(i)
            toIgnore = int(bounds[-1]) - int(bounds[0]) + 1
            continue
        word = token[1]
        begin = text.find(word, curIndex)
        if begin == -1:
            # Form absent from the text: leave it unaligned, it will get a
            # share of the surrounding gap like multiword-token words do.
            continue
        ranges[i] = [begin, begin + len(word) - 1]
        # Keep the cursor absolute so that later tokens cannot match an
        # occurrence located before the end of this one.
        curIndex = begin + len(word)
    return ranges, multis


def fillUnaligned(ranges, textLength):
    """Give every run of unaligned tokens an equal share of the gap it sits in.

    `ranges` is modified in place. The gap extends from the end of the
    previous aligned token (or the start of the text) to the start of the
    next aligned token (or the end of the text, `textLength` characters long).
    """
    nbRanges = len(ranges)
    i = 0
    while i < nbRanges:
        if ranges[i][0] != -1:
            i += 1
            continue
        start = ranges[i-1][1] + 1 if i > 0 else 0
        j = i
        while j < nbRanges and ranges[j][0] == -1:
            j += 1
        trailing = j == nbRanges
        end = textLength - 1 if trailing else ranges[j][0] - 1
        each = (end - start + 1) // (j - i)
        for k in range(i, j):
            ranges[k][0] = start + (k - i) * each
            ranges[k][1] = ranges[k][0] + each - 1
        if trailing:
            # No following token can absorb the division remainder, so the
            # last word of the run takes it.
            ranges[j-1][1] = end
        i = j


def closeGaps(ranges):
    """Make consecutive ranges contiguous, in place.

    When two neighbours do not touch, the narrower of the two (the left one
    on ties) is stretched up to the other.
    """
    for left, right in zip(ranges, ranges[1:]):
        if left[1] == right[0] - 1:
            continue
        if left[1] - left[0] <= right[1] - right[0]:
            left[1] = right[0] - 1
        else:
            right[0] = left[1] + 1


def layoutTokens(sentence, text):
    """Compute the character span each word occupies in `text`.

    Returns:
        ``(words, ranges)``: `words` is `sentence` without its multiword-token
        lines and `ranges[i]` is the inclusive ``[begin, end]`` span of
        `words[i]`.
    """
    ranges, multis = alignTokens(sentence, text)
    words = [token for i, token in enumerate(sentence) if i not in multis]
    ranges = [span for i, span in enumerate(ranges) if i not in multis]
    fillUnaligned(ranges, len(text))
    closeGaps(ranges)
    return words, ranges


def splitIntoParts(ranges, maxNbLetters=45):
    """Group word indices into parts spanning about `maxNbLetters` characters.

    A new part is opened when adding a word would push the end of the current
    part more than `maxNbLetters` characters past its start. A part is never
    left empty, even when a single word is wider than the limit.
    """
    parts = [[]]
    first = 0
    for i, (begin, end) in enumerate(ranges):
        if parts[-1] and end - first > maxNbLetters:
            parts.append([])
            first = begin
        parts[-1].append(i)
    return parts


def renderFigure(sentence, ranges, text, columns, maxNbLetters=45):
    """Return the LaTeX source of the figure, as a single string.

    The figure is a table with one tabular column per character of `text`.
    For each part (see `splitIntoParts`) it holds one row per name in
    `columns`, last name first, where each word spans the characters given by
    its entry in `ranges`, followed by an ``input`` row showing the characters
    themselves. Values are lower-cased except in the FORM and LEMMA rows.
    """
    parts = splitIntoParts(ranges, maxNbLetters)
    partSizes = [ranges[part[-1]][1] - ranges[part[0]][0] + 1 for part in parts]

    out = [
        "\\begin{figure}[t]\n",
        "\\centering\n",
        "\\footnotesize\n",
        "\\tabcolsep=0.40mm\n",
        "\\begin{tabular}{|l|%s|}\n" % "|".join(["c"] * max(partSizes)),
    ]
    for partId, (part, size) in enumerate(zip(parts, partSizes)):
        if partId != 0:
            # Empty row separating two consecutive parts.
            out.append("\\multicolumn{0}{c}{}\\\\\n")
        out.append("\\cline{1-%d}\n\n" % (size + 1))
        for colId in reversed(range(len(columns))):
            name = columns[colId]
            out.append("\\texttt{\\textbf{%s}} &\n" % name.lower())
            cells = []
            for j in part:
                value = sentence[j][colId]
                if name not in ("FORM", "LEMMA"):
                    value = value.lower()
                width = ranges[j][1] - ranges[j][0] + 1
                cells.append("\\multicolumn{%d}{c|}{\\texttt{%s}}" % (width, value))
            out.append(" &\n".join(cells))
            out.append("\\\\ \\cline{1-%d}\n\n" % (size + 1))
        letters = text[ranges[part[0]][0]:ranges[part[-1]][1] + 1]
        out.append("\\texttt{\\textbf{input}} & %s\\\\ \\cline{1-%d}\n"
                   % (" & ".join("\\texttt{%s}" % c for c in letters), size + 1))
    out.append("\\end{tabular}\n")
    # \label must follow \caption to refer to the figure number.
    out.append("\\caption{``%s''}\n" % text)
    out.append("\\label{fig:a}\n")
    out.append("\\end{figure}\n")
    return "".join(out)


# Print on stdout a LaTeX figure showing how the columns of one sentence of a
# CoNLL-U file line up with the characters of its raw text.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str,
        help="Input conllu file")
    parser.add_argument("id", type=str,
        help="sent_id of the target sentence in the conllu file.")
    parser.add_argument("--tapes", default="ID,FORM,UPOS,LEMMA,HEAD,DEPREL",
        help="Comma separated list of column names that will be the rows of the table. ID should be the first. FORM should be second.")
    args = parser.parse_args()

    # Column layout used unless the file declares its own global.columns.
    baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
    col2index, _ = readMCD(baseMCD)

    columns = args.tapes.split(',')

    with open(args.input, "r", encoding="utf-8") as conlluFile:
        sentence, text = readSentence(conlluFile, args.id, columns, col2index)

    if not sentence:
        sys.exit("No sentence with sent_id '%s' found in '%s'." % (args.id, args.input))
    if not text:
        sys.exit("Sentence '%s' has no '# text =' comment to align its tokens with." % args.id)

    words, ranges = layoutTokens(sentence, text)
    print(renderFigure(words, ranges, text, columns), end="")
################################################################################