Skip to content
Snippets Groups Projects
Commit 07738758 authored by Franck Dary
Browse files

Added script to convert conllu sentence to latex table

parent 717f2f25
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
import argparse
import sys
from readMCD import readMCD
################################################################################
def commentValue(line):
    """Return the stripped value of a ``# key = value`` comment line.

    Only the first '=' separates the key from the value, so a value that
    itself contains '=' (e.g. a sentence text) is returned whole.
    """
    return line.partition('=')[2].strip()

################################################################################
def readSentence(path, sentId, columns):
    """Extract one sentence from the conllu file located at `path`.

    Parameters:
        path: name of the conllu file to read.
        sentId: value of the ``# sent_id =`` comment of the wanted sentence.
        columns: names of the columns to keep, in the order they are kept.

    Returns a pair (text, rows): `text` is the value of the last
    ``# text =`` comment seen before reading stopped, and `rows` holds, for
    each token line of the target sentence, the values of `columns`.
    `rows` is empty when no sentence has this sent_id.
    """
    # Default column layout, replaced if the file has a global.columns line.
    col2index, _ = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
    rows = []
    text = ""
    reading = False
    # The context manager guarantees the file is closed, even on error.
    with open(path, "r", encoding="utf-8") as stream:
        for line in stream:
            line = line.strip()
            if not line:
                # A blank line ends a sentence: stop once the target is read.
                if reading:
                    break
                continue
            if "# global.columns =" in line:
                col2index, _ = readMCD(commentValue(line))
                continue
            if "# text =" in line:
                text = commentValue(line)
            if "# sent_id =" in line and commentValue(line) == sentId:
                reading = True
            if line[0] == '#':
                continue
            if not reading:
                continue
            fields = line.split('\t')
            rows.append([fields[col2index[col]] for col in columns])
    return text, rows

################################################################################
def alignTokens(text, sentence):
    """Map every word of `sentence` to a range of characters of `text`.

    Parameters:
        text: raw text of the sentence.
        sentence: list of rows, each row starting with the ID and the FORM.

    Returns a pair (sentence, ranges): `sentence` without its multiword
    token rows (IDs of the form "a-b"), and `ranges`, a list of inclusive
    [first, last] character indexes into `text`, one per remaining row.
    """
    ranges = [[-1, -1] for _ in sentence]
    cursor = 0
    toIgnore = 0
    multis = set()
    for i, row in enumerate(sentence):
        if toIgnore > 0:
            toIgnore -= 1
            continue
        bounds = row[0].split('-')
        if len(bounds) != 1:
            # Multiword token: drop its row, and leave the words it spans
            # unresolved, they do not appear as such in the text.
            multis.add(i)
            toIgnore = int(bounds[-1]) - int(bounds[0]) + 1
            continue
        word = row[1]
        begin = text.find(word, cursor)
        if begin == -1:
            # Form absent from the text: resolved by the gap filling below.
            continue
        ranges[i] = [begin, begin + len(word) - 1]
        # Absolute position, so a repeated word matches its next occurrence.
        cursor = begin + len(word)

    sentence = [row for i, row in enumerate(sentence) if i not in multis]
    ranges = [r for i, r in enumerate(ranges) if i not in multis]

    # Each run of unresolved words evenly shares the characters located
    # between the resolved words around it.
    i = 0
    while i < len(ranges):
        if ranges[i][0] != -1:
            i += 1
            continue
        start = ranges[i-1][1] + 1 if i > 0 else 0
        j = i
        while j < len(ranges) and ranges[j][0] == -1:
            j += 1
        trailing = j == len(ranges)
        # A run that ends the sentence extends to the end of the text.
        end = len(text) - 1 if trailing else ranges[j][0] - 1
        each = (end - start + 1) // (j - i)
        for k in range(j - i):
            ranges[i+k][0] = start + k*each
            ranges[i+k][1] = ranges[i+k][0] + each - 1
        if trailing:
            # No following word will absorb the remainder of the division.
            ranges[j-1][1] = end
        i = j

    # Stretch every range up to the next one, so that the separators
    # (whitespace) between two words belong to the first of them.
    for current, following in zip(ranges, ranges[1:]):
        current[1] = following[0] - 1
    return sentence, ranges

################################################################################
def formatTable(text, columns, sentence, ranges):
    """Return the LaTeX code of the table, without a final newline.

    The table has one column per character of `text`. Its rows are the
    `columns` in reverse order, each word spanning as many table columns as
    its entry in `ranges` has characters, followed by a last row holding
    the characters of `text`. Values of columns other than FORM and LEMMA
    are lowercased and typeset in small capitals.
    """
    out = [
        "\\newcolumntype{x}[0]{>{\\centering\\arraybackslash}m{2.2mm}}\n",
        "\\begin{table}[t]\n",
        "\\centering\n",
        "\\footnotesize\n",
        "\\tabcolsep=0.45mm\n",
        "\\begin{tabular}{|l|%s|}\n" % "|".join('x' * len(text)),
        "\\hline\n\n",
    ]
    for i in reversed(range(len(columns))):
        out.append("\\texttt{\\textsc{%s}} &\n" % columns[i].lower())
        cells = []
        for row, (first, last) in zip(sentence, ranges):
            value = row[i]
            if columns[i] not in ("FORM", "LEMMA"):
                value = "\\textsc{%s}" % value.lower()
            cells.append("\\multicolumn{%d}{c|}{%s}" % (last - first + 1, value))
        out.append(" &\n".join(cells))
        out.append("\\\\ \\hline\n\n")
    out.append("\\texttt{\\textsc{input}} & %s\\\\ \\hline\n" % " & ".join(text))
    out.append("\\end{tabular}\n")
    out.append("\\end{table}")
    return "".join(out)

################################################################################
def main():
    """Parse the command line and print the table of the target sentence."""
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str,
        help="Input conllu file")
    parser.add_argument("id", type=str,
        help="sent_id of the target sentence in the conllu file.")
    parser.add_argument("--tapes", default="ID,FORM,UPOS,LEMMA,HEAD,DEPREL",
        help="Comma separated list of column names that will be the rows of the table. ID should be the first. FORM should be second.")
    args = parser.parse_args()

    columns = args.tapes.split(',')
    text, sentence = readSentence(args.input, args.id, columns)
    if not sentence:
        # Fail loudly rather than printing an empty table.
        print("ERROR : no sentence with sent_id '%s' in %s" % (args.id, args.input), file=sys.stderr)
        sys.exit(1)
    sentence, ranges = alignTokens(text, sentence)

    # Portion of the text assigned to each word.
    print([text[r[0]:r[1]+1] for r in ranges])
    print(formatTable(text, columns, sentence, ranges))

################################################################################
if __name__ == "__main__":
    main()
################################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment