#! /usr/bin/env python3
"""Render one sentence of a CoNLL-U file as a LaTeX table.

The table has one narrow LaTeX column per character of the sentence text.
Each requested CoNLL-U column ("tape") becomes a table row in which every
word is a ``\\multicolumn`` cell spanning the characters that word covers.
The last row spells out the raw sentence text, one character per cell.

Usage: conllu2latex.py INPUT SENT_ID [--tapes ID,FORM,...]
"""
# Origin: scripts/conllu2latex.py, from the patch "Added script to convert
# conllu sentence to latex table" (Franck Dary, 2021-06-30).

import argparse
import sys

# Column layout assumed until a "# global.columns =" comment overrides it.
BASE_MCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

# Characters that LaTeX treats specially in text mode, with their escapes.
LATEX_SPECIALS = {
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
}


################################################################################
def latex_escape(value):
    """Return *value* with LaTeX special characters escaped."""
    return "".join(LATEX_SPECIALS.get(char, char) for char in value)
################################################################################


################################################################################
def comment_value(line):
    """Return the value of a ``# key = value`` comment line.

    Only the first ``=`` separates key and value, so a value that itself
    contains ``=`` (common in ``# text``) is returned whole.
    """
    return line.split("=", 1)[1].strip()
################################################################################


################################################################################
def read_sentence(lines, sent_id, columns, col2index, read_mcd):
    """Extract the sentence whose ``sent_id`` is *sent_id* from *lines*.

    Args:
        lines: iterable of CoNLL-U lines (an open file works).
        sent_id: identifier of the wanted sentence.
        columns: names of the columns to keep, in output order.
        col2index: mapping from column name to field position.
        read_mcd: callable invoked with the column list of a
            ``# global.columns =`` comment; the first element of its result
            replaces *col2index*.

    Returns:
        ``(sentence, text)`` where *sentence* is a list of rows (one list of
        the requested fields per token line) and *text* is the value of the
        sentence's ``# text`` comment. Both are empty when nothing was found.
    """
    sentence = []
    text = ""
    reading = False
    for line in lines:
        line = line.strip()
        if not line:
            if reading:
                break
            # A blank line ends a sentence: forget its text so that it is
            # never attributed to a later sentence lacking a "# text" comment.
            text = ""
            continue
        if line.startswith("# global.columns ="):
            col2index, _ = read_mcd(comment_value(line))
            continue
        if line.startswith("# text ="):
            text = comment_value(line)
        if line.startswith("# sent_id =") and comment_value(line) == sent_id:
            reading = True
        if line[0] == "#" or not reading:
            continue
        fields = line.split("\t")
        sentence.append([fields[col2index[col]] for col in columns])
    return sentence, text
################################################################################


################################################################################
def align_tokens(sentence, text):
    """Map every word of *sentence* to a range of characters of *text*.

    Each row must hold the token ID in position 0 and its form in position 1.
    Rows describing a multiword token (ID of the form ``a-b``) are dropped;
    the words they are made of, and any word whose form cannot be found in
    *text*, share equally the characters left between their resolved
    neighbours. Finally each range is stretched up to the start of the next
    one, so that the ranges tile the text without gaps.

    Returns:
        ``(rows, spans)``: the rows without multiword tokens, and for each of
        them an inclusive ``[first, last]`` character range.
    """
    spans = [[-1, -1] for _ in sentence]
    multiword_rows = set()
    cursor = 0  # absolute index in text from which the next form is searched
    to_skip = 0
    for index, row in enumerate(sentence):
        if to_skip > 0:
            to_skip -= 1
            continue
        id_bounds = row[0].split("-")
        form = row[1]
        begin = text.find(form, cursor)
        if len(id_bounds) != 1:
            # Multiword token: its parts are the next (b - a + 1) rows and do
            # not appear verbatim in the text, so they stay unresolved.
            multiword_rows.add(index)
            to_skip = int(id_bounds[-1]) - int(id_bounds[0]) + 1
            if begin != -1:
                # Move past the surface form so that the following word
                # cannot be matched inside it.
                cursor = begin + len(form)
            continue
        if begin == -1:
            # Form absent from the text: leave it to the interpolation below.
            continue
        spans[index] = [begin, begin + len(form) - 1]
        cursor = begin + len(form)

    rows = [row for i, row in enumerate(sentence) if i not in multiword_rows]
    spans = [span for i, span in enumerate(spans) if i not in multiword_rows]

    # Give each run of unresolved words an equal share of the characters
    # lying between the resolved words around it.
    count = len(spans)
    index = 0
    while index < count:
        if spans[index][0] != -1:
            index += 1
            continue
        start = spans[index - 1][1] + 1 if index > 0 else 0
        stop = index
        while stop < count and spans[stop][0] == -1:
            stop += 1
        # A run reaching the end of the sentence extends to the end of text.
        end = spans[stop][0] - 1 if stop < count else len(text) - 1
        share = (end - start + 1) // (stop - index)
        for offset in range(stop - index):
            first = start + offset * share
            spans[index + offset] = [first, first + share - 1]
        if stop == count:
            # Nothing follows to absorb the division remainder.
            spans[-1][1] = end
        index = stop

    # Stretch every range up to the next one (this absorbs the whitespace).
    for current, following in zip(spans, spans[1:]):
        current[1] = following[0] - 1

    return rows, spans
################################################################################


################################################################################
def render_table(columns, sentence, spans, text):
    """Return the LaTeX source of the table.

    Args:
        columns: column names, one table row each (printed last to first).
        sentence: rows of field values, aligned with *columns*.
        spans: inclusive ``[first, last]`` character range of each row.
        text: raw sentence text, printed one character per cell.
    """
    lines = [
        r"\newcolumntype{x}[0]{>{\centering\arraybackslash}m{2.2mm}}",
        r"\begin{table}[t]",
        r"\centering",
        r"\footnotesize",
        r"\tabcolsep=0.45mm",
        r"\begin{tabular}{|l|%s|}" % "|".join("x" * len(text)),
        r"\hline",
        "",
    ]
    for col_index in reversed(range(len(columns))):
        name = columns[col_index]
        lines.append(r"\texttt{\textsc{%s}} &" % latex_escape(name.lower()))
        cells = []
        for row, (first, last) in zip(sentence, spans):
            if name in ("FORM", "LEMMA"):
                value = latex_escape(row[col_index])
            else:
                value = r"\textsc{%s}" % latex_escape(row[col_index].lower())
            cells.append(r"\multicolumn{%d}{c|}{%s}" % (last - first + 1, value))
        lines.append(" &\n".join(cells) + r"\\ \hline")
        lines.append("")
    characters = " & ".join(latex_escape(char) for char in text)
    lines.append(r"\texttt{\textsc{input}} & %s\\ \hline" % characters)
    lines.append(r"\end{tabular}")
    lines.append(r"\end{table}")
    return "\n".join(lines)
################################################################################


################################################################################
def main():
    """Parse the command line and print the LaTeX table on stdout."""
    # Project-local module, imported here so that the helpers above remain
    # importable on their own. It is unpacked as (col2index, index2col).
    from readMCD import readMCD

    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str,
                        help="Input conllu file")
    parser.add_argument("id", type=str,
                        help="sent_id of the target sentence in the conllu file.")
    parser.add_argument("--tapes", default="ID,FORM,UPOS,LEMMA,HEAD,DEPREL",
                        help="Comma separated list of column names that will be the rows of the table. ID should be the first. FORM should be second.")
    args = parser.parse_args()

    col2index, _ = readMCD(BASE_MCD)
    columns = args.tapes.split(",")

    # CoNLL-U files are UTF-8; do not depend on the platform default.
    with open(args.input, "r", encoding="utf-8") as stream:
        sentence, text = read_sentence(stream, args.id, columns, col2index, readMCD)

    if not sentence:
        sys.exit("error: no sentence with sent_id '%s' in %s" % (args.id, args.input))
    if not text:
        sys.exit("error: sentence '%s' has no '# text =' comment" % args.id)

    sentence, spans = align_tokens(sentence, text)

    # Alignment trace, kept off stdout so it does not end up in the LaTeX.
    print([text[first:last + 1] for first, last in spans], file=sys.stderr)

    print(render_table(columns, sentence, spans, text))
################################################################################


################################################################################
if __name__ == "__main__":
    main()
################################################################################