#! /usr/bin/env python3
"""Print a LaTeX figure aligning the annotations of one CoNLL-U sentence
with the characters of its raw text.

The sentence is selected by its ``sent_id``. Each requested column ("tape")
becomes one row of a ``tabular`` whose cells span the characters of the
corresponding word; the last row of every table part shows the raw text, one
character per cell.
"""

import argparse
import sys
from readMCD import readMCD

################################################################################
# Column layout assumed until a "# global.columns =" comment overrides it.
BASE_MCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"

# Maximum number of text characters covered by one part (one table block).
MAX_NB_LETTERS = 45
################################################################################


################################################################################
def _readSentence(path, sentId, columns):
    """Extract the target sentence from a CoNLL-U file.

    Args:
        path: path of the CoNLL-U file.
        sentId: value of the ``# sent_id =`` comment of the wanted sentence.
        columns: names of the columns to keep, in output order.

    Returns:
        A pair ``(sentence, text)``. ``sentence`` is a list with one entry per
        token line, each entry being the list of requested column values.
        ``text`` is the value of the ``# text =`` comment of that sentence.
        ``sentence`` is empty when ``sentId`` does not occur in the file.
    """
    col2index, _ = readMCD(BASE_MCD)
    sentence = []
    text = ""
    reading = False

    with open(path, "r", encoding="utf-8") as conllu:
        for line in conllu:
            line = line.strip()
            if not line:
                # A blank line closes the current sentence.
                if reading:
                    break
                # Forget the text of a sentence that was not the target, so it
                # cannot leak into a later sentence lacking a "# text" line.
                text = ""
                continue
            if line[0] == '#':
                # Split on the first '=' only: the value itself may contain
                # '=' characters (typically in "# text = ...").
                if "# global.columns =" in line:
                    col2index, _ = readMCD(line.split('=', 1)[1].strip())
                elif "# text =" in line:
                    text = line.split('=', 1)[1].strip()
                elif "# sent_id =" in line:
                    if line.split('=', 1)[1].strip() == sentId:
                        reading = True
                continue
            if not reading:
                continue
            splited = line.split('\t')
            sentence.append([splited[col2index[col]] for col in columns])

    return sentence, text
################################################################################


################################################################################
def _alignTokens(sentence, text):
    """Locate each word of the sentence inside the raw text.

    The first value of every token is expected to be its ID and the second
    its FORM (see the help of ``--tapes``).

    Args:
        sentence: token rows as returned by ``_readSentence``.
        text: raw text of the sentence.

    Returns:
        A pair ``(ranges, multis)``. ``ranges[i]`` is the inclusive
        ``[first, last]`` character span of token ``i`` in ``text``, or
        ``[-1, -1]`` when unknown. Spans are unknown for multiword range
        lines ("3-4"), for the words such a line covers, and for words that
        cannot be found in the remaining text. ``multis`` is the set of
        indexes of the multiword range lines.
    """
    ranges = [[-1, -1] for _ in sentence]
    multis = set()
    curIndex = 0  # absolute position in text where the next search starts
    toIgnore = 0  # number of upcoming words covered by a multiword token

    for i, token in enumerate(sentence):
        if toIgnore > 0:
            toIgnore -= 1
            continue
        bounds = token[0].split('-')
        if len(bounds) != 1:
            multis.add(i)
            toIgnore = int(bounds[-1]) - int(bounds[0]) + 1
            continue
        word = token[1]
        begin = text.find(word, curIndex)
        if begin == -1:
            # Leave the span unknown; it is interpolated afterwards.
            continue
        ranges[i] = [begin, begin + len(word) - 1]
        # Advance with an absolute offset so the search never goes backwards.
        curIndex = begin + len(word)

    return ranges, multis
################################################################################


################################################################################
def _fillUnknownRanges(ranges, textLength):
    """Give a span to every token whose span is still ``[-1, -1]``.

    Each run of consecutive unknown tokens evenly shares the characters lying
    between the end of the previous known token (or the start of the text) and
    the start of the next known token (or the end of the text).

    Args:
        ranges: list of ``[first, last]`` spans, modified in place.
        textLength: number of characters of the raw text.
    """
    i = 0
    while i < len(ranges):
        if ranges[i][0] != -1:
            i += 1
            continue
        start = ranges[i - 1][1] + 1 if i > 0 else 0
        j = i
        while j < len(ranges) and ranges[j][0] == -1:
            j += 1
        # When the run reaches the end of the sentence there is no following
        # known token: the run extends up to the last character of the text.
        end = ranges[j][0] - 1 if j < len(ranges) else textLength - 1
        each = (end - start + 1) // (j - i)
        for k in range(j - i):
            ranges[i + k][0] = start + k * each
            ranges[i + k][1] = ranges[i + k][0] + each - 1
        i = j
################################################################################


################################################################################
def _closeGaps(ranges):
    """Make consecutive spans contiguous.

    Whenever two neighbouring spans do not touch, the narrower one (the left
    one in case of a tie) is stretched up to the other.

    Args:
        ranges: list of ``[first, last]`` spans, modified in place.
    """
    for cur, nxt in zip(ranges, ranges[1:]):
        if cur[1] == nxt[0] - 1:
            continue
        if cur[1] - cur[0] <= nxt[1] - nxt[0]:
            cur[1] = nxt[0] - 1
        else:
            nxt[0] = cur[1] + 1
################################################################################


################################################################################
def _splitParts(ranges, maxNbLetters=MAX_NB_LETTERS):
    """Group token indexes into parts that each fit in one table block.

    A new part starts when adding a token would make the current part cover
    more than ``maxNbLetters`` characters.

    Args:
        ranges: list of ``[first, last]`` spans of the tokens.
        maxNbLetters: maximal width of a part, in characters.

    Returns:
        A list of parts, each part being a non-empty list of token indexes.
    """
    parts = [[]]
    first = 0
    for i, (begin, end) in enumerate(ranges):
        # Never close an empty part: a single very long token still needs to
        # be placed somewhere.
        if parts[-1] and end - first > maxNbLetters:
            parts.append([])
            first = begin
        parts[-1].append(i)
    return parts
################################################################################


################################################################################
def _printFigure(columns, sentence, text, ranges, parts):
    """Print the LaTeX figure on the standard output.

    Args:
        columns: names of the columns, one table row each (printed in
            reverse order, above the row holding the raw text).
        sentence: token rows, without the multiword range lines.
        text: raw text of the sentence.
        ranges: ``[first, last]`` character span of each token.
        parts: lists of token indexes, one list per table block.
    """
    partSizes = [ranges[part[-1]][1] - ranges[part[0]][0] + 1 for part in parts]

    print(r"\begin{figure}[t]")
    print(r"\centering")
    print(r"\footnotesize")
    print(r"\tabcolsep=0.40mm")
    print(r"\begin{tabular}{|l|%s|}" % "|".join(["c"] * max(partSizes)))

    for partId, (part, partSize) in enumerate(zip(parts, partSizes)):
        if partId != 0:
            print(r"\multicolumn{0}{c}{}\\")
        print(r"\cline{1-%d}" % (partSize + 1) + "\n")

        for i in reversed(range(len(columns))):
            print(r"\texttt{\textbf{%s}}" % columns[i].lower(), end=" &\n")
            for j in part:
                value = sentence[j][i]
                # Word forms and lemmas keep their case, labels are lowercased.
                if columns[i] not in ("FORM", "LEMMA"):
                    value = value.lower()
                width = ranges[j][1] - ranges[j][0] + 1
                print(r"\multicolumn{%d}{c|}{\texttt{%s}}" % (width, value),
                      end="" if j == part[-1] else " &\n")
            print(r"\\ \cline{1-%d}" % (partSize + 1) + "\n")

        letters = text[ranges[part[0]][0]:ranges[part[-1]][1] + 1]
        cells = " & ".join(r"\texttt{%s}" % c for c in letters)
        print(r"\texttt{\textbf{input}} & %s\\ \cline{1-%d}" % (cells, partSize + 1))

    print(r"\end{tabular}")
    # \label must come after \caption, otherwise \ref does not resolve to the
    # figure number.
    print(r"\caption{``%s''}" % text)
    print(r"\label{fig:a}")
    print(r"\end{figure}")
################################################################################


################################################################################
def main():
    """Parse the command line and print the figure of the requested sentence."""
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str,
        help="Input conllu file")
    parser.add_argument("id", type=str,
        help="sent_id of the target sentence in the conllu file.")
    parser.add_argument("--tapes", default="ID,FORM,UPOS,LEMMA,HEAD,DEPREL",
        help="Comma separated list of column names that will be the rows of the table. ID should be the first. FORM should be second.")
    args = parser.parse_args()

    columns = args.tapes.split(',')

    sentence, text = _readSentence(args.input, args.id, columns)
    if not sentence:
        sys.exit("No sentence with sent_id '%s' found in '%s'." % (args.id, args.input))
    if not text:
        sys.exit("Sentence '%s' has no '# text =' comment." % args.id)

    ranges, multis = _alignTokens(sentence, text)

    # Multiword range lines are not displayed, only the words they cover.
    sentence = [token for i, token in enumerate(sentence) if i not in multis]
    ranges = [span for i, span in enumerate(ranges) if i not in multis]

    _fillUnknownRanges(ranges, len(text))
    _closeGaps(ranges)
    parts = _splitParts(ranges)

    _printFigure(columns, sentence, text, ranges, parts)
################################################################################


################################################################################
if __name__ == "__main__":
    main()
################################################################################