Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • franck.dary/macaon_data
1 result
Show changes
Commits on Source (3)
......@@ -67,35 +67,40 @@ def getLayout(sentence, text) :
ranges = [ranges[i] for i in range(len(ranges)) if i not in multis]
for i in range(len(ranges)) :
if ranges[i][0] != -1 :
continue
start = 0
if i > 0 :
start = ranges[i-1][1]+1
j = i
while ranges[j][0] == -1 :
j += 1
end = ranges[j][0]-1
size = end-start +1
each = size // (j-i)
for k in range(j-i) :
ranges[i+k][0] = start + k*each
ranges[i+k][1] = ranges[i+k][0]+each-1
i = j
for i in range(len(ranges)-1) :
if ranges[i][1] != ranges[i+1][0]-1 :
if ranges[i][1]-ranges[i][0] <= ranges[i+1][1]-ranges[i+1][0] :
ranges[i][1] = ranges[i+1][0]-1
else :
ranges[i+1][0] = ranges[i][1]+1
end = ranges[i][-1]
if end+1 in range(len(text)) and text[end+1] == " " :
ranges[i][-1] += 1
# for i in range(len(ranges)) :
# if ranges[i][0] != -1 :
# continue
# start = 0
# if i > 0 :
# start = ranges[i-1][1]+1
# j = i
# while ranges[j][0] == -1 :
# j += 1
# end = ranges[j][0]-1
# size = end-start +1
# each = size // (j-i)
# for k in range(j-i) :
# ranges[i+k][0] = start + k*each
# ranges[i+k][1] = ranges[i+k][0]+each-1
# i = j
#
# for i in range(len(ranges)-1) :
# if ranges[i][1] != ranges[i+1][0]-1 :
# if ranges[i][1]-ranges[i][0] <= ranges[i+1][1]-ranges[i+1][0] :
# ranges[i][1] = ranges[i+1][0]-1
# else :
# ranges[i+1][0] = ranges[i][1]+1
return sentence, ranges
################################################################################
################################################################################
def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, mask=[None,None], hsep=True) :
def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, mask=[None,None], hsep=True, isCenter=lambda _:False, isColored=lambda _:False, colSizes=None, title=None) :
if mask[0] not in [None, "incr", "seq"] :
print("ERROR : invalid mask '%s'"%mask, file=sys.stderr)
......@@ -113,15 +118,23 @@ def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, ma
partSizes = [-ranges[parts[partId][0]][0]+ranges[parts[partId][-1]][1]+1 for partId in range(len(parts))]
curLine = -1
colsep = "|" if hsep else ""
for partId in range(len(parts)) :
if partId != 0 :
print("\\vspace{7pt}\n")
print("\\begin{tabular}{|l|%s|}"%(colsep.join(["c"]*partSizes[partId])))
colsDef = colsep.join(["c"]*partSizes[partId])
print("\\begin{tabular}{|@{\hskip 4pt}l@{\hskip 3pt}|@{\hskip 3pt}%s|}"%(colsDef))
if title is not None :
print("\multicolumn{%d}{c}{\large %s}\\\\"%(partSizes[partId]+1, title))
print("\cline{1-%d}\n"%(partSizes[partId]+1))
for i in range(len(columns))[::-1] :
curLine += 1
curCol = -1
print("\\texttt{\\textbf{\\footnotesize{%s}}}"%columns[i].lower(), end=" &\n")
for j in parts[partId] :
curCol += 1
if columns[i] == "EOS" :
value = "yes" if j == parts[partId][-1] and partId == len(parts)-1 else "no"
else :
......@@ -150,10 +163,23 @@ def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, ma
values[k] = "\\texttt{%s}"%(values[k])
cellContent = "\\\\".join(values)
tcolsep = colsep if j != parts[partId][-1] else "|"
color = ""
if isColored((curLine, curCol)) :
color = r"\cellcolor{green!15}"
if isCenter((curLine, curCol)) :
color = r"\cellcolor{blue!30}"
if nodes :
print("\multicolumn{%d}{c%s}{\makecell[cc]{\\tabnode{%s}}}"%(ranges[j][1]-ranges[j][0]+1, tcolsep, cellContent), end=" &\n" if j != parts[partId][-1] else "")
if colSizes is not None :
cellSize = colSizes[curCol]
if cellSize > 0 :
cellSize = "%dpt"%(3+cellSize*5.5)
print("\multicolumn{%d}{c%s}{%s\makecell[cb]{\parbox{%s}{%s}}}"%(ranges[j][1]-ranges[j][0]+1, tcolsep, color, cellSize, cellContent), end=" &\n" if j != parts[partId][-1] else "")
else :
print("\multicolumn{%d}{c%s}{%s\makecell[cb]{%s}}"%(ranges[j][1]-ranges[j][0]+1, tcolsep, color, cellContent), end=" &\n" if j != parts[partId][-1] else "")
else :
print("\multicolumn{%d}{c%s}{%s\makecell[cb]{%s}}"%(ranges[j][1]-ranges[j][0]+1, tcolsep, color, cellContent), end=" &\n" if j != parts[partId][-1] else "")
else :
print("\multicolumn{%d}{c%s}{\makecell[cc]{%s}}"%(ranges[j][1]-ranges[j][0]+1, tcolsep, cellContent), end=" &\n" if j != parts[partId][-1] else "")
print("\multicolumn{%d}{c%s}{\makecell[cb{%s}}"%(ranges[j][1]-ranges[j][0]+1, tcolsep, cellContent), end=" &\n" if j != parts[partId][-1] else "")
if nodes and i != 0 :
print("\\\\%s\n"%("[-0.1cm]" if i == 1 else "[%scm]"%(breakSize)))
else :
......@@ -323,35 +349,32 @@ def drawPaths(sentence, ranges, text, columns, hsep, isSeq) :
################################################################################
def drawFeatures(sentence, ranges, text, columns) :
print(r"""\makeatletter
\@ifundefined{tabnode}{%
\newcommand\tabnode[1]{\addtocounter{nodecount}{1} \tikz \node[minimum height=0.5cm] (\arabic{nodecount}) {#1};}%
\newcounter{nodecount}%
}{}
\makeatother
\setcounter{nodecount}{0}""")
print(r"\tikzstyle{every picture}+=[remember picture,baseline]")
print(r"\tikzstyle{every node}+=[inner sep=0pt,anchor=base]")
nbLines = len(sentence[0])
nbCols = len(sentence)
center = (nbLines//2, nbCols//2)
isCenter = lambda lc : lc == center
isColored = lambda lc : lc[0] >= center[0] and lc[1] <= center[1]
colSizes = [max(map(len, elem)) for elem in sentence]
print(r"\begin{figure}")
print("\\tabcolsep=0.10mm")
print(r"\centering")
print("\\setlength{\\tabcolsep}{0.00mm}")
print(r"\resizebox{\textwidth}{!}{")
produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="")
print(r"\quad", end="")
produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="")
isColored = lambda lc : lc[0] >= center[0] and lc[1] <= center[1]
produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="", isCenter=isCenter, isColored=isColored, colSizes=colSizes, title=r"Passé-Bas (\palo)")
print(r"\quad", end="")
produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("incr", center), hsep="")
print("")
drawRectanglePalo(1, nbLines, nbCols)
drawRectangleFulo(nbLines*nbCols+1, nbLines, nbCols)
drawRectanglePahi(2*nbLines*nbCols+1, nbLines, nbCols)
print("}")
isColored = lambda lc : lc[0] >= center[0] and lc[1] <= center[1] or lc[0] > center[0]
produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="", isCenter=isCenter, isColored=isColored, colSizes=colSizes, title=r"Futur-Bas (\fulo)")
print("\n}\n")
print(r"\vspace*{0.25cm}")
print(r"\resizebox{0.5\textwidth}{!}{")
isColored = lambda lc : lc[1] < center[1] or lc[1] == center[1] and lc[0] > center[0]
produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("incr", center), hsep="", isCenter=isCenter, isColored=isColored, colSizes=colSizes, title="Passé-Haut (\pahi)")
print("\n}")
print(r"\caption{Caption.}")
print(r"\label{fig:a}")
......
#! /usr/bin/env python3
# Create a w2v formatted embedding file.
# Each line associates a lowercase word with an embedding whose dimensions are the UD POS tags.
# The input to this script is a combination of lefff lexicon and conllu UD corpora.
# Example: ./lefff2w2v --lefff lefff.fplm --conllu data/UD_French-GSD/*\.conllu
# Example: ./lefff2w2v --lefff lefff.fplm
# Example: ./lefff2w2v --conllu data/UD_French-GSD/*\.conllu
# We can choose to output binary vectors with the option --binary, a threshold at or above which values become 1.
# We can ignore infrequent words in conllu by setting a threshold with --minfreq.
import sys
import argparse
from readMCD import readMCD
# Lookup table translating a lefff part-of-speech category (key) into the
# corresponding lowercase UD UPOS tag (value).
lefffPOS2UD = {
    "adj": "adj",
    "csu": "sconj",
    "que": "sconj",  # Possibly not always a subordinating conjunction.
    "det": "det",
    "pres": "intj",  # No exact UD counterpart: INTJ or X.
    "v": "verb",
    "nc": "noun",
    "cfi": "noun",
    # Prefixes and suffixes have no meaning under the UD tokenization.
    "advPref": "x",
    "adjPref": "x",
    "suffAdj": "x",
    "cln": "pron",
    "ce": "pron",
    "clg": "adp",
    "cll": "pron",
    "ilimp": "pron",
    "cla": "pron",
    "cld": "pron",
    "pro": "pron",
    "caimp": "pron",
    "pri": "adv",
    "prel": "pron",
    "clr": "pron",
    "clar": "pron",
    "cldr": "pron",
    "adv": "adv",
    "advm": "adv",
    "advp": "adv",
    "coo": "cconj",
    "ponctw": "punct",
    "advneg": "adv",
    "clneg": "adv",
    "que_restr": "sconj",
    "np": "propn",
    "poncts": "punct",
    "parento": "punct",
    "epsilon": "punct",
    "parentf": "punct",
    "prep": "adp",
    "auxAvoir": "aux",
    "auxEtre": "aux",
}
if __name__ == "__main__" :
parser = argparse.ArgumentParser()
parser.add_argument("--lefff", type=str,
help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
parser.add_argument("--conllu", nargs="+", type=str,
help="Conllu files to estimate the probability of each POS.")
parser.add_argument("--binary", type=float,
help="A threshold in [0,1] that will separate zeroes from ones.")
parser.add_argument("--minfreq", type=int,
help="A threshold in number of occurrences of words.")
parser.add_argument("--lefffWeight", type=int, default=1,
help="What is the weight, in number of occurrences of the couple (form,POS) in annotated conllu data, that the lefff add ?")
args = parser.parse_args()
if args.lefff is None and args.conllu is None :
print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
exit(1)
# Dict with key=FORM and value= dict associationg pos with number of occ
form2pos = {}
# List of all pos (UD format) present in data
allPos = []
# Associate each form with a counter, only for conllu files
formCount = {}
# Read lefff and populate form2pos with # of occ = 1
if args.lefff is not None :
for line in open(args.lefff, "r") :
splited = line.strip().split("\t")
form = splited[0].lower()
pos = lefffPOS2UD[splited[1]]
# In lefff there might be spaces in forms. W2v format don't allow it. We replace space by dotted circle.
form.replace(" ", "")
if pos not in allPos :
allPos.append(pos)
if form not in form2pos :
form2pos[form] = {}
if pos not in form2pos[form] :
form2pos[form][pos] = args.lefffWeight
# If conllu files are provided, count number of occurences into form2pos
if args.conllu is not None :
if args.conllu is not None :
for filename in args.conllu :
baseMCD = "ID FORM LEMMA POS XPOS FEATS HEAD DEPREL"
conllMCD, conllMCDr = readMCD(baseMCD)
for line in open(filename, "r") :
line = line.strip()
if "global.columns =" in line and line[0] == "#" :
splited = line.split("global.columns =")
conllMCD, conllMCDr = readMCD(splited[-1].strip())
continue
if len(line) == 0 or line[0] == "#" :
continue
splited = line.split("\t")
form = splited[conllMCD["FORM"]].lower()
pos = splited[conllMCD["UPOS"]].lower()
form.replace(" ", "")
if pos not in allPos :
allPos.append(pos)
if form not in form2pos :
form2pos[form] = {}
if pos not in form2pos[form] :
form2pos[form][pos] = 0
form2pos[form][pos] += 1
if form not in formCount :
formCount[form] = 0
formCount[form] += 1
outputLines = []
# Compute probability for each pos and form
for form in form2pos :
if args.minfreq is not None and formCount[form] < args.minfreq :
continue
vec = ["0" for _ in allPos]
totalOccs = 0
for pos in form2pos[form] :
totalOccs += form2pos[form][pos]
for pos in form2pos[form] :
vec[allPos.index(pos)] = form2pos[form][pos] / totalOccs
baseVec = vec.copy()
for pos in form2pos[form] :
if args.binary is not None :
if vec[allPos.index(pos)] >= args.binary :
vec[allPos.index(pos)] = 1
else :
vec[allPos.index(pos)] = 0
if args.binary is not None :
vec[allPos.index(pos)] = "%d"%vec[allPos.index(pos)]
else :
vec[allPos.index(pos)] = "%.2f"%vec[allPos.index(pos)]
if sum(map(float, vec)) == 0 :
print("WARNING: word '%s' gets all 0. Original: '%s'"%(form, " ".join(map(str,baseVec))), file=sys.stderr)
outputLines.append(form+" "+" ".join(vec))
# Print the w2v file
print(len(outputLines), len(allPos))
outputLines.sort()
print("\n".join(outputLines))