Franck Dary / macaon_data · Compare revisions
ecc1b393f0db116b5fd69ffe4927cb3a13abb3f6 (target) to 6a093c3ee57693fed617f0703f05dd2308959b1c (source)
Changes are shown as if the source revision were being merged into the target revision.
Commits on Source (3)

0574caf5 · Improved conllu2latex (Franck Dary, authored 2 years ago)
5c1b287c · Added script to create word embeddings based on what POS can a word have (Franck Dary, authored 2 years ago)
6a093c3e · Merge branch 'master' of https://gitlab.lis-lab.fr/franck.dary/macaon_data (Franck Dary, authored 2 years ago)
Showing 2 changed files with 231 additions and 47 deletions:
    scripts/conllu2latex.py · 70 additions, 47 deletions
    scripts/lefff2w2v.py · 161 additions, 0 deletions
scripts/conllu2latex.py (view file @ 6a093c3e)
@@ -67,35 +67,40 @@ def getLayout(sentence, text) :
    ranges = [ranges[i] for i in range(len(ranges)) if i not in multis]
    for i in range(len(ranges)) :
        if ranges[i][0] != -1 :
            continue
        start = 0
        if i > 0 :
            start = ranges[i-1][1]+1
        j = i
        while ranges[j][0] == -1 :
            j += 1
        end = ranges[j][0]-1
        size = end-start+1
        each = size // (j-i)
        for k in range(j-i) :
            ranges[i+k][0] = start + k*each
            ranges[i+k][1] = ranges[i+k][0]+each-1
        i = j
    for i in range(len(ranges)-1) :
        if ranges[i][1] != ranges[i+1][0]-1 :
            if ranges[i][1]-ranges[i][0] <= ranges[i+1][1]-ranges[i+1][0] :
                ranges[i][1] = ranges[i+1][0]-1
            else :
                ranges[i+1][0] = ranges[i][1]+1
        end = ranges[i][-1]
        if end+1 in range(len(text)) and text[end+1] == " " :
            ranges[i][-1] += 1
    # for i in range(len(ranges)) :
    #   if ranges[i][0] != -1 :
    #     continue
    #   start = 0
    #   if i > 0 :
    #     start = ranges[i-1][1]+1
    #   j = i
    #   while ranges[j][0] == -1 :
    #     j += 1
    #   end = ranges[j][0]-1
    #   size = end-start +1
    #   each = size // (j-i)
    #   for k in range(j-i) :
    #     ranges[i+k][0] = start + k*each
    #     ranges[i+k][1] = ranges[i+k][0]+each-1
    #   i = j
    #
    # for i in range(len(ranges)-1) :
    #   if ranges[i][1] != ranges[i+1][0]-1 :
    #     if ranges[i][1]-ranges[i][0] <= ranges[i+1][1]-ranges[i+1][0] :
    #       ranges[i][1] = ranges[i+1][0]-1
    #     else :
    #       ranges[i+1][0] = ranges[i][1]+1
    return sentence, ranges
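For readers skimming this hunk, here is a minimal standalone sketch of the gap-filling idea it implements: ranges whose start is still -1 are spread evenly over the character span between their known neighbours. The helper name and the toy input below are editorial illustrations, not code from macaon_data.

def fillUnknownRanges(ranges) :
    # Spread every run of [-1, -1] placeholders evenly between known neighbours,
    # mirroring the first loop of getLayout above.
    i = 0
    while i < len(ranges) :
        if ranges[i][0] != -1 :
            i += 1
            continue
        start = 0 if i == 0 else ranges[i-1][1]+1
        j = i
        while ranges[j][0] == -1 :
            j += 1
        end = ranges[j][0]-1
        each = (end-start+1) // (j-i)
        for k in range(j-i) :
            ranges[i+k][0] = start + k*each
            ranges[i+k][1] = ranges[i+k][0]+each-1
        i = j
    return ranges

# Toy example: two placeholder tokens share character positions 3..8.
print(fillUnknownRanges([[0, 2], [-1, -1], [-1, -1], [9, 11]]))
# -> [[0, 2], [3, 5], [6, 8], [9, 11]]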
################################################################################
################################################################################
def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, mask=[None, None], hsep=True) :
def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, mask=[None, None], hsep=True, isCenter=lambda _ : False, isColored=lambda _ : False, colSizes=None, title=None) :
    if mask[0] not in [None, "incr", "seq"] :
        print("ERROR : invalid mask '%s'" % mask, file=sys.stderr)
...
@@ -113,15 +118,23 @@ def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, ma
    partSizes = [-ranges[parts[partId][0]][0]+ranges[parts[partId][-1]][1]+1 for partId in range(len(parts))]
    curLine = -1
    colsep = "|" if hsep else ""
    for partId in range(len(parts)) :
        if partId != 0 :
            print("\\vspace{7pt}\n")
        print("\\begin{tabular}{|l|%s|}" % (colsep.join(["c"]*partSizes[partId])))
        colsDef = colsep.join(["c"]*partSizes[partId])
        print("\\begin{tabular}{|@{\hskip 4pt}l@{\hskip 3pt}|@{\hskip 3pt}%s|}" % (colsDef))
        if title is not None :
            print("\multicolumn{%d}{c}{\large %s} \\\\" % (partSizes[partId]+1, title))
        print("\cline{1-%d}\n" % (partSizes[partId]+1))
        for i in range(len(columns))[::-1] :
            curLine += 1
            curCol = -1
            print("\\texttt{\\textbf{\\footnotesize{%s}}}" % columns[i].lower(), end="&\n")
            for j in parts[partId] :
                curCol += 1
                if columns[i] == "EOS" :
                    value = "yes" if j == parts[partId][-1] and partId == len(parts)-1 else "no"
                else :
...
@@ -150,10 +163,23 @@ def produceTabular(sentence, ranges, text, columns, nodes, reduce, breakSize, ma
                    values[k] = "\\texttt{%s}" % (values[k])
                cellContent = "\\\\".join(values)
                tcolsep = colsep if j != parts[partId][-1] else "|"
                color = ""
                if isColored((curLine, curCol)) :
                    color = r"\cellcolor{green!15}"
                if isCenter((curLine, curCol)) :
                    color = r"\cellcolor{blue!30}"
                if nodes :
                    print("\multicolumn{%d}{c%s}{\makecell[cc]{\\tabnode{%s}}}" % (ranges[j][1]-ranges[j][0]+1, tcolsep, cellContent), end="&\n" if j != parts[partId][-1] else "")
                    if colSizes is not None :
                        cellSize = colSizes[curCol]
                        if cellSize > 0 :
                            cellSize = "%dpt" % (3+cellSize*5.5)
                        print("\multicolumn{%d}{c%s}{%s\makecell[cb]{\parbox{%s}{%s}}}" % (ranges[j][1]-ranges[j][0]+1, tcolsep, color, cellSize, cellContent), end="&\n" if j != parts[partId][-1] else "")
                    else :
                        print("\multicolumn{%d}{c%s}{%s\makecell[cb]{%s}}" % (ranges[j][1]-ranges[j][0]+1, tcolsep, color, cellContent), end="&\n" if j != parts[partId][-1] else "")
                else :
                    print("\multicolumn{%d}{c%s}{%s\makecell[cb]{%s}}" % (ranges[j][1]-ranges[j][0]+1, tcolsep, color, cellContent), end="&\n" if j != parts[partId][-1] else "")
                else :
                    print("\multicolumn{%d}{c%s}{\makecell[cc]{%s}}" % (ranges[j][1]-ranges[j][0]+1, tcolsep, cellContent), end="&\n" if j != parts[partId][-1] else "")
                    print("\multicolumn{%d}{c%s}{\makecell[cb]{%s}}" % (ranges[j][1]-ranges[j][0]+1, tcolsep, cellContent), end="&\n" if j != parts[partId][-1] else "")
            if nodes and i != 0 :
                print("\\\\%s\n" % ("[-0.1cm]" if i == 1 else "[%scm]" % (breakSize)))
            else :
...
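The format strings in this hunk are easier to read with concrete values. The snippet below is purely illustrative (the cell span, separator, colour and content are assumed sample values, not data from the repository); it shows the kind of LaTeX cell the code emits.

# Illustrative only: one generated table cell, for assumed sample values.
span, tcolsep, color, cellContent = 2, "|", r"\cellcolor{green!15}", r"\texttt{le}\\\texttt{det}"
print("\\multicolumn{%d}{c%s}{%s\\makecell[cb]{%s}}" % (span, tcolsep, color, cellContent))
# -> \multicolumn{2}{c|}{\cellcolor{green!15}\makecell[cb]{\texttt{le}\\\texttt{det}}}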
@@ -323,35 +349,32 @@ def drawPaths(sentence, ranges, text, columns, hsep, isSeq) :
################################################################################
def drawFeatures(sentence, ranges, text, columns) :
    print(r"""
\makeatletter
\@ifundefined{tabnode}{%
\newcommand\tabnode[1]{\addtocounter{nodecount}{1} \tikz \node[minimum height=0.5cm] (\arabic{nodecount}) {#1};}%
\newcounter{nodecount}%
}{}
\makeatother
\setcounter{nodecount}{0}
""")
    print(r"\tikzstyle{every picture}+=[remember picture,baseline]")
    print(r"\tikzstyle{every node}+=[inner sep=0pt,anchor=base]")
    nbLines = len(sentence[0])
    nbCols = len(sentence)
    center = (nbLines//2, nbCols//2)
    isCenter = lambda lc : lc == center
    isColored = lambda lc : lc[0] >= center[0] and lc[1] <= center[1]
    colSizes = [max(map(len, elem)) for elem in sentence]
    print(r"\begin{figure}")
    print("\\tabcolsep=0.10mm")
    print(r"\centering")
    print("\\setlength{\\tabcolsep}{0.00mm}")
    print(r"\resizebox{\textwidth}{!}{")
    produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="")
    print(r"\quad", end="")
    produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="")
    isColored = lambda lc : lc[0] >= center[0] and lc[1] <= center[1]
    produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="", isCenter=isCenter, isColored=isColored, colSizes=colSizes, title=r"Passé-Bas (\palo)")
    print(r"\quad", end="")
    produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("incr", center), hsep="")
    print("")
    drawRectanglePalo(1, nbLines, nbCols)
    drawRectangleFulo(nbLines*nbCols+1, nbLines, nbCols)
    drawRectanglePahi(2*nbLines*nbCols+1, nbLines, nbCols)
    print("}")
    isColored = lambda lc : lc[0] >= center[0] and lc[1] <= center[1] or lc[0] > center[0]
    produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("seq", center), hsep="", isCenter=isCenter, isColored=isColored, colSizes=colSizes, title=r"Futur-Bas (\fulo)")
    print("\n}\n")
    print(r"\vspace*{0.25cm}")
    print(r"\resizebox{0.5\textwidth}{!}{")
    isColored = lambda lc : lc[1] < center[1] or lc[1] == center[1] and lc[0] > center[0]
    produceTabular(sentence, ranges, text, columns, True, True, "0.1", mask=("incr", center), hsep="", isCenter=isCenter, isColored=isColored, colSizes=colSizes, title="Passé-Haut (\pahi)")
    print("\n}")
    print(r"\caption{Caption.}")
    print(r"\label{fig:a}")
...
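A quick way to see which cells the isCenter and isColored predicates from this hunk select is to evaluate them over a small grid. The grid size below is an assumption made for illustration; the lambdas are the ones from the code above.

# Editorial illustration: mark the cells selected by the predicates above.
nbLines, nbCols = 5, 7                      # assumed toy grid, not repository data
center = (nbLines//2, nbCols//2)
isCenter = lambda lc : lc == center
isColored = lambda lc : lc[0] >= center[0] and lc[1] <= center[1]
for line in range(nbLines) :
    # 'C' marks the center cell, 'x' the colored cells, '.' the rest
    print("".join("C" if isCenter((line, col)) else "x" if isColored((line, col)) else "." for col in range(nbCols)))
# Output:
# .......
# .......
# xxxC...
# xxxx...
# xxxx...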
scripts/lefff2w2v.py (new file, mode 100755, view file @ 6a093c3e)
#! /usr/bin/env python3
# Create a w2v formatted embedding file.
# Each line associates a lowercase word with an embedding whose dimensions are the UD POS tags.
# The input to this script is a combination of the lefff lexicon and conllu UD corpora.
# Example: ./lefff2w2v --lefff lefff.fplm --conllu data/UD_French-GSD/*\.conllu
# Example: ./lefff2w2v --lefff lefff.fplm
# Example: ./lefff2w2v --conllu data/UD_French-GSD/*\.conllu
# We can choose to output binary vectors with the option --binary, which is a threshold above which values become 1.
# We can ignore infrequent words in conllu by setting a threshold with --minfreq.
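To make the header comment above concrete, here is a purely illustrative sketch of the kind of line the script writes: one line per lowercase form, one dimension per UD POS, each value being the relative frequency of that POS for the form. The POS inventory, the form and the counts are toy values, not data from lefff or UD_French-GSD.

allPos = ["det", "pron", "noun"]           # assumed toy POS inventory
counts = {"det": 9, "pron": 1}             # assumed toy counts for the form "la"
total = sum(counts.values())
vec = ["%.2f" % (counts.get(pos, 0) / total) for pos in allPos]
print("la " + " ".join(vec))               # -> la 0.90 0.10 0.00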
import sys
import argparse
from readMCD import readMCD

# Convert lefff part of speech into UD UPOS.
lefffPOS2UD = {
    "adj" : "adj",
    "csu" : "sconj",
    "que" : "sconj", # Not only ?
    "det" : "det",
    "pres" : "intj", # Nothing match ? INTJ or X
    "v" : "verb",
    "nc" : "noun",
    "cfi" : "noun",
    "advPref" : "x", # No meaning with UD tokenization
    "adjPref" : "x", # same
    "suffAdj" : "x", # same
    "cln" : "pron",
    "ce" : "pron",
    "clg" : "adp",
    "cll" : "pron",
    "ilimp" : "pron",
    "cla" : "pron",
    "cld" : "pron",
    "pro" : "pron",
    "caimp" : "pron",
    "pri" : "adv",
    "prel" : "pron",
    "clr" : "pron",
    "clar" : "pron",
    "cldr" : "pron",
    "adv" : "adv",
    "advm" : "adv",
    "advp" : "adv",
    "coo" : "cconj",
    "ponctw" : "punct",
    "advneg" : "adv",
    "clneg" : "adv",
    "que_restr" : "sconj",
    "np" : "propn",
    "poncts" : "punct",
    "parento" : "punct",
    "epsilon" : "punct",
    "parentf" : "punct",
    "prep" : "adp",
    "auxAvoir" : "aux",
    "auxEtre" : "aux",
}
if __name__ == "__main__" :
    parser = argparse.ArgumentParser()
    parser.add_argument("--lefff", type=str, help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
    parser.add_argument("--conllu", nargs="+", type=str, help="Conllu files to estimate the probability of each POS.")
    parser.add_argument("--binary", type=float, help="A threshold in [0,1] that will separate zeroes from ones.")
    parser.add_argument("--minfreq", type=int, help="A threshold in number of occurrences of words.")
    parser.add_argument("--lefffWeight", type=int, default=1, help="The weight, in number of occurrences of the pair (form, POS) in annotated conllu data, that the lefff adds.")
    args = parser.parse_args()

    if args.lefff is None and args.conllu is None :
        print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
        exit(1)
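As a concrete reading of the --binary option, the snippet below (toy probabilities and an assumed threshold, not repository data) shows the thresholding that the output loop further down applies: values at or above the threshold become 1, the rest become 0.

binary = 0.3                               # assumed value passed as --binary 0.3
probs = [0.90, 0.10, 0.00]                 # toy POS probabilities for one form
print([1 if p >= binary else 0 for p in probs])   # -> [1, 0, 0]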
    # Dict with key=FORM and value=dict associating each POS with its number of occurrences
    form2pos = {}
    # List of all POS (UD format) present in the data
    allPos = []
    # Associate each form with a counter, only for conllu files
    formCount = {}

    # Read lefff and populate form2pos (each entry gets args.lefffWeight occurrences, 1 by default)
    if args.lefff is not None :
        for line in open(args.lefff, "r") :
            splited = line.strip().split("\t")
            form = splited[0].lower()
            pos = lefffPOS2UD[splited[1]]
            # In lefff there might be spaces in forms. The w2v format doesn't allow them, so replace spaces by a dotted circle.
            form = form.replace(" ", "◌")
            if pos not in allPos :
                allPos.append(pos)
            if form not in form2pos :
                form2pos[form] = {}
            if pos not in form2pos[form] :
                form2pos[form][pos] = args.lefffWeight

    # If conllu files are provided, count the number of occurrences into form2pos
    if args.conllu is not None :
        for filename in args.conllu :
            baseMCD = "ID FORM LEMMA POS XPOS FEATS HEAD DEPREL"
            conllMCD, conllMCDr = readMCD(baseMCD)
            for line in open(filename, "r") :
                line = line.strip()
                if "global.columns =" in line and line[0] == "#" :
                    splited = line.split("global.columns =")
                    conllMCD, conllMCDr = readMCD(splited[-1].strip())
                    continue
                if len(line) == 0 or line[0] == "#" :
                    continue
                splited = line.split("\t")
                form = splited[conllMCD["FORM"]].lower()
                pos = splited[conllMCD["UPOS"]].lower()
                form = form.replace(" ", "◌")
                if pos not in allPos :
                    allPos.append(pos)
                if form not in form2pos :
                    form2pos[form] = {}
                if pos not in form2pos[form] :
                    form2pos[form][pos] = 0
                form2pos[form][pos] += 1
                if form not in formCount :
                    formCount[form] = 0
                formCount[form] += 1

    outputLines = []
    # Compute the probability of each POS for each form
    for form in form2pos :
        if args.minfreq is not None and formCount[form] < args.minfreq :
            continue
        vec = ["0" for _ in allPos]
        totalOccs = 0
        for pos in form2pos[form] :
            totalOccs += form2pos[form][pos]
        for pos in form2pos[form] :
            vec[allPos.index(pos)] = form2pos[form][pos] / totalOccs
        baseVec = vec.copy()
        for pos in form2pos[form] :
            if args.binary is not None :
                if vec[allPos.index(pos)] >= args.binary :
                    vec[allPos.index(pos)] = 1
                else :
                    vec[allPos.index(pos)] = 0
            if args.binary is not None :
                vec[allPos.index(pos)] = "%d" % vec[allPos.index(pos)]
            else :
                vec[allPos.index(pos)] = "%.2f" % vec[allPos.index(pos)]
        if sum(map(float, vec)) == 0 :
            print("WARNING: word '%s' gets all 0. Original: '%s'" % (form, " ".join(map(str, baseVec))), file=sys.stderr)
        outputLines.append(form + " " + " ".join(vec))

    # Print the w2v file
    print(len(outputLines), len(allPos))
    outputLines.sort()
    print("\n".join(outputLines))
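The script writes the embedding table to stdout, starting with the usual word2vec text header (vocabulary size and dimensionality). As a hedged usage sketch, assuming gensim is installed and the output was redirected to a file named pos.w2v (both assumptions, not part of the repository):

from gensim.models import KeyedVectors

# Load the text-format file produced by, e.g., ./lefff2w2v.py --lefff lefff.fplm > pos.w2v
kv = KeyedVectors.load_word2vec_format("pos.w2v", binary=False)
print(kv["le"])   # POS-probability vector of the form "le", assuming it is in the vocabulary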