From 29b7b6c4674d0140d6e6bca3f8be910f5464c951 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Fri, 20 May 2022 15:12:51 +0200
Subject: [PATCH] Added script to append column containing lexicon pos
 information into conllu file

---
 scripts/addLefff2Conllu.py | 171 +++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100755 scripts/addLefff2Conllu.py

diff --git a/scripts/addLefff2Conllu.py b/scripts/addLefff2Conllu.py
new file mode 100755
index 0000000..6eb82f2
--- /dev/null
+++ b/scripts/addLefff2Conllu.py
@@ -0,0 +1,171 @@
+#! /usr/bin/env python3
+# Take as input the lefff lexicon and conllu files.
+# For each word form it will compute the possible POS.
+# Then it will add a column encoding these possible POS to output conllu files.
+
+import sys
+import argparse
+from readMCD import readMCD
+
+# List of UD POS tags, lowercased (see https://universaldependencies.org/u/pos/index.html)
+allPos = ["adj", "adp", "adv", "aux", "cconj", "det", "intj", "noun", "num", "part", "pron", "propn", "punct", "sconj", "sym", "verb", "x"]
+
+# Convert a lefff part of speech tag into a lowercased UD UPOS tag (every value is in allPos).
+lefffPOS2UD = {
+  "adj" : "adj",
+  "csu" : "sconj",
+  "que" : "sconj", # Approximation: "que" is probably not only a SCONJ - TODO confirm
+  "det" : "det",
+  "pres" : "intj", # No exact UD equivalent found: INTJ chosen, X is the other candidate
+  "v" : "verb",
+  "nc" : "noun",
+  "cfi" : "noun",
+  "advPref" : "x", # No meaning with UD tokenization
+  "adjPref" : "x", # No meaning with UD tokenization
+  "suffAdj" : "x", # No meaning with UD tokenization
+  "cln" : "pron",
+  "ce" : "pron",
+  "clg" : "adp",
+  "cll" : "pron",
+  "ilimp" : "pron",
+  "cla" : "pron",
+  "cld" : "pron",
+  "pro" : "pron",
+  "caimp" : "pron",
+  "pri" : "adv",
+  "prel" : "pron",
+  "clr" : "pron",
+  "clar" : "pron",
+  "cldr" : "pron",
+  "adv" : "adv",
+  "advm" : "adv",
+  "advp" : "adv",
+  "coo" : "cconj",
+  "ponctw" : "punct",
+  "advneg" : "adv",
+  "clneg" : "adv",
+  "que_restr" : "sconj",
+  "np" : "propn",
+  "poncts" : "punct",
+  "parento" : "punct",
+  "epsilon" : "punct",
+  "parentf" : "punct",
+  "prep" : "adp",
+  "auxAvoir" : "aux",
+  "auxEtre" : "aux",
+}
+
+if __name__ == "__main__" :
+  # Command line entry point: build a FORM -> possible POS table from the lefff
+  # lexicon and/or the input conllu files, then write it as an extra column
+  # (named by --colName) into each output conllu file, in place.
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--lefff", type=str,
+    help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
+  parser.add_argument("--conllu", nargs="+", type=str,
+    help="Input conllu files, to find possible POS for each word.")
+  parser.add_argument("--output", nargs="+", type=str,
+    help="Output conllu files. Must be existing conllu files, this script adds a new column in place.")
+  parser.add_argument("--colName", type=str, default="LEXICON",
+    help="Name of the column that will be added by the script. If the column already exists, it will be replaced.")
+
+  args = parser.parse_args()
+
+  if args.lefff is None and args.conllu is None :
+    print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
+    sys.exit(1)
+  if args.output is None :
+    print("ERROR: must provide --output", file=sys.stderr)
+    sys.exit(1)
+
+  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
+  # Forms may contain spaces, which the w2v format does not allow.
+  # Spaces are replaced by a dotted circle (U+25CC), written as an escape so
+  # that the literal cannot be damaged by a wrong file encoding.
+  spaceSubstitute = "\u25cc"
+
+  # Dict with key=FORM and value=dict associating each pos with its number of occurrences
+  form2pos = {}
+
+  # Read lefff and populate form2pos with # of occ = 1
+  if args.lefff is not None :
+    with open(args.lefff, "r", encoding="utf-8") as lefffFile :
+      for line in lefffFile :
+        splited = line.strip().split("\t")
+        # Blank or truncated lines carry no (FORM, POS) pair.
+        if len(splited) < 2 :
+          continue
+        form = splited[0].lower().replace(" ", spaceSubstitute)
+        pos = lefffPOS2UD.get(splited[1])
+        # An unmapped lefff tag used to raise a bare KeyError; report it and go on.
+        if pos is None :
+          print("ERROR: Unknown lefff pos '%s' (check lefffPOS2UD in the script)"%splited[1], file=sys.stderr)
+          continue
+        if pos not in allPos :
+          print("ERROR: Unknown pos '%s' (check allPos in the script)"%pos, file=sys.stderr)
+        form2pos.setdefault(form, {}).setdefault(pos, 1)
+
+  # If conllu files are provided, count number of occurrences into form2pos
+  if args.conllu is not None :
+    for filename in args.conllu :
+      # Default column layout, overridden by a "# global.columns =" line in the file.
+      conllMCD, conllMCDr = readMCD(baseMCD)
+      with open(filename, "r", encoding="utf-8") as conlluFile :
+        for line in conlluFile :
+          line = line.strip()
+          if line.startswith("#") and "global.columns =" in line :
+            splited = line.split("global.columns =")
+            conllMCD, conllMCDr = readMCD(splited[-1].strip())
+            continue
+          if len(line) == 0 or line[0] == "#" :
+            continue
+          splited = line.split("\t")
+          # Skip lines whose ID contains "-" (multi-word token ranges in CoNLL-U).
+          if "-" in splited[conllMCD["ID"]] :
+            continue
+          form = splited[conllMCD["FORM"]].lower().replace(" ", spaceSubstitute)
+          pos = splited[conllMCD["UPOS"]].lower()
+          if pos not in allPos :
+            print("ERROR: Unknown pos '%s' (check allPos in the script)"%pos, file=sys.stderr)
+          posCount = form2pos.setdefault(form, {})
+          posCount[pos] = posCount.get(pos, 0) + 1
+
+
+  # Reshape form2pos into form -> pos set as string (ex. adj|verb), in insertion order
+  form2posStr = {form : "|".join(posCount) for form, posCount in form2pos.items()}
+
+  # Read all output conllu files and rewrite them in place with the new column
+  for filename in args.output :
+    conllMCD, conllMCDr = readMCD(baseMCD)
+    if args.colName not in conllMCD :
+      conllMCD[args.colName] = len(conllMCD)
+      conllMCDr[len(conllMCDr)] = args.colName
+    # newLines[0] is always the header describing the columns actually written.
+    newLines = ["# global.columns = %s"%(" ".join([conllMCDr[i] for i in range(len(conllMCDr))]))]
+    with open(filename, "r", encoding="utf-8") as conlluFile :
+      for line in conlluFile :
+        line = line.strip()
+        if line.startswith("#") and "global.columns =" in line :
+          splited = line.split("global.columns =")
+          conllMCD, conllMCDr = readMCD(splited[-1].strip())
+          if args.colName not in conllMCD :
+            conllMCD[args.colName] = len(conllMCD)
+            conllMCDr[len(conllMCDr)] = args.colName
+          # Refresh the header even when the column already exists, and keep the
+          # lines collected so far instead of discarding them.
+          newLines[0] = "# global.columns = %s"%(" ".join([conllMCDr[i] for i in range(len(conllMCDr))]))
+          continue
+        if len(line) == 0 or line[0] == "#" :
+          newLines.append(line)
+          continue
+        splited = line.split("\t")
+        form = splited[conllMCD["FORM"]].lower().replace(" ", spaceSubstitute)
+        newColIndex = conllMCD[args.colName]
+        # Pad short rows with "_" so that the new column index always exists.
+        splited.extend(["_"] * (newColIndex + 1 - len(splited)))
+        splited[newColIndex] = form2posStr.get(form, "none")
+        newLines.append("\t".join(splited))
+    # The input handle is closed by now, so the file can be rewritten in place.
+    with open(filename, "w", encoding="utf-8") as outputFile :
+      print("\n".join(newLines), file=outputFile)
+
-- 
GitLab