Added script to create word embeddings based on what POS can a word have

5c1b287c · Franck Dary · 0574caf5 · 5c1b287c
Commit 5c1b287c authored 3 years ago by Franck Dary
--- a/scripts/lefff2w2v.py
+++ b/scripts/lefff2w2v.py
+#! /usr/bin/env python3
+# Create a w2v formatted embedding file.
+# Each line associate a lowercase word with an embedding whose dimmensions are the UD POS.
+# The input to this script is a combination of lefff lexicon and conllu UD corpora.
+# Example: ./lefff2w2v --lefff lefff.fplm --conllu data/UD_French-GSD/*\.conllu
+# Example: ./lefff2w2v --lefff lefff.fplm
+# Example: ./lefff2w2v --conllu data/UD_French-GSD/*\.conllu
+# We can chose to output binary vector with the option --binary which is a threshold above which values will become 1.
+# We can ignore infrequent words in conllu by setting a threshold with --minfreq.
+import sys
+import argparse
+from readMCD import readMCD
+# Convert lefff part of speech into UD UPOS.
+lefffPOS2UD = {
+  "adj" : "adj",
+  "csu" : "sconj",
+  "que" : "sconj", # Not only ?
+  "det" : "det",
+  "pres" : "intj", # Nothing match ? INTJ or X
+  "v" : "verb",
+  "nc" : "noun",
+  "cfi" : "noun",
+  "advPref" : "x", # No meaning with UD tokenization
+  "adjPref" : "x", # same
+  "suffAdj" : "x", # same
+  "cln" : "pron",
+  "ce" : "pron",
+  "clg" : "adp",
+  "cll" : "pron",
+  "ilimp" : "pron",
+  "cla" : "pron",
+  "cld" : "pron",
+  "pro" : "pron",
+  "caimp" : "pron",
+  "pri" : "adv",
+  "prel" : "pron",
+  "clr" : "pron",
+  "clar" : "pron",
+  "cldr" : "pron",
+  "adv" : "adv",
+  "advm" : "adv",
+  "advp" : "adv",
+  "coo" : "cconj",
+  "ponctw" : "punct",
+  "advneg" : "adv",
+  "clneg" : "adv",
+  "que_restr" : "sconj",
+  "np" : "propn",
+  "poncts" : "punct",
+  "parento" : "punct",
+  "epsilon" : "punct",
+  "parentf" : "punct",
+  "prep" : "adp",
+  "auxAvoir" : "aux",
+  "auxEtre" : "aux",
+}
+if __name__ == "__main__" :
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--lefff", type=str,
+    help="Lefff file in tab separated columns: FORM POS LEMMA MORPHO.")
+  parser.add_argument("--conllu", nargs="+", type=str,
+    help="Conllu files to estimate the probability of each POS.")
+  parser.add_argument("--binary", type=float,
+    help="A threshold in [0,1] that will separate zeroes from ones.")
+  parser.add_argument("--minfreq", type=int,
+    help="A threshold in number of occurrences of words.")
+  parser.add_argument("--lefffWeight", type=int, default=1,
+    help="What is the weight, in number of occurrences of the couple (form,POS) in annotated conllu data, that the lefff add ?")
+  args = parser.parse_args()
+  if args.lefff is None and args.conllu is None :
+    print("ERROR: must provide --lefff and/or --conllu", file=sys.stderr)
+    exit(1)
+  # Dict with key=FORM and value= dict associationg pos with number of occ 
+  form2pos = {}
+  # List of all pos (UD format) present in data
+  allPos = []
+  # Associate each form with a counter, only for conllu files
+  formCount = {}
+  # Read lefff and populate form2pos with # of occ = 1
+  if args.lefff is not None :
+    for line in open(args.lefff, "r") :
+      splited = line.strip().split("\t")
+      form = splited[0].lower()
+      pos = lefffPOS2UD[splited[1]]
+      # In lefff there might be spaces in forms. W2v format don't allow it. We replace space by dotted circle.
+      form.replace(" ", "◌")
+      if pos not in allPos :
+        allPos.append(pos)
+      if form not in form2pos :
+        form2pos[form] = {}
+      if pos not in form2pos[form] :
+        form2pos[form][pos] = args.lefffWeight
+  # If conllu files are provided, count number of occurences into form2pos
+  if args.conllu is not None :
+    if args.conllu is not None :
+      for filename in args.conllu :
+        baseMCD = "ID FORM LEMMA POS XPOS FEATS HEAD DEPREL"
+        conllMCD, conllMCDr = readMCD(baseMCD)
+        for line in open(filename, "r") :
+          line = line.strip()
+          if "global.columns =" in line and line[0] == "#" :
+            splited = line.split("global.columns =")
+            conllMCD, conllMCDr = readMCD(splited[-1].strip())
+            continue
+          if len(line) == 0 or line[0] == "#" :
+            continue
+          splited = line.split("\t")
+          form = splited[conllMCD["FORM"]].lower()
+          pos = splited[conllMCD["UPOS"]].lower()
+          form.replace(" ", "◌")
+          if pos not in allPos :
+            allPos.append(pos)
+          if form not in form2pos :
+            form2pos[form] = {}
+          if pos not in form2pos[form] :
+            form2pos[form][pos] = 0
+          form2pos[form][pos] += 1
+          if form not in formCount :
+            formCount[form] = 0
+          formCount[form] += 1
+  outputLines = []
+  # Compute probability for each pos and form
+  for form in form2pos :
+    if args.minfreq is not None and formCount[form] < args.minfreq :
+      continue
+    vec = ["0" for _ in allPos]
+    totalOccs = 0
+    for pos in form2pos[form] :
+      totalOccs += form2pos[form][pos]
+    for pos in form2pos[form] :
+      vec[allPos.index(pos)] = form2pos[form][pos] / totalOccs
+    baseVec = vec.copy()
+    for pos in form2pos[form] :
+      if args.binary is not None :
+        if vec[allPos.index(pos)] >= args.binary :
+          vec[allPos.index(pos)] = 1
+        else :
+          vec[allPos.index(pos)] = 0
+      if args.binary is not None :
+        vec[allPos.index(pos)] = "%d"%vec[allPos.index(pos)]
+      else :
+        vec[allPos.index(pos)] = "%.2f"%vec[allPos.index(pos)]
+    if sum(map(float, vec)) == 0 :
+      print("WARNING: word '%s' gets all 0. Original: '%s'"%(form, " ".join(map(str,baseVec))), file=sys.stderr)
+    outputLines.append(form+" "+" ".join(vec))
+  # Print the w2v file
+  print(len(outputLines), len(allPos))
+  outputLines.sort()
+  print("\n".join(outputLines))