Added default MCD in filterEmbeddings script and added script to compute word coverage

24005f0f · Franck Dary · 1d1cf159 · 24005f0f · 24005f0f
Commit 24005f0f authored 4 years ago by Franck Dary
--- a/scripts/filterEmbeddings.py
+++ b/scripts/filterEmbeddings.py
 #! /usr/bin/env python3
 import sys
+from readMCD import readMCD
 ################################################################################
 def printUsageAndExit() :
@@ -14,19 +15,22 @@ if __name__ == "__main__" :
  if len(sys.argv) < 3 :
    printUsageAndExit()
+  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
  vocab = {}
  for filename in sys.argv[2:] :
-    formIndex = None
+    col2index, index2col = readMCD(baseMCD)
    for line in open(filename, "r") :
      line = line.strip()
      if len(line) == 0 :
        continue
      if "# global.columns =" in line :
-        formIndex = line.split('=')[-1].strip().split(' ').index("FORM")
+        col2index, index2col = readMCD(line.split('=')[-1].strip())
+        continue
      if line[0] == '#' :
        continue
-      word = line.split('\t')[formIndex]
+      word = line.split('\t')[col2index["FORM"]]
      vocab[word] = True
  print("Vocabulary size = %d words"%len(vocab), file=sys.stderr)

--- a/scripts/getWordsCoverage.py
+++ b/scripts/getWordsCoverage.py
+#! /usr/bin/env python3
+import sys
+import os
+from readMCD import readMCD
+################################################################################
+def printUsageAndExit() :
+  """Print the command-line usage on stderr and terminate with exit status 1."""
+  print("USAGE : %s UD_Directory"%sys.argv[0], file=sys.stderr)
+  # sys.exit rather than the builtin exit : the latter is injected by the site
+  # module for interactive use and does not exist when python is run with -S.
+  sys.exit(1)
+################################################################################
+################################################################################
+def _iterForms(path, baseMCD) :
+  """Yield the FORM column of each non-empty, non-comment line of `path`.
+
+  The column layout starts as `baseMCD` and is replaced every time a
+  "# global.columns =" line is met in the file.
+  """
+  col2index, _ = readMCD(baseMCD)
+  # Explicit encoding so that reading does not depend on the user's locale.
+  # The context manager closes the file once the iteration is over.
+  with open(path, "r", encoding="utf-8") as corpus :
+    for line in corpus :
+      line = line.strip()
+      if "# global.columns =" in line :
+        col2index, _ = readMCD(line.split('=')[-1].strip())
+        continue
+      if len(line) == 0 or line[0] == '#' :
+        continue
+      # NOTE(review): every remaining line is counted, including multiword
+      # token ranges (ID such as 1-2) if the corpus has any - confirm intended.
+      yield line.split('\t')[col2index["FORM"]]
+################################################################################
+
+################################################################################
+if __name__ == "__main__" :
+  # Report, for the dev and test corpora found under the given directory, the
+  # percentage of their word occurrences whose form also appears in the train
+  # corpus. One "fileName<TAB>percentage" line is printed per corpus.
+  if len(sys.argv) != 2 :
+    printUsageAndExit()
+  # Column layout assumed until a file declares its own one.
+  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
+  # Full paths of the corpora. os.walk also visits subdirectories, so the
+  # directory in which a file was found must be kept along with its name.
+  train = None
+  dev = None
+  test = None
+  for dirPath, _, fileNames in os.walk(sys.argv[1]) :
+    for fileName in fileNames :
+      if ".conllu" not in fileName :
+        continue
+      # When several files match the same split, the last one visited wins.
+      if "train" in fileName :
+        train = os.path.join(dirPath, fileName)
+      elif "dev" in fileName :
+        dev = os.path.join(dirPath, fileName)
+      elif "test" in fileName :
+        test = os.path.join(dirPath, fileName)
+  if train is None :
+    print("ERROR : train corpus not found", file=sys.stderr)
+    sys.exit(1)
+  vocab = set(_iterForms(train, baseMCD))
+  for corpus in [dev, test] :
+    if corpus is None :
+      continue
+    nbWords = 0
+    nbIn = 0
+    for form in _iterForms(corpus, baseMCD) :
+      nbWords += 1
+      if form in vocab :
+        nbIn += 1
+    # A corpus without any token line has no defined coverage : report it
+    # instead of failing on a division by zero.
+    if nbWords == 0 :
+      print("WARNING : no word found in %s"%corpus, file=sys.stderr)
+      continue
+    print("%s\t%.2f"%(os.path.basename(corpus), 100.0*nbIn/nbWords))
+################################################################################