From 24005f0f2ffb823b73439b91ca18b1776c2a47c3 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 21 Apr 2021 13:34:01 +0200
Subject: [PATCH] Added default MCD in filterEmbeddings script and added script
 to compute word coverage

---
 scripts/filterEmbeddings.py | 10 ++++--
 scripts/getWordsCoverage.py | 68 +++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100755 scripts/getWordsCoverage.py

diff --git a/scripts/filterEmbeddings.py b/scripts/filterEmbeddings.py
index 83c138a..7dd11e5 100755
--- a/scripts/filterEmbeddings.py
+++ b/scripts/filterEmbeddings.py
@@ -1,6 +1,7 @@
 #! /usr/bin/env python3
 
 import sys
+from readMCD import readMCD
 
 ################################################################################
 def printUsageAndExit() :
@@ -14,19 +15,22 @@ if __name__ == "__main__" :
   if len(sys.argv) < 3 :
     printUsageAndExit()
 
+  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
+
   vocab = {}
 
   for filename in sys.argv[2:] :
-    formIndex = None
+    col2index, index2col = readMCD(baseMCD)
     for line in open(filename, "r") :
       line = line.strip()
       if len(line) == 0 :
         continue
       if "# global.columns =" in line :
-        formIndex = line.split('=')[-1].strip().split(' ').index("FORM")
+        col2index, index2col = readMCD(line.split('=')[-1].strip())
+        continue
       if line[0] == '#' :
         continue
-      word = line.split('\t')[formIndex]
+      word = line.split('\t')[col2index["FORM"]]
       vocab[word] = True
     
   print("Vocabulary size = %d words"%len(vocab), file=sys.stderr)
diff --git a/scripts/getWordsCoverage.py b/scripts/getWordsCoverage.py
new file mode 100755
index 0000000..74f5d22
--- /dev/null
+++ b/scripts/getWordsCoverage.py
@@ -0,0 +1,88 @@
+#! /usr/bin/env python3
+
+import sys
+import os
+from readMCD import readMCD
+
+################################################################################
+def printUsageAndExit() :
+  """Print the command line usage on stderr and exit with status 1."""
+  print("USAGE : %s UD_Directory"%sys.argv[0], file=sys.stderr)
+  sys.exit(1)
+################################################################################
+
+################################################################################
+def findCorpora(root) :
+  """Return the paths of the (train, dev, test) corpora found under root.
+
+  root is walked recursively. A file is selected when its name contains
+  ".conllu" together with "train", "dev" or "test" (tested in that order).
+  When several files match the same split, the last one visited wins. A split
+  with no matching file is returned as None.
+  """
+  found = {"train" : None, "dev" : None, "test" : None}
+  for dirpath, _, filenames in os.walk(root) :
+    for filename in filenames :
+      if ".conllu" not in filename :
+        continue
+      for split in ("train", "dev", "test") :
+        if split in filename :
+          # Keep the full path : the file may live in a subdirectory of root.
+          found[split] = os.path.join(dirpath, filename)
+          break
+  return found["train"], found["dev"], found["test"]
+################################################################################
+
+################################################################################
+def readForms(filename, baseMCD) :
+  """Yield the FORM column of each token line of a CoNLL-U like file.
+
+  Columns follow baseMCD until a "# global.columns =" comment is read ; from
+  then on they follow the layout given by that comment. Empty lines and the
+  other comment lines are skipped.
+  """
+  col2index, _ = readMCD(baseMCD)
+  # CoNLL-U files are UTF-8 encoded : do not depend on the locale encoding.
+  with open(filename, "r", encoding="utf-8") as corpus :
+    for line in corpus :
+      line = line.strip()
+      if "# global.columns =" in line :
+        col2index, _ = readMCD(line.split('=')[-1].strip())
+        continue
+      if len(line) == 0 or line[0] == '#' :
+        continue
+      yield line.split('\t')[col2index["FORM"]]
+################################################################################
+
+################################################################################
+if __name__ == "__main__" :
+  if len(sys.argv) != 2 :
+    printUsageAndExit()
+
+  baseMCD = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
+
+  train, dev, test = findCorpora(sys.argv[1])
+
+  if train is None :
+    print("ERROR : train corpus not found", file=sys.stderr)
+    sys.exit(1)
+
+  # Every distinct form of the train corpus is the reference vocabulary.
+  vocab = set(readForms(train, baseMCD))
+
+  for filename in [dev, test] :
+    if filename is None :
+      continue
+    nbWords = 0
+    nbIn = 0
+    for form in readForms(filename, baseMCD) :
+      nbWords += 1
+      if form in vocab :
+        nbIn += 1
+    if nbWords == 0 :
+      # An empty corpus has no coverage ; do not divide by zero.
+      print("WARNING : no word found in %s"%filename, file=sys.stderr)
+      continue
+    print("%s\t%.2f"%(os.path.basename(filename), 100.0*nbIn/nbWords))
+################################################################################
+
-- 
GitLab