From f811c17b37009086e0e4e144e32970a4ddfcbfb4 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 21 Jan 2021 09:41:04 +0100
Subject: [PATCH] Added script to filter embeddings file based on vocabulary
 from conllu files

---
 scripts/filterEmbeddings.py | 44 +++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100755 scripts/filterEmbeddings.py

diff --git a/scripts/filterEmbeddings.py b/scripts/filterEmbeddings.py
new file mode 100755
index 0000000..83c138a
--- /dev/null
+++ b/scripts/filterEmbeddings.py
@@ -0,0 +1,44 @@
+#! /usr/bin/env python3
+
+import sys
+
+################################################################################
+def printUsageAndExit() :
+  """Print the command-line usage on stderr, then exit with status 1."""
+  sys.exit("USAGE : %s embeddings.w2v vocabFile1.conllu vocabFile2.conllu..."
+    %sys.argv[0])
+################################################################################
+
+################################################################################
+# Print the lines of the embeddings file (argv[1]) whose word appears in the
+# FORM column of at least one conllu file (argv[2:]). Output goes to stdout.
+if __name__ == "__main__" :
+  if len(sys.argv) < 3 :
+    printUsageAndExit()
+
+  vocab = set()
+  for filename in sys.argv[2:] :
+    # FORM is column 2 in plain CoNLL-U; '# global.columns =' overrides it.
+    formIndex = 1
+    with open(filename, "r") as conllu :  # closed even if parsing fails
+      for line in conllu :
+        line = line.strip()
+        if "# global.columns =" in line :
+          formIndex = line.split('=')[-1].strip().split(' ').index("FORM")
+        if len(line) == 0 or line[0] == '#' :
+          continue
+        vocab.add(line.split('\t')[formIndex])
+
+  print("Vocabulary size = %d words"%len(vocab), file=sys.stderr)
+
+  with open(sys.argv[1]) as embeddings :
+    for line in embeddings :
+      line = line.strip()
+      splited = line.split(' ')
+      # A 2-field line is the optional w2v header, not an embedding.
+      if len(splited) == 2 :
+        continue
+      if splited[0] in vocab :
+        print(line)
+################################################################################
+
-- 
GitLab