#! /usr/bin/env python3
"""Filter a text embeddings file down to the vocabulary of CoNLL-U files.

USAGE : filterEmbeddings.py embeddings.w2v vocabFile1.conllu vocabFile2.conllu...

Every word form found in the given CoNLL-U files is collected, then each line
of the embeddings file whose first space-separated token is one of those forms
is printed to stdout. The vocabulary size is reported on stderr.
"""

# Provenance: introduced as scripts/filterEmbeddings.py by commit
# f811c17b37009086e0e4e144e32970a4ddfcbfb4
# (Franck Dary <franck.dary@lis-lab.fr>, Thu, 21 Jan 2021 09:41:04 +0100),
# "Added script to filter embeddings file based on vocabulary from conllu files".

import sys

# Index of the FORM column in standard CoNLL-U (ID FORM LEMMA UPOS ...).
# Used when a file carries no "# global.columns =" header.
DEFAULT_FORM_INDEX = 1


################################################################################
def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s embeddings.w2v vocabFile1.conllu vocabFile2.conllu..."
          % sys.argv[0], file=sys.stderr)
    # sys.exit rather than the bare exit(): the latter is injected by the
    # `site` module and is not guaranteed to exist (e.g. `python -S`).
    sys.exit(1)
################################################################################


################################################################################
def readVocabulary(filenames):
    """Return the set of word forms found in the given CoNLL-U files.

    filenames -- iterable of paths to CoNLL-U (or CoNLL-U Plus) files.

    For each file, the FORM column is taken from the most recent
    "# global.columns =" comment; when there is none, the standard CoNLL-U
    position (second column) is assumed. Blank lines and comment lines are
    skipped.

    Raises ValueError if a "# global.columns =" header does not list FORM,
    and IndexError if a data line has fewer columns than the FORM index.
    """
    vocab = set()

    for filename in filenames:
        # Column layout is per file: reset it for each new file.
        formIndex = DEFAULT_FORM_INDEX
        with open(filename, "r", encoding="utf-8") as conlluFile:
            for line in conlluFile:
                line = line.strip()
                if not line:
                    continue
                if "# global.columns =" in line:
                    columns = line.split('=')[-1].strip().split(' ')
                    formIndex = columns.index("FORM")
                if line.startswith('#'):
                    continue
                vocab.add(line.split('\t')[formIndex])

    return vocab
################################################################################


################################################################################
def filterEmbeddings(embeddingsFilename, vocab):
    """Yield the lines of an embeddings file whose word belongs to vocab.

    embeddingsFilename -- path to a text embeddings file, one
                          "word v1 v2 ..." entry per line, space-separated.
    vocab              -- container of word forms to keep.

    Lines are yielded stripped of surrounding whitespace. Any line made of
    exactly two space-separated fields is skipped (optional w2v header).
    """
    with open(embeddingsFilename, "r", encoding="utf-8") as embeddingsFile:
        for line in embeddingsFile:
            line = line.strip()
            fields = line.split(' ')
            # Ignore optional w2v header
            if len(fields) == 2:
                continue
            if fields[0] in vocab:
                yield line
################################################################################


################################################################################
def main(argv):
    """Run the filter; argv has the same layout as sys.argv."""
    if len(argv) < 3:
        printUsageAndExit()

    vocab = readVocabulary(argv[2:])
    print("Vocabulary size = %d words" % len(vocab), file=sys.stderr)

    for line in filterEmbeddings(argv[1], vocab):
        print(line)
################################################################################


################################################################################
if __name__ == "__main__":
    main(sys.argv)
################################################################################