#! /usr/bin/env python3
"""Filter a w2v text embeddings file down to the vocabulary of CoNLL-U files.

Usage: filterEmbeddings.py embeddings.w2v vocabFile1.conllu vocabFile2.conllu...

Every embeddings line whose first space-separated token is a word form seen
in at least one of the vocabulary files is echoed to stdout; all other lines
are dropped. The vocabulary size is reported on stderr.
"""

import sys

# Index of the FORM column in a standard CoNLL-U row (ID, FORM, LEMMA, ...).
# Used for files that carry no "# global.columns =" header.
DEFAULT_FORM_INDEX = 1


################################################################################
def printUsageAndExit():
    """Print the command-line synopsis on stderr and exit with status 1."""
    print("USAGE : %s embeddings.w2v vocabFile1.conllu vocabFile2.conllu..."
          % sys.argv[0], file=sys.stderr)
    # sys.exit rather than the builtin exit, which is only injected by `site`.
    sys.exit(1)
################################################################################


################################################################################
def collectForms(lines):
    """Return the set of word forms found in the lines of one CoNLL-U file.

    Args:
        lines: iterable of text lines (an open file or a list of strings).

    Returns:
        A set with the FORM column value of every data row.

    Raises:
        ValueError: if a "# global.columns =" header does not list FORM.
    """
    forms = set()
    formIndex = DEFAULT_FORM_INDEX
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if "# global.columns =" in line:
            # split() without argument tolerates repeated spaces between
            # column names, which would otherwise shift the FORM index.
            formIndex = line.split('=')[-1].split().index("FORM")
        if line[0] == '#':
            continue
        columns = line.split('\t')
        # Rows too short to hold a FORM column are skipped instead of
        # aborting the whole run.
        if formIndex < len(columns):
            forms.add(columns[formIndex])
    return forms
################################################################################


################################################################################
def filterEmbeddingLines(lines, vocab):
    """Yield the stripped embedding lines whose word belongs to vocab.

    Args:
        lines: iterable of text lines of the embeddings file.
        vocab: container of accepted words (membership is tested with `in`).

    Yields:
        Each kept line, with surrounding whitespace stripped.
    """
    for line in lines:
        line = line.strip()
        fields = line.split(' ')
        # Ignore optional w2v header
        if len(fields) == 2:
            continue
        if fields[0] in vocab:
            yield line
################################################################################


################################################################################
def main():
    """Build the vocabulary from argv[2:] and filter argv[1] to stdout."""
    if len(sys.argv) < 3:
        printUsageAndExit()

    vocab = set()
    for filename in sys.argv[2:]:
        # The FORM column index is resolved independently for each file.
        with open(filename, "r") as conlluFile:
            vocab |= collectForms(conlluFile)

    print("Vocabulary size = %d words" % len(vocab), file=sys.stderr)

    with open(sys.argv[1]) as embeddingsFile:
        for line in filterEmbeddingLines(embeddingsFile, vocab):
            print(line)
################################################################################


################################################################################
if __name__ == "__main__":
    main()
################################################################################