Skip to content
Snippets Groups Projects
Commit f811c17b authored by Franck Dary's avatar Franck Dary
Browse files

Added script to filter embeddings file based on vocabulary from conllu files

parent 6e33d91c
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
import sys
################################################################################
def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print(
        f"USAGE : {sys.argv[0]} embeddings.w2v vocabFile1.conllu vocabFile2.conllu...",
        file=sys.stderr,
    )
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # `site` module and is missing under `python -S` or in frozen builds.
    sys.exit(1)
################################################################################
################################################################################
def readVocabulary(filenames):
    """Collect the set of word forms found in CoNLL-U style files.

    Blank lines and comment lines (starting with '#') are skipped. A comment
    of the form ``# global.columns = ID FORM ...`` selects which tab-separated
    column holds the word form for the rest of that file.

    Args:
        filenames: iterable of paths to the files to read.

    Returns:
        A set containing every distinct value of the FORM column.

    Raises:
        ValueError: if a ``# global.columns =`` header does not list FORM.
        IndexError: if a token line has fewer columns than the FORM index.
    """
    # Column of FORM in plain CoNLL-U (ID FORM LEMMA ...), used when a file
    # carries no "# global.columns =" header.
    defaultFormIndex = 1

    vocab = set()
    for filename in filenames:
        # The column layout is per file, so reset it for each one.
        formIndex = defaultFormIndex
        with open(filename, "r") as conlluFile:
            for line in conlluFile:
                line = line.strip()
                if not line:
                    continue
                if "# global.columns =" in line:
                    columns = line.split('=')[-1].strip().split(' ')
                    formIndex = columns.index("FORM")
                if line[0] == '#':
                    continue
                vocab.add(line.split('\t')[formIndex])
    return vocab
################################################################################

################################################################################
def filterEmbeddings(embeddingsFilename, vocab, out=None):
    """Print the embedding lines whose word belongs to ``vocab``.

    Each line is stripped and split on single spaces; the first field is the
    word. Lines made of exactly two fields are skipped, which drops the
    optional "count dimension" header of the w2v text format.

    Args:
        embeddingsFilename: path to the embeddings file in text format.
        vocab: container of words to keep (only membership tests are used).
        out: text stream receiving the kept lines; defaults to sys.stdout.
    """
    if out is None:
        out = sys.stdout
    with open(embeddingsFilename) as embeddingsFile:
        for line in embeddingsFile:
            line = line.strip()
            fields = line.split(' ')
            # Ignore optional w2v header
            if len(fields) == 2:
                continue
            if fields[0] in vocab:
                print(line, file=out)
################################################################################

################################################################################
if __name__ == "__main__":
    if len(sys.argv) < 3:
        printUsageAndExit()

    vocab = readVocabulary(sys.argv[2:])
    print("Vocabulary size = %d words"%len(vocab), file=sys.stderr)
    filterEmbeddings(sys.argv[1], vocab)
################################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment