Commit 424b106e authored by Benoit Favre's avatar Benoit Favre
Browse files

initial commit

Word embedding similarity tool
Benoit Favre 2019
This python script can be used as a library or from the commandline.
It is compatible with python 2 and 3, and uses numpy as its only requirement.
To install numpy, you can do it this way:
$ pip install --user numpy
This program uses textual embeddings in the word2vec format:
<num-words> <num-dimensions>
<word1> <value-1> ... <value-n>
Recommended: for fast loading, this format can be converted to a binary format:
$ python convert <txt-embedding-file> <bin-embedding-file>
Compute the similarity between two words
$ python sim <embedding-file> <word1> <word2>
It can also read a list of pairs of words from stdin:
$ echo -e "dog cat\napple pear\nking queen" | python sim <embedding-file>
Compute the closest words to a given word
$ python closest <embedding-file> <n> <word>
`n` is the number of neighbors to show
It can also read a list of words from stdin:
$ echo -e "dog\ncat\napple\npear\nking\nqueen" | python closest <embedding-file> 10
Provided model
A model for French can be downloaded at:
It was trained on French wikipedia with word2vec skipgrams using a window of 30 words.
from __future__ import print_function

import json
import os
import sys

import numpy as np
def normalize(v):
    """L2-normalize each row of `v` in place.

    v: 2-D numpy array of shape (num_words, num_dims); modified in place.

    Rows whose norm is zero are divided by 1 instead, so all-zero rows
    stay zero rather than becoming NaN.
    """
    norm = np.linalg.norm(v, ord=2, axis=1)
    # guard against division by zero for all-zero vectors
    norm = np.where(norm == 0, 1, norm)
    v /= norm[:, None]
def load(filename):
    """Load embeddings, preferring the fast binary format when available.

    If `filename + '.json'` exists, the embeddings were previously
    converted with write_bin(), so use the memmap-backed loader;
    otherwise parse the word2vec text format.

    Returns (vectors, vocab).
    """
    # the scraped original had two unconditional returns; dispatch on the
    # presence of the metadata file written by write_bin()
    if os.path.exists(filename + '.json'):
        return load_bin(filename)
    return load_txt(filename)
def load_txt(filename):
    """Read embeddings in the word2vec text format.

    The first line is "<num-words> <num-dims>"; every following line
    holds a word and its vector components, space-separated.

    Returns (vectors, vocab): a float32 array of shape
    (num_words, num_dims) and a dict mapping word -> row index.
    """
    with open(filename, 'r') as stream:
        header = stream.readline().split()
        num_words, num_dims = int(header[0]), int(header[1])
        vectors = np.zeros((num_words, num_dims), dtype=np.float32)
        vocab = {}
        for row, text in enumerate(stream):
            fields = text.strip().split()
            vocab[fields[0]] = row
            # ignore any trailing fields beyond the declared dimension
            vectors[row] = np.array([float(x) for x in fields[1:num_dims + 1]], np.float32)
        return vectors, vocab
def load_bin(filename):
    """Load embeddings previously saved with write_bin().

    Metadata (word count, dimensions, vocab) is read from
    `filename + '.json'`; the vectors are memory-mapped read-only from
    `filename`, so loading is near-instant regardless of model size.

    Returns (vectors, vocab).
    """
    with open(filename + '.json') as fp:
        # the scraped original truncated this call; json.load reads the
        # whole metadata file written by write_bin()
        info = json.load(fp)
    vectors = np.memmap(filename, mode='r', dtype=np.float32,
                        shape=(info['num_words'], info['dims']))
    return vectors, info['vocab']
def write_bin(filename, vectors, vocab):
    """Save embeddings in the fast binary format.

    Metadata goes to `filename + '.json'`; the raw float32 matrix is
    written to `filename` through a writable memmap.
    """
    meta = {'num_words': vectors.shape[0], 'dims': vectors.shape[1], 'vocab': vocab}
    with open(filename + '.json', 'w') as out:
        out.write(json.dumps(meta))
    mm = np.memmap(filename, mode='w+', shape=vectors.shape, dtype=np.float32)
    mm[:] = vectors[:]
    # dropping the memmap flushes it to disk
    del mm
def make_rev_vocab(vocab):
    """Invert the vocabulary: return a dict mapping row index -> word."""
    return {index: word for word, index in vocab.items()}
def closest(vectors, vocab, rev_vocab, word, n=10):
    """Return up to `n` (score, word) pairs most similar to `word`.

    Similarity is the dot product against every row of `vectors`
    (cosine similarity when rows are L2-normalized). Returns an empty
    list when the word is out of vocabulary; otherwise an iterable of
    (score, word) pairs in decreasing score order. Note the query word
    is its own nearest neighbor and so appears in the output.
    """
    if word not in vocab:
        return []
    v = vectors[vocab[word]]
    # one dot product per vocabulary entry (restored np.dot lost in scrape)
    scores = np.dot(vectors, v.T)
    if n > len(vectors):
        n = len(vectors)
    # argpartition selects the top-n in O(len) without a full sort,
    # then only those n entries are sorted
    indices = np.argpartition(scores, -n)[-n:]
    indices = indices[np.argsort(scores[indices])]
    output = []
    for i in [int(x) for x in indices]:
        output.append((scores[i], rev_vocab[i]))
    # indices were ascending by score; reverse for best-first order
    return reversed(output)
def sim(vectors, vocab, word1, word2):
    """Dot-product similarity between two words, or -2 if either is OOV.

    With L2-normalized vectors the dot product is cosine similarity in
    [-1, 1], so -2 is a distinguishable out-of-vocabulary sentinel.
    """
    if word1 not in vocab or word2 not in vocab:
        return -2
    v1 = vectors[vocab[word1]]
    v2 = vectors[vocab[word2]]
    # restored np.dot lost in the scrape
    return np.dot(v1, v2)
if __name__ == '__main__':
    # command-line interface: sim | closest | convert
    if len(sys.argv) <= 1 or sys.argv[1] not in ['sim', 'closest', 'convert']:
        print('''Usage: %s command [args]
sim <embeddings> [<word1> <word2>] compute similarity between two words (-2 if not found)
closest <embeddings> <n> [<word>] list n closest words to target
convert <embeddings> <output> convert embeddings to binary format
if words are not specified for sim and closest, they are read from stdin''' % sys.argv[0], file=sys.stderr)
        # was missing: without exiting, the code below crashed on sys.argv[2]
        sys.exit(1)
    vectors, vocab = load(sys.argv[2])
    command = sys.argv[1]
    if command == 'sim':
        if len(sys.argv) == 5:
            # word pair given directly on the command line
            print(sys.argv[3], sys.argv[4], sim(vectors, vocab, sys.argv[3], sys.argv[4]))
        else:
            # otherwise read one "word1 word2" pair per stdin line
            for line in sys.stdin:
                tokens = line.strip().split()
                print(tokens[0], tokens[1], sim(vectors, vocab, tokens[0], tokens[1]))
    elif command == 'closest':
        rev_vocab = make_rev_vocab(vocab)
        n = int(sys.argv[3])
        if len(sys.argv) == 5:
            # target word given on the command line
            for score, word in closest(vectors, vocab, rev_vocab, sys.argv[4], n):
                print(sys.argv[4], word, score)
        else:
            # otherwise read one target word per stdin line
            for line in sys.stdin:
                tokens = line.strip().split()
                for score, word in closest(vectors, vocab, rev_vocab, tokens[0], n):
                    print(tokens[0], word, score)
    elif command == 'convert':
        # load() above read the text embeddings; write them in binary form
        write_bin(sys.argv[3], vectors, vocab)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment