Word embedding similarity tool
==============================
Benoit Favre 2019
Install
=======
This Python script can be used as a library or from the command line.
It is compatible with Python 2 and 3, and its only dependency is numpy.
To install numpy:
```
$ pip install --user numpy
```
This program reads embeddings in the word2vec text format:
```
<num-words> <num-dimensions>
<word1> <value-1> ... <value-n>
...
```
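For concreteness, here is a minimal sketch (not part of the tool) that writes a toy two-word embedding file in this format; the words, values, and file name are purely illustrative:
```python
# Illustrative only: write a tiny 3-dimensional embedding file in the word2vec text format.
words = {'dog': [0.1, 0.2, 0.3], 'cat': [0.2, 0.1, 0.3]}
with open('toy-embeddings.txt', 'w') as fp:
    fp.write('%d %d\n' % (len(words), 3))  # header: <num-words> <num-dimensions>
    for word, values in words.items():
        fp.write(word + ' ' + ' '.join('%g' % v for v in values) + '\n')
```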
Recommended: for fast loading, this format can be converted to a binary format:
```
$ python word2vec.py convert <txt-embedding-file> <bin-embedding-file>
```
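Conversion writes two files: the raw float32 matrix at `<bin-embedding-file>` and a `<bin-embedding-file>.json` sidecar holding the shape and the vocabulary. Both must be kept together for loading.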
Compute the similarity between two words
========================================
```
$ python word2vec.py sim <embedding-file> <word1> <word2>
```
It can also read a list of pairs of words from stdin:
```
$ echo -e "dog cat\napple pear\nking queen" | python word2vec.py sim <embedding-file>
```
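Since the script doubles as a library, the same computation is available from Python. A minimal sketch, assuming `word2vec.py` is importable and `embeddings.bin` is a placeholder path to converted embeddings:
```python
# Minimal library-use sketch; 'embeddings.bin' is a placeholder path.
from word2vec import load, sim

vectors, vocab = load('embeddings.bin')
print(sim(vectors, vocab, 'dog', 'cat'))  # cosine similarity, or -2 if a word is unknown
```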
Compute the closest words to a given word
=========================================
```
$ python word2vec.py closest <embedding-file> <n> <word>
```
`n` is the number of neighbors to show.
It can also read a list of words from stdin:
```
$ echo -e "dog\ncat\napple\npear\nking\nqueen" | python word2vec.py closest <embedding-file> 10
```
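The same query is available as a library call; a minimal sketch under the same assumptions as above:
```python
# Minimal library-use sketch; 'embeddings.bin' is a placeholder path.
from word2vec import load, make_rev_vocab, closest

vectors, vocab = load('embeddings.bin')
rev_vocab = make_rev_vocab(vocab)
for score, word in closest(vectors, vocab, rev_vocab, 'dog', n=5):
    print(word, score)  # neighbors sorted by decreasing cosine similarity
```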
Provided model
==============
A model for French can be downloaded at:
https://pageperso.lis-lab.fr/benoit.favre/files/most-similar-words-fr.zip
It was trained on the French Wikipedia with word2vec skip-grams, using a window of 30 words.
# word2vec.py -- word embedding similarity tool (library and command-line use)
from __future__ import print_function
import sys
import json

import numpy as np


def normalize(v):
    # L2-normalize each row in place, so dot products become cosine similarities
    norm = np.linalg.norm(v, ord=2, axis=1)
    np.place(norm, norm == 0, 1)  # avoid dividing all-zero rows by zero
    v /= norm[:, None]


def load(filename):
    # try the fast binary format first, then fall back to the text format
    try:
        return load_bin(filename)
    except Exception:
        pass
    return load_txt(filename)


def load_txt(filename):
    # parse the word2vec text format: a "<num-words> <num-dimensions>" header
    # followed by one "<word> <value-1> ... <value-n>" line per word
    with open(filename, 'r') as fp:
        num_words, num_dims = [int(x) for x in fp.readline().split()]
        vectors = np.zeros((num_words, num_dims), dtype=np.float32)
        vocab = {}
        for i, line in enumerate(fp):
            tokens = line.strip().split()
            word = tokens[0]
            vectors[i] = np.array([float(x) for x in tokens[1:num_dims + 1]], np.float32)
            vocab[word] = i
    normalize(vectors)
    return vectors, vocab


def load_bin(filename):
    # the binary format is a raw float32 matrix in `filename` plus a JSON
    # sidecar `filename + '.json'` holding the shape and the vocabulary
    with open(filename + '.json') as fp:
        info = json.load(fp)
    vectors = np.memmap(filename, mode='r', dtype=np.float32, shape=(info['num_words'], info['dims']))
    return vectors, info['vocab']


def write_bin(filename, vectors, vocab):
    # write the JSON sidecar, then dump the vectors through a memory-mapped file
    with open(filename + '.json', 'w') as fp:
        json.dump({'num_words': vectors.shape[0], 'dims': vectors.shape[1], 'vocab': vocab}, fp)
    fp = np.memmap(filename, mode='w+', shape=vectors.shape, dtype=np.float32)
    fp[:] = vectors[:]
    del fp  # flush the memmap to disk


def make_rev_vocab(vocab):
    # invert the word -> index mapping into index -> word
    rev_vocab = {}
    for word, i in vocab.items():
        rev_vocab[i] = word
    return rev_vocab


def closest(vectors, vocab, rev_vocab, word, n=10):
    # return the n most similar words as (score, word) pairs, best first;
    # note that the query word itself is among the neighbors, with score 1
    if word not in vocab:
        return []
    v = vectors[vocab[word]]
    scores = np.dot(vectors, v.T)  # cosine similarities, since rows are normalized
    if n > len(vectors):
        n = len(vectors)
    indices = np.argpartition(scores, -n)[-n:]  # top n, in arbitrary order
    indices = indices[np.argsort(scores[indices])]  # sort the top n by score
    output = []
    for i in [int(x) for x in indices]:
        output.append((scores[i], rev_vocab[i]))
    return reversed(output)


def sim(vectors, vocab, word1, word2):
    # cosine similarity between two words; -2 if either word is unknown
    if word1 not in vocab or word2 not in vocab:
        return -2
    v1 = vectors[vocab[word1]]
    v2 = vectors[vocab[word2]]
    return np.dot(v1, v2)


if __name__ == '__main__':
    if len(sys.argv) <= 1 or sys.argv[1] not in ['sim', 'closest', 'convert']:
        print('''Usage: %s command [args]
Commands:
  sim <embeddings> [<word1> <word2>]  compute similarity between two words (-2 if not found)
  closest <embeddings> <n> [<word>]   list n closest words to target
  convert <embeddings> <output>       convert embeddings to binary format
Note:
  if words are not specified for sim and closest, they are read from stdin''' % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    vectors, vocab = load(sys.argv[2])
    command = sys.argv[1]
    if command == 'sim':
        if len(sys.argv) == 5:
            print(sys.argv[3], sys.argv[4], sim(vectors, vocab, sys.argv[3], sys.argv[4]))
        else:
            for line in sys.stdin:  # one "word1 word2" pair per line
                tokens = line.strip().split()
                print(tokens[0], tokens[1], sim(vectors, vocab, tokens[0], tokens[1]))
    elif command == 'closest':
        rev_vocab = make_rev_vocab(vocab)
        n = int(sys.argv[3])
        if len(sys.argv) == 5:
            for score, word in closest(vectors, vocab, rev_vocab, sys.argv[4], n):
                print(sys.argv[4], word, score)
        else:
            for line in sys.stdin:  # one word per line
                tokens = line.strip().split()
                for score, word in closest(vectors, vocab, rev_vocab, tokens[0], n):
                    print(tokens[0], word, score)
    elif command == 'convert':
        write_bin(sys.argv[3], vectors, vocab)