Commit 424b106e authored by Benoit Favre's avatar Benoit Favre
Browse files

initial commit

parents
Word embedding similarity tool
==============================
Benoit Favre 2019
Install
=======
This python script can be used as a library or from the commandline.
It is compatible with python 2 and 3, and uses numpy as only requirement.
To install numpy, you can do it this way:
```
$ pip install --user numpy
```
This program uses textual embeddings in the word2vec format:
```
<num-words> <num-dimensions>
<word1> <value-1> ... <value-n>
...
```
Recommended: for fast loading, this format can be converted to a binary format:
```
$ python word2vec.py convert <txt-embedding-file> <bin-embedding-file>
```
Compute the similarity between two words
========================================
```
$ python word2vec.py sim <embedding-file> <word1> <word2>
```
It can also read a list of pairs of words from stdin:
```
$ echo -e "dog cat\napple pear\nking queen" | python word2vec.py sim <embedding-file>
```
Compute the closest words to a given word
=========================================
```
$ python word2vec.py closest <embedding-file> <n> <word>
```
`n` is the number of neighbors to show
It can also read a list of words from stdin:
```
$ echo -e "dog\ncat\napple\npear\nking\nqueen" | python word2vec.py closest <embedding-file> 10
```
Provided model
==============
A model for French can be downloaded at:
https://pageperso.lis-lab.fr/benoit.favre/files/most-similar-words-fr.zip
It was trained on French Wikipedia with word2vec skipgrams using a window of 30 words.
from __future__ import print_function
import sys
import numpy as np
import json
def normalize(v):
    """L2-normalize each row of the 2-D array *v*, in place.

    Rows with zero norm are left as-is: their norm is replaced by 1
    before the division to avoid dividing by zero.
    """
    lengths = np.linalg.norm(v, ord=2, axis=1)
    lengths[lengths == 0] = 1
    v /= lengths.reshape(-1, 1)
def load(filename):
    """Load embeddings from *filename*, preferring the fast binary format.

    First tries the binary layout (a raw float32 memmap plus a
    ``filename + '.json'`` side-car); if that fails, falls back to the
    textual word2vec format.

    Returns (vectors, vocab): a (num_words, dims) float32 array and a
    dict mapping word -> row index.
    """
    try:
        return load_bin(filename)
    except (OSError, ValueError, KeyError):
        # Narrowed from a bare `except:`: OSError covers a missing/unreadable
        # side-car or memmap file, ValueError a bad JSON document or memmap
        # shape, KeyError missing metadata fields. Anything else (e.g.
        # KeyboardInterrupt) now propagates instead of being swallowed.
        pass
    return load_txt(filename)
def load_txt(filename):
    """Load embeddings in the textual word2vec format.

    Expected layout: a header line ``<num_words> <num_dims>`` followed by
    one ``<word> <v1> ... <vn>`` line per word. Vectors are L2-normalized
    after loading so dot products are cosine similarities.

    Returns (vectors, vocab): a (num_words, num_dims) float32 array and a
    dict mapping word -> row index.
    """
    # word2vec text dumps are UTF-8; be explicit instead of relying on
    # the platform locale encoding.
    with open(filename, 'r', encoding='utf-8') as fp:
        num_words, num_dims = [int(x) for x in fp.readline().split()]
        vectors = np.zeros((num_words, num_dims), dtype=np.float32)
        vocab = {}
        for i, line in enumerate(fp):
            tokens = line.strip().split()
            word = tokens[0]
            # Slice defensively in case of trailing junk on the line.
            vectors[i] = np.array([float(x) for x in tokens[1: num_dims + 1]], np.float32)
            vocab[word] = i
    normalize(vectors)
    return vectors, vocab
def load_bin(filename):
    """Load embeddings stored in the binary format produced by write_bin.

    *filename* is a raw float32 memmap; ``filename + '.json'`` holds a
    JSON object with 'num_words', 'dims' and 'vocab'.

    Returns (vectors, vocab), with vectors opened read-only.
    """
    with open(filename + '.json') as meta_fp:
        info = json.load(meta_fp)
    shape = (info['num_words'], info['dims'])
    vectors = np.memmap(filename, mode='r', dtype=np.float32, shape=shape)
    return vectors, info['vocab']
def write_bin(filename, vectors, vocab):
    """Save embeddings in the binary format understood by load_bin.

    Writes the raw float32 matrix to *filename* via a memmap and the
    shape/vocabulary metadata to ``filename + '.json'``.
    """
    meta = {'num_words': vectors.shape[0], 'dims': vectors.shape[1], 'vocab': vocab}
    with open(filename + '.json', 'w') as meta_fp:
        json.dump(meta, meta_fp)
    mm = np.memmap(filename, mode='w+', shape=vectors.shape, dtype=np.float32)
    mm[:] = vectors[:]
    # Deleting the memmap flushes the data and closes the file handle.
    del mm
def make_rev_vocab(vocab):
    """Invert a word -> index mapping into an index -> word mapping."""
    return {index: word for word, index in vocab.items()}
def closest(vectors, vocab, rev_vocab, word, n=10):
    """Return the *n* nearest neighbors of *word* by cosine similarity.

    Assumes the rows of *vectors* are L2-normalized, so a plain dot
    product is the cosine similarity. The query word itself appears in
    the results (its similarity with itself is ~1).

    Returns a list of (score, word) pairs, best first; an empty list
    when *word* is out of vocabulary. (Previously the found case
    returned a reversed() iterator — a list is returned in both
    branches now, which remains iterable for all existing callers.)
    """
    if word not in vocab:
        return []
    query = vectors[vocab[word]]
    scores = np.dot(vectors, query.T)
    # Removed the unused `best = np.max(scores)` dead store.
    if n > len(vectors):
        n = len(vectors)
    # argpartition picks the top-n in O(len); only those n are then sorted.
    indices = np.argpartition(scores, -n)[-n:]
    indices = indices[np.argsort(scores[indices])]
    ranked = [(scores[i], rev_vocab[i]) for i in (int(x) for x in indices)]
    return ranked[::-1]
def sim(vectors, vocab, word1, word2):
    """Cosine similarity between two words (rows assumed L2-normalized).

    Returns -2 when either word is out of vocabulary — a sentinel safely
    outside the valid cosine range [-1, 1].
    """
    try:
        row1 = vocab[word1]
        row2 = vocab[word2]
    except KeyError:
        return -2
    return np.dot(vectors[row1], vectors[row2])
if __name__ == '__main__':
    # Command-line entry point: dispatches to sim / closest / convert.
    # Usage is printed to stderr when the command is missing or unknown.
    if len(sys.argv) <= 1 or sys.argv[1] not in ['sim', 'closest', 'convert']:
        print('''Usage: %s command [args]
Commands:
sim <embeddings> [<word1> <word2>] compute similarity between two words (-2 if not found)
closest <embeddings> <n> [<word>] list n closest words to target
convert <embeddings> <output> convert embeddings to binary format
Note:
if words are not specified for sim and closest, they are read from stdin''' % sys.argv[0], file=sys.stderr)
        sys.exit(1)
    # Every command takes the embedding file as its second argument;
    # load() transparently handles both the text and binary formats.
    vectors, vocab = load(sys.argv[2])
    command = sys.argv[1]
    if command == 'sim':
        if len(sys.argv) == 5:
            # Word pair supplied on the command line.
            print(sys.argv[3], sys.argv[4], sim(vectors, vocab, sys.argv[3], sys.argv[4]))
        else:
            # Otherwise read one "word1 word2" pair per line from stdin.
            for line in sys.stdin:
                tokens = line.strip().split()
                print(tokens[0], tokens[1], sim(vectors, vocab, tokens[0], tokens[1]))
    elif command == 'closest':
        rev_vocab = make_rev_vocab(vocab)
        n = int(sys.argv[3])
        if len(sys.argv) == 5:
            # Single query word supplied on the command line.
            for score, word in closest(vectors, vocab, rev_vocab, sys.argv[4], n):
                print(sys.argv[4], word, score)
        else:
            # Otherwise read one query word per line from stdin.
            for line in sys.stdin:
                tokens = line.strip().split()
                for score, word in closest(vectors, vocab, rev_vocab, tokens[0], n):
                    print(tokens[0], word, score)
    elif command == 'convert':
        # Re-save the loaded embeddings in the fast binary format.
        write_bin(sys.argv[3], vectors, vocab)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment