Skip to content
Snippets Groups Projects
Commit 76da121d authored by Franck Dary's avatar Franck Dary
Browse files

Added script to generate embeddings with lefff

parent d9fc23a0
No related branches found
No related tags found
No related merge requests found
......@@ -84,7 +84,7 @@ class Dicts :
continue
splited = line.split(' ')
word = splited[0]
word = splited[0].replace("", " ")
if word not in self.dicts[col] :
self.dicts[col][word] = (len(self.dicts[col]), 1)
......
*\.w2v
#! /usr/bin/env python3
import sys
import subprocess
# Path of the embedding generator. The leading "~" is expanded by the shell,
# since every command is executed with shell=True.
generate = "~/macaon_data/scripts/lefff2w2v.py"
lefffFile = "../../lefff.fplm"
# The backslash is part of the string handed to the shell (same value as the
# historical "*\.conllu" literal, minus the invalid-escape warning).
conlluFiles = "../../data/UD_French-GSD_0/*\\.conllu"

# Each option is a (command-line flag, output-name suffix) pair; the empty
# pair means "option not given".
BINARY_OPTIONS = [
    ("", ""),
    ("--binary 0.0", "+binary0.0"),
    ("--binary 0.1", "+binary0.1"),
    ("--binary 0.2", "+binary0.2"),
]
MINFREQ_OPTIONS = [
    ("", ""),
    ("--minfreq 2", "+minfreq2"),
    ("--minfreq 3", "+minfreq3"),
    ("--minfreq 4", "+minfreq4"),
]
LEFFF_WEIGHT_OPTIONS = [
    ("", ""),
    ("--lefffWeight 2", "+lefffWeight2"),
    ("--lefffWeight 3", "+lefffWeight3"),
    ("--lefffWeight 4", "+lefffWeight4"),
]


def build_commands():
    """Return the list of (output file name, shell command) pairs to run.

    Three families of commands are produced, in this order:
    lefff alone, lefff combined with the conllu files, and conllu files alone.
    Every command redirects the generator's standard output to its output file.
    """
    result = []

    # With lefff alone
    name = "lefff.w2v"
    result.append((name, "%s --lefff %s > %s" % (generate, lefffFile, name)))
    name = "lefff+binary0.0.w2v"
    result.append(
        (name, "%s --lefff %s --binary 0.0 > %s" % (generate, lefffFile, name))
    )

    # With lefff and conllu
    conllu_flag = "--conllu %s" % conlluFiles
    for binary in BINARY_OPTIONS:
        for minfreq in MINFREQ_OPTIONS:
            for lefffweight in LEFFF_WEIGHT_OPTIONS:
                name = (
                    "lefff" + "+conllu" + binary[1] + minfreq[1]
                    + lefffweight[1] + ".w2v"
                )
                command = "%s --lefff %s %s %s %s %s > %s" % (
                    generate, lefffFile, conllu_flag,
                    binary[0], minfreq[0], lefffweight[0], name,
                )
                result.append((name, command))

    # Without lefff
    for binary in BINARY_OPTIONS:
        for minfreq in MINFREQ_OPTIONS:
            name = "conllu" + binary[1] + minfreq[1] + ".w2v"
            command = "%s %s %s %s > %s" % (
                generate, conllu_flag, binary[0], minfreq[0], name,
            )
            result.append((name, command))

    return result


def run_commands(to_run):
    """Run every (name, command) pair through the shell, showing progress.

    Progress is rewritten in place on standard output; anything a command
    writes to its standard error is forwarded to this script's standard error.
    """
    total = len(to_run)
    for nb_done, (name, command) in enumerate(to_run):
        # "\r" + 80 blanks wipes the previous progress line before redrawing.
        print(
            "\r%s\r%5.2f%% Generating %s" % (" " * 80, 100 * nb_done / total, name),
            end="",
        )
        sys.stdout.flush()
        # run() waits for the child and closes its pipes, unlike a bare
        # Popen(...).stderr.read().
        completed = subprocess.run(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        err = completed.stderr.decode(errors="replace")
        if err:
            # Must be file=...: a positional sys.stderr is merely printed.
            print(err, file=sys.stderr)
    print()


commands = build_commands()

if __name__ == "__main__":
    run_commands(commands)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment