From 76da121d3c106affbd22a1d3b0ebc0a61b1c4137 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Thu, 19 May 2022 16:06:50 +0200 Subject: [PATCH] Added script to generate embeddings with lefff --- Dicts.py | 2 +- embeddings/.gitignore | 1 + .../generateLefffEmbeddings.py | 48 +++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 embeddings/.gitignore create mode 100755 embeddings/lefffEmbeddings/generateLefffEmbeddings.py diff --git a/Dicts.py b/Dicts.py index f1fe228..93dd56f 100644 --- a/Dicts.py +++ b/Dicts.py @@ -84,7 +84,7 @@ class Dicts : continue splited = line.split(' ') - word = splited[0] + word = splited[0].replace("◌", " ") if word not in self.dicts[col] : self.dicts[col][word] = (len(self.dicts[col]), 1) diff --git a/embeddings/.gitignore b/embeddings/.gitignore new file mode 100644 index 0000000..3628142 --- /dev/null +++ b/embeddings/.gitignore @@ -0,0 +1 @@ +*\.w2v diff --git a/embeddings/lefffEmbeddings/generateLefffEmbeddings.py b/embeddings/lefffEmbeddings/generateLefffEmbeddings.py new file mode 100755 index 0000000..edfd35e --- /dev/null +++ b/embeddings/lefffEmbeddings/generateLefffEmbeddings.py @@ -0,0 +1,48 @@ +#! 
#!/usr/bin/env python3
"""Generate a family of .w2v embedding files by driving lefff2w2v.py.

Each output file is produced by one shell command that runs the external
generator script with a particular combination of options (lefff lexicon
and/or CoNLL-U files, --binary, --minfreq, --lefffWeight) and redirects its
standard output to a file in the current directory.  The file name encodes
the options that were used, e.g. "lefff+conllu+binary0.1+minfreq2.w2v".
"""
import itertools
import subprocess
import sys

# External generator script; the leading "~" is expanded by the shell.
generate = "~/macaon_data/scripts/lefff2w2v.py"
lefffFile = "../../lefff.fplm"
# Raw string: the shell has to receive the backslash exactly as written, and
# "\." inside a normal string literal is an invalid escape sequence.
conlluFiles = r"../../data/UD_French-GSD_0/*\.conllu"

# Each option is a (command-line fragment, output-name fragment) pair; the
# empty pair means "option not given".
binaryOptions = [
    ("", ""),
    ("--binary 0.0", "+binary0.0"),
    ("--binary 0.1", "+binary0.1"),
    ("--binary 0.2", "+binary0.2"),
]
minfreqOptions = [
    ("", ""),
    ("--minfreq 2", "+minfreq2"),
    ("--minfreq 3", "+minfreq3"),
    ("--minfreq 4", "+minfreq4"),
]
lefffWeightOptions = [
    ("", ""),
    ("--lefffWeight 2", "+lefffWeight2"),
    ("--lefffWeight 3", "+lefffWeight3"),
    ("--lefffWeight 4", "+lefffWeight4"),
]


def buildCommands():
    """Return the ordered list of (output file name, shell command) pairs.

    The list contains, in this order: the two lefff-only variants, every
    lefff + conllu combination of binary/minfreq/lefffWeight, and every
    conllu-only combination of binary/minfreq.
    """
    conlluOption = "--conllu %s" % conlluFiles
    result = []

    # With lefff alone
    name = "lefff.w2v"
    result.append((name, "%s --lefff %s > %s" % (generate, lefffFile, name)))
    name = "lefff+binary0.0.w2v"
    result.append(
        (name, "%s --lefff %s --binary 0.0 > %s" % (generate, lefffFile, name))
    )

    # With lefff and conllu
    for binary, minfreq, lefffWeight in itertools.product(
        binaryOptions, minfreqOptions, lefffWeightOptions
    ):
        name = "lefff+conllu" + binary[1] + minfreq[1] + lefffWeight[1] + ".w2v"
        command = "%s --lefff %s %s %s %s %s > %s" % (
            generate, lefffFile, conlluOption,
            binary[0], minfreq[0], lefffWeight[0], name,
        )
        result.append((name, command))

    # Without lefff
    for binary, minfreq in itertools.product(binaryOptions, minfreqOptions):
        name = "conllu" + binary[1] + minfreq[1] + ".w2v"
        command = "%s %s %s %s > %s" % (
            generate, conlluOption, binary[0], minfreq[0], name,
        )
        result.append((name, command))

    return result


def runCommands(commands):
    """Run every (name, command) pair through the shell, one after another.

    A progress line is rewritten in place on standard output before each
    command.  Whatever a command writes to its standard error is forwarded
    to this process's standard error; a failing command is reported but does
    not stop the remaining ones.
    """
    for nbDone, (name, command) in enumerate(commands):
        # "\r" + 80 spaces + "\r" wipes the previous progress line.
        progress = "\r%s\r%5.2f%% Generating %s" % (
            " " * 80, 100 * nbDone / len(commands), name,
        )
        print(progress, end="", flush=True)

        # shell=True is needed: the commands rely on "~" expansion, globbing
        # and ">" redirection.  run() waits for the child and closes the pipes.
        finished = subprocess.run(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        err = finished.stderr.decode(errors="replace")
        if err:
            print(err, file=sys.stderr)
        elif finished.returncode != 0:
            print(
                "Command exited with status %d: %s" % (finished.returncode, command),
                file=sys.stderr,
            )


def main():
    """Generate every embedding file, then terminate the progress line."""
    runCommands(commands)
    print()


# Building the list is pure string formatting, so it is safe at import time.
commands = buildCommands()

if __name__ == "__main__":
    main()