diff --git a/Dicts.py b/Dicts.py
index f1fe2288e2ed197048b700012ddc18dcb0aae219..93dd56f0466521aa829961158ef7fc8aff91c817 100644
--- a/Dicts.py
+++ b/Dicts.py
@@ -84,7 +84,7 @@ class Dicts :
           continue
 
         splited = line.split(' ')
-        word = splited[0]
+        word = splited[0].replace("◌", " ")
         if word not in self.dicts[col] :
           self.dicts[col][word] = (len(self.dicts[col]), 1)
 
diff --git a/embeddings/.gitignore b/embeddings/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..36281428d3201e30d2c342aa2145f52222a1f60b
--- /dev/null
+++ b/embeddings/.gitignore
@@ -0,0 +1 @@
+*\.w2v
diff --git a/embeddings/lefffEmbeddings/generateLefffEmbeddings.py b/embeddings/lefffEmbeddings/generateLefffEmbeddings.py
new file mode 100755
index 0000000000000000000000000000000000000000..edfd35ec82640acdcf7dd8f719128b616b2b6a99
--- /dev/null
+++ b/embeddings/lefffEmbeddings/generateLefffEmbeddings.py
@@ -0,0 +1,70 @@
+#! /usr/bin/env python3
+"""Generate every lefff/conllu embeddings variant by running lefff2w2v.py.
+
+Each variant is written to a ``.w2v`` file in the current directory; the file
+name encodes the options that were used to produce it.
+"""
+import itertools
+import subprocess
+import sys
+
+generate = "~/macaon_data/scripts/lefff2w2v.py"
+lefffFile = "../../lefff.fplm"
+# Raw string: the backslash must reach the shell untouched, where "\." is just
+# a literal dot. A plain string would trigger an invalid-escape warning.
+conlluFiles = r"../../data/UD_French-GSD_0/*\.conllu"
+
+# Each option is a (command line flag, output file name suffix) pair; the empty
+# pair means "option not given".
+BINARY = [("", "")] + [("--binary %s" % v, "+binary%s" % v) for v in ("0.0", "0.1", "0.2")]
+MINFREQ = [("", "")] + [("--minfreq %d" % n, "+minfreq%d" % n) for n in (2, 3, 4)]
+LEFFF_WEIGHT = [("", "")] + [("--lefffWeight %d" % n, "+lefffWeight%d" % n) for n in (2, 3, 4)]
+
+
+def buildCommands():
+    """Return the (output file name, shell command) pairs, in run order."""
+    conlluOption = "--conllu %s" % conlluFiles
+    commands = []
+
+    # With lefff alone
+    name = "lefff.w2v"
+    commands.append((name, "%s --lefff %s > %s" % (generate, lefffFile, name)))
+    name = "lefff+binary0.0.w2v"
+    commands.append((name, "%s --lefff %s --binary 0.0 > %s" % (generate, lefffFile, name)))
+
+    # With lefff and conllu
+    for binary, minfreq, lefffWeight in itertools.product(BINARY, MINFREQ, LEFFF_WEIGHT):
+        name = "lefff+conllu" + binary[1] + minfreq[1] + lefffWeight[1] + ".w2v"
+        command = "%s --lefff %s %s %s %s %s > %s" % (
+            generate, lefffFile, conlluOption, binary[0], minfreq[0], lefffWeight[0], name)
+        commands.append((name, command))
+
+    # Without lefff
+    for binary, minfreq in itertools.product(BINARY, MINFREQ):
+        name = "conllu" + binary[1] + minfreq[1] + ".w2v"
+        command = "%s %s %s %s > %s" % (generate, conlluOption, binary[0], minfreq[0], name)
+        commands.append((name, command))
+
+    return commands
+
+
+def main():
+    """Run every command, showing progress and forwarding errors to stderr."""
+    commands = buildCommands()
+    for nbDone, (name, command) in enumerate(commands):
+        print("\r%s\r%5.2f%% Generating %s" % (" " * 80, 100 * nbDone / len(commands), name), end="")
+        sys.stdout.flush()
+
+        # shell=True is needed for '~', the glob and the '>' redirection; the
+        # commands are built only from the constants above, never user input.
+        # run() waits for the child, so commands execute one at a time.
+        result = subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
+        err = result.stderr.decode()
+        if err:
+            print(err, file=sys.stderr)
+
+    print()
+
+
+if __name__ == "__main__":
+    main()