From b17524f7a8e55ec9d82cc80e7753e275999d912f Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Sun, 17 May 2020 23:06:50 +0200
Subject: [PATCH] Added script to pretrain embeddings on w2v

---
 UD_any/data/Makefile              |  5 ++++-
 UD_any/data/pretrainEmbeddings.py | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100755 UD_any/data/pretrainEmbeddings.py

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 926906c..ba99664 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -12,7 +12,7 @@ TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
 THRESHOLD=10
 FPLM_FILENAME=fplm
 
-all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns
+all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns pretrain ## Build every prerequisite, then remove col_*.txt and all_no_test.conllu
 	rm -f col_*\.txt
 	rm -f all_no_test.conllu
 
@@ -49,6 +49,9 @@ columns: all_no_test.conllu $(MCD)
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
+pretrain: texts ## Train w2v embeddings (size 64) for each *train*.txt file found under CORPUS
+	for f in $$(find $(CORPUS) -type f -name '*train*.txt'); do ./pretrainEmbeddings.py "$$f" 64 || exit 1; done
+
 $(FPLM_FILENAME): all_no_test.conllu $(MCD)
 	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
 
diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
new file mode 100755
index 0000000..d0a96b4
--- /dev/null
+++ b/UD_any/data/pretrainEmbeddings.py
@@ -0,0 +1,22 @@
+#! /usr/bin/env python3
+"""Run word2vec on a text file; the embeddings are written next to it as <input stem>.w2v."""
+import sys
+import os
+import subprocess
+
+def printUsageAndExit() :
+  print("USAGE : %s file.txt embeddingsSize"%sys.argv[0], file=sys.stderr)
+  exit(1)
+
+if __name__ == "__main__" :
+  if len(sys.argv) != 3 :
+    printUsageAndExit()
+
+  pathToFile = sys.argv[1]
+  embeddingsSize = int(sys.argv[2])
+  target = os.path.splitext(pathToFile)[0] + ".w2v"
+  # Argument list instead of a shell string, so paths with spaces or shell metacharacters are passed intact.
+  command = ["word2vec", "-cbow", "0", "-size", str(embeddingsSize), "-window", "10", "-negative", "5", "-hs", "0", "-sample", "1e-1", "-threads", "2", "-binary", "0", "-iter", "15", "-min-count", "2", "-train", pathToFile, "-output", target]
+  # stdout is discarded via DEVNULL, not an unread PIPE (a full pipe would block the child); exit status is propagated.
+  sys.exit(subprocess.call(command, stdout=subprocess.DEVNULL))
+
-- 
GitLab