From 18f1683c29131111ebd22cdeb0b361e1fb02543a Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Sat, 13 Jun 2020 17:16:41 +0200
Subject: [PATCH] Using w2v on output from udpipe horizontal output instead of
 on raw text

---
 UD_any/data/Makefile              |  4 ++--
 UD_any/data/pretrainEmbeddings.py | 21 ++++++++++++++++++++-
 UD_any/train.sh                   | 11 ++---------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 5e8ee60..6dedd6a 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -42,8 +42,8 @@ transitions: all_no_test.conllu
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
-pretrain: texts
-	./pretrainEmbeddings.py train.txt 64 pretrained.w2v
+pretrain:
+	./pretrainEmbeddings.py $(TRAIN_FILES) 64 pretrained.w2v
 
 $(FPLM_FILENAME): all_no_test.conllu
 	$(SCRIPTS)/conllu2fplm.py $< > $@
diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
index 11a777f..4eddbf0 100755
--- a/UD_any/data/pretrainEmbeddings.py
+++ b/UD_any/data/pretrainEmbeddings.py
@@ -21,6 +21,25 @@ if __name__ == "__main__" :
   if which("word2vec") is None :
     exit(0)
 
-  p = subprocess.Popen("word2vec -cbow 0 -size %s -window 10 -negative 5 -hs 0 -sample 1e-1 -threads 2 -binary 0 -iter 15 -min-count 2 -train %s -output %s"%(embeddingsSize, pathToFile, target), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+  p = subprocess.Popen("~/Documents/ud_pipe/src/udpipe --output=horizontal none %s > in.text"%pathToFile, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+  p.wait()
+
+  args = [
+    "word2vec",
+    "-cbow 0",
+    "-size %s"%embeddingsSize,
+    "-window 10",
+    "-negative 5",
+    "-hs 0",
+    "-sample 1e-1",
+    "-threads 4",
+    "-binary 0",
+    "-iter 15",
+    "-min-count 2",
+    "-train in.text",
+    "-output %s"%target,
+  ]
+
+  p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
   p.wait()
 
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 88c9112..064bbef 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -40,7 +40,6 @@ DEV=$EXPPATH"/data/dev.conllu"
 DEVRAW=$EXPPATH"/data/dev.txt"
 TEST=$EXPPATH"/data/test.conllu"
 TESTRAW=$EXPPATH"/data/test.txt"
-W2V=$EXPPATH"/data/pretrained.w2v"
 
 if test ! -f $TRAIN;
 then
@@ -67,19 +66,13 @@ if [ "$MODE" = "txt" ]; then
 	fi
 fi
 
-if test -f $W2V;
-then
-	>&2 echo "Using W2V :" $W2V
-	W2V="--pretrainedEmbeddings "$W2V
-fi
-
 if [ "$MODE" = "tsv" ]; then
-macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1
+macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV "$@" || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1
+macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW "$@" || exit 1
 exit 0
 fi
 
-- 
GitLab