From 2b06219898adf5e1fb560bcebcbc49e83d5d356b Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Mon, 1 Jun 2020 18:49:11 +0200
Subject: [PATCH] If w2v is installed on the machine, the use of pretrained
 embeddings is now the default behaviour

---
 UD_any/data/pretrainEmbeddings.py |  4 ++++
 UD_any/train.sh                   | 11 +++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
index d0a96b4..f6d3dfa 100755
--- a/UD_any/data/pretrainEmbeddings.py
+++ b/UD_any/data/pretrainEmbeddings.py
@@ -3,6 +3,7 @@
 import sys
 import os
 import subprocess
+from shutil import which
 
 def printUsageAndExit() :
   print("USAGE : %s file.conllu embeddingsSize"%sys.argv[0], file=sys.stderr)
@@ -17,6 +18,9 @@ if __name__ == "__main__" :
   splited = os.path.splitext(pathToFile)
   target = splited[0] + ".w2v"
 
+  if which("word2vec") is None :  # probe the executable actually launched below; skip pretraining when it is absent
+    sys.exit(0)  # exit status 0: missing word2vec is not an error, embeddings are simply not produced
+
   p = subprocess.Popen("word2vec -cbow 0 -size %s -window 10 -negative 5 -hs 0 -sample 1e-1 -threads 2 -binary 0 -iter 15 -min-count 2 -train %s -output %s"%(embeddingsSize, pathToFile, target), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
   p.wait()
 
diff --git a/UD_any/train.sh b/UD_any/train.sh
index a611cad..8747a28 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -44,6 +44,7 @@ DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
 TEST=$(find $CORPUS -type f -name '*test*.conllu')
 TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
 MCD=$(find $CORPUS -type f -name '*.mcd')
+W2V=$(find $CORPUS -type f -name '*.w2v') # empty when no .w2v file exists under $CORPUS (presumably written by pretrainEmbeddings.py - confirm)
 
 if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST" || has_space "$MCD";
 then
@@ -78,13 +79,19 @@ fi
 
 >&2 echo "Using MCD :" $MCD
 
+if test -f "$W2V"; then # quoted: with an empty unquoted $W2V, 'test -f' has one argument and succeeds
+	>&2 echo "Using W2V :" $W2V
+	W2V="--pretrainedEmbeddings "$W2V
+else W2V="" # no single embeddings file: make sure nothing stray reaches the macaon command line
+fi
+
 if [ "$MODE" = "tsv" ]; then
-macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV "$@" || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW "$@" || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1
 exit 0
 fi
 
-- 
GitLab