Make pretrained word2vec embeddings optional in the UD_any training pipeline.

pretrainEmbeddings.py: before launching word2vec, check that the `word2vec`
executable (the binary the command line actually invokes) is on PATH; when it
is missing, warn on stderr and exit with status 0 without writing the .w2v file.

train.sh: look for a *.w2v file in the corpus directory and, only when exactly
one regular file is found, forward it to macaon via --pretrainedEmbeddings.
Otherwise W2V is cleared so that no stray argument reaches macaon.

Note: the post-image blob hashes on the `index` lines below are those of the
original change and do not reflect the corrected added lines.

diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
index d0a96b4e33d767c4627dea68d2f6ab4aceca441e..f6d3dfa2dda116a4b2a180f6249e366c8d6f5e2d 100755
--- a/UD_any/data/pretrainEmbeddings.py
+++ b/UD_any/data/pretrainEmbeddings.py
@@ -3,6 +3,7 @@
 import sys
 import os
 import subprocess
+from shutil import which
 
 def printUsageAndExit() :
   print("USAGE : %s file.conllu embeddingsSize"%sys.argv[0], file=sys.stderr)
@@ -17,6 +18,11 @@ if __name__ == "__main__" :
   splited = os.path.splitext(pathToFile)
   target = splited[0] + ".w2v"
 
+  # Pretraining is best-effort: probe for the same binary the command below runs.
+  if which("word2vec") is None :
+    print("WARNING : word2vec not found in PATH, skipping embeddings pretraining", file=sys.stderr)
+    sys.exit(0)
+
   p = subprocess.Popen("word2vec -cbow 0 -size %s -window 10 -negative 5 -hs 0 -sample 1e-1 -threads 2 -binary 0 -iter 15 -min-count 2 -train %s -output %s"%(embeddingsSize, pathToFile, target), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
   p.wait()
 
diff --git a/UD_any/train.sh b/UD_any/train.sh
index a611cad7dc803801973012c352202b00dd36f17f..8747a2812d90e277f4e9a7aa4778edc56022b9d4 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -44,6 +44,7 @@ DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
 TEST=$(find $CORPUS -type f -name '*test*.conllu')
 TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
 MCD=$(find $CORPUS -type f -name '*.mcd')
+W2V=$(find $CORPUS -type f -name '*.w2v')
 
 if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST" || has_space "$MCD";
 then
@@ -78,13 +79,24 @@ fi
 
 >&2 echo "Using MCD :" $MCD
 
+# "$W2V" must be quoted: unquoted and empty, `test -f` gets a single argument
+# and succeeds, which would pass a bare --pretrainedEmbeddings to macaon.
+if test -f "$W2V";
+then
+  >&2 echo "Using W2V :" $W2V
+  W2V="--pretrainedEmbeddings "$W2V
+else
+  # No usable embeddings file (none found, or several): pass nothing extra.
+  W2V=""
+fi
+
 if [ "$MODE" = "tsv" ]; then
-macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV "$@" || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW "$@" || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1
 exit 0
 fi
 