diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile index 5e8ee609c87b3f1b8047f737594d53c2be6d95fd..6dedd6a3932c8dc2762b296eb2e846e795d99914 100644 --- a/UD_any/data/Makefile +++ b/UD_any/data/Makefile @@ -42,8 +42,8 @@ transitions: all_no_test.conllu texts: ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) -pretrain: texts - ./pretrainEmbeddings.py train.txt 64 pretrained.w2v +pretrain: + ./pretrainEmbeddings.py $(TRAIN_FILES) 64 pretrained.w2v $(FPLM_FILENAME): all_no_test.conllu $(SCRIPTS)/conllu2fplm.py $< > $@ diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py index 11a777ffd80a4a4cecb605a5cc66b208a5410e5c..4eddbf03de361fd8531f54d2a90852c679822dd4 100755 --- a/UD_any/data/pretrainEmbeddings.py +++ b/UD_any/data/pretrainEmbeddings.py @@ -21,6 +21,25 @@ if __name__ == "__main__" : if which("word2vec") is None : exit(0) - p = subprocess.Popen("word2vec -cbow 0 -size %s -window 10 -negative 5 -hs 0 -sample 1e-1 -threads 2 -binary 0 -iter 15 -min-count 2 -train %s -output %s"%(embeddingsSize, pathToFile, target), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen("~/Documents/ud_pipe/src/udpipe --output=horizontal none %s > in.text"%pathToFile, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + p.wait() + + args = [ + "word2vec", + "-cbow 0", + "-size %s"%embeddingsSize, + "-window 10", + "-negative 5", + "-hs 0", + "-sample 1e-1", + "-threads 4", + "-binary 0", + "-iter 15", + "-min-count 2", + "-train in.text", + "-output %s"%target, + ] + + p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) p.wait() diff --git a/UD_any/train.sh b/UD_any/train.sh index 88c91126a42501e8b81bd74be625d40f3abd76b9..064bbef6da152b3385b31f52bfac05d5c2387065 100755 --- a/UD_any/train.sh +++ b/UD_any/train.sh @@ -40,7 +40,6 @@ DEV=$EXPPATH"/data/dev.conllu" DEVRAW=$EXPPATH"/data/dev.txt" TEST=$EXPPATH"/data/test.conllu" TESTRAW=$EXPPATH"/data/test.txt" -W2V=$EXPPATH"/data/pretrained.w2v" if test ! -f $TRAIN; then @@ -67,19 +66,13 @@ if [ "$MODE" = "txt" ]; then fi fi -if test -f $W2V; -then - >&2 echo "Using W2V :" $W2V - W2V="--pretrainedEmbeddings "$W2V -fi - if [ "$MODE" = "tsv" ]; then -macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1 +macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV "$@" || exit 1 exit 0 fi if [ "$MODE" = "txt" ]; then -macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1 +macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW "$@" || exit 1 exit 0 fi