From 18f1683c29131111ebd22cdeb0b361e1fb02543a Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Sat, 13 Jun 2020 17:16:41 +0200
Subject: [PATCH] Using w2v on output from udpipe horizontal output instead of on raw text

---
Review notes (commentary only: this area between the three-dash line and the
diffstat is not part of the commit message and is not applied by git am).

What the patch does:
* Makefile: the `pretrain` target no longer depends on `texts`, and
  pretrainEmbeddings.py is given $(TRAIN_FILES) instead of train.txt.
* pretrainEmbeddings.py: the input file is first run through
  `udpipe --output=horizontal none`, redirected into `in.text`; word2vec is
  then trained on `in.text` rather than on the input file directly. The
  word2vec command line is now assembled from a list, and -threads goes
  from 2 to 4.
* train.sh: the W2V variable, the `test -f $W2V` block and the optional
  `--pretrainedEmbeddings` argument are removed from both `macaon train`
  invocations.

NOTE(review), to confirm before merging:
* udpipe is invoked through a path under ~/Documents, which looks specific to
  one machine. Only word2vec is looked up with which(); the exit status of
  the udpipe step is not checked before word2vec runs on in.text.
* Both Popen calls use stdout=PIPE / stderr=PIPE followed by wait() without
  reading the pipes. The Python subprocess documentation warns that this can
  deadlock once the child fills the pipe buffer; communicate() or DEVNULL
  avoids it.
* If $(TRAIN_FILES) expands to more than one path, the script's positional
  arguments would shift. The argument parsing is outside this patch; verify.
* The line structure of this file was rebuilt from a copy in which newlines
  and indentation had been collapsed. Hunk line counts were re-checked and
  match the @@ headers, but the leading whitespace inside the hunks (tabs in
  Makefile recipes, 2-space indentation in the Python and shell files) and
  the `texts:` rule/recipe line split are assumptions. Check them against
  the repository if the patch does not apply cleanly.

 UD_any/data/Makefile              |  4 ++--
 UD_any/data/pretrainEmbeddings.py | 21 ++++++++++++++++++++-
 UD_any/train.sh                   | 11 ++---------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 5e8ee60..6dedd6a 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -42,8 +42,8 @@ transitions: all_no_test.conllu
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
-pretrain: texts
-	./pretrainEmbeddings.py train.txt 64 pretrained.w2v
+pretrain:
+	./pretrainEmbeddings.py $(TRAIN_FILES) 64 pretrained.w2v
 
 $(FPLM_FILENAME): all_no_test.conllu
 	$(SCRIPTS)/conllu2fplm.py $< > $@
diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
index 11a777f..4eddbf0 100755
--- a/UD_any/data/pretrainEmbeddings.py
+++ b/UD_any/data/pretrainEmbeddings.py
@@ -21,6 +21,25 @@ if __name__ == "__main__" :
   if which("word2vec") is None :
     exit(0)
 
-  p = subprocess.Popen("word2vec -cbow 0 -size %s -window 10 -negative 5 -hs 0 -sample 1e-1 -threads 2 -binary 0 -iter 15 -min-count 2 -train %s -output %s"%(embeddingsSize, pathToFile, target), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+  p = subprocess.Popen("~/Documents/ud_pipe/src/udpipe --output=horizontal none %s > in.text"%pathToFile, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+  p.wait()
+
+  args = [
+    "word2vec",
+    "-cbow 0",
+    "-size %s"%embeddingsSize,
+    "-window 10",
+    "-negative 5",
+    "-hs 0",
+    "-sample 1e-1",
+    "-threads 4",
+    "-binary 0",
+    "-iter 15",
+    "-min-count 2",
+    "-train in.text",
+    "-output %s"%target,
+  ]
+
+  p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
   p.wait()
 
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 88c9112..064bbef 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -40,7 +40,6 @@ DEV=$EXPPATH"/data/dev.conllu"
 DEVRAW=$EXPPATH"/data/dev.txt"
 TEST=$EXPPATH"/data/test.conllu"
 TESTRAW=$EXPPATH"/data/test.txt"
-W2V=$EXPPATH"/data/pretrained.w2v"
 
 if test ! -f $TRAIN;
 then
@@ -67,19 +66,13 @@ if [ "$MODE" = "txt" ]; then
   fi
 fi
 
-if test -f $W2V;
-then
-  >&2 echo "Using W2V :" $W2V
-  W2V="--pretrainedEmbeddings "$W2V
-fi
-
 if [ "$MODE" = "tsv" ]; then
-macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1
+macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV "$@" || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1
+macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW "$@" || exit 1
 exit 0
 fi
 
-- 
GitLab