Skip to content
Snippets Groups Projects
Commit 18f1683c authored by Franck Dary
Browse files

Using w2v on udpipe's horizontal output instead of on raw text

parent 26f25b22
No related branches found
No related tags found
No related merge requests found
...@@ -42,8 +42,8 @@ transitions: all_no_test.conllu ...@@ -42,8 +42,8 @@ transitions: all_no_test.conllu
texts: texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
pretrain: texts pretrain:
./pretrainEmbeddings.py train.txt 64 pretrained.w2v ./pretrainEmbeddings.py $(TRAIN_FILES) 64 pretrained.w2v
$(FPLM_FILENAME): all_no_test.conllu $(FPLM_FILENAME): all_no_test.conllu
$(SCRIPTS)/conllu2fplm.py $< > $@ $(SCRIPTS)/conllu2fplm.py $< > $@
......
...@@ -21,6 +21,25 @@ if __name__ == "__main__" : ...@@ -21,6 +21,25 @@ if __name__ == "__main__" :
if which("word2vec") is None : if which("word2vec") is None :
exit(0) exit(0)
p = subprocess.Popen("word2vec -cbow 0 -size %s -window 10 -negative 5 -hs 0 -sample 1e-1 -threads 2 -binary 0 -iter 15 -min-count 2 -train %s -output %s"%(embeddingsSize, pathToFile, target), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) p = subprocess.Popen("~/Documents/ud_pipe/src/udpipe --output=horizontal none %s > in.text"%pathToFile, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
p.wait()
args = [
"word2vec",
"-cbow 0",
"-size %s"%embeddingsSize,
"-window 10",
"-negative 5",
"-hs 0",
"-sample 1e-1",
"-threads 4",
"-binary 0",
"-iter 15",
"-min-count 2",
"-train in.text",
"-output %s"%target,
]
p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
p.wait() p.wait()
...@@ -40,7 +40,6 @@ DEV=$EXPPATH"/data/dev.conllu" ...@@ -40,7 +40,6 @@ DEV=$EXPPATH"/data/dev.conllu"
DEVRAW=$EXPPATH"/data/dev.txt" DEVRAW=$EXPPATH"/data/dev.txt"
TEST=$EXPPATH"/data/test.conllu" TEST=$EXPPATH"/data/test.conllu"
TESTRAW=$EXPPATH"/data/test.txt" TESTRAW=$EXPPATH"/data/test.txt"
W2V=$EXPPATH"/data/pretrained.w2v"
if test ! -f $TRAIN; if test ! -f $TRAIN;
then then
...@@ -67,19 +66,13 @@ if [ "$MODE" = "txt" ]; then ...@@ -67,19 +66,13 @@ if [ "$MODE" = "txt" ]; then
fi fi
fi fi
if test -f $W2V;
then
>&2 echo "Using W2V :" $W2V
W2V="--pretrainedEmbeddings "$W2V
fi
if [ "$MODE" = "tsv" ]; then if [ "$MODE" = "tsv" ]; then
macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1 macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV "$@" || exit 1
exit 0 exit 0
fi fi
if [ "$MODE" = "txt" ]; then if [ "$MODE" = "txt" ]; then
macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1 macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW "$@" || exit 1
exit 0 exit 0
fi fi
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment