From b17524f7a8e55ec9d82cc80e7753e275999d912f Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Sun, 17 May 2020 23:06:50 +0200 Subject: [PATCH] Added script to pretrain embeddings on w2v --- UD_any/data/Makefile | 5 ++++- UD_any/data/pretrainEmbeddings.py | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100755 UD_any/data/pretrainEmbeddings.py diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile index 926906c..ba99664 100644 --- a/UD_any/data/Makefile +++ b/UD_any/data/Makefile @@ -12,7 +12,7 @@ TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu') THRESHOLD=10 FPLM_FILENAME=fplm -all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns +all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns pretrain rm -f col_*\.txt rm -f all_no_test.conllu @@ -49,6 +49,9 @@ columns: all_no_test.conllu $(MCD) texts: ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) +pretrain: texts + ./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64 + $(FPLM_FILENAME): all_no_test.conllu $(MCD) $(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@ diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py new file mode 100755 index 0000000..d0a96b4 --- /dev/null +++ b/UD_any/data/pretrainEmbeddings.py @@ -0,0 +1,22 @@ +#! 
#!/usr/bin/env python3
"""Pretrain word embeddings on a raw text file by running the word2vec tool.

Command line: pretrainEmbeddings.py file.txt embeddingsSize

The embeddings are written next to the input file, with the input's last
extension replaced by ".w2v". The script exits with a non-zero status when
the arguments are invalid or when word2vec cannot be run or reports failure.
"""

import sys
import os
import subprocess


def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s file.txt embeddingsSize" % sys.argv[0], file=sys.stderr)
    sys.exit(1)


def parseArguments(argv):
    """Validate the command line.

    argv is the full argument vector, program name included.
    Returns (pathToFile, embeddingsSize), or None when the argument count is
    wrong or embeddingsSize is not an integer.
    """
    if len(argv) != 3:
        return None
    try:
        return argv[1], int(argv[2])
    except ValueError:
        return None


def getTargetPath(pathToFile):
    """Return pathToFile with its last extension replaced by ".w2v"."""
    return os.path.splitext(pathToFile)[0] + ".w2v"


def buildWord2vecCommand(pathToFile, embeddingsSize):
    """Return the word2vec argument vector for training on pathToFile.

    The command is a list (one element per argument) so that it can be run
    without a shell: paths containing spaces or shell metacharacters are
    passed to word2vec untouched.
    """
    return [
        "word2vec",
        "-cbow", "0",
        "-size", str(embeddingsSize),
        "-window", "10",
        "-negative", "5",
        "-hs", "0",
        "-sample", "1e-1",
        "-threads", "2",
        "-binary", "0",
        "-iter", "15",
        "-min-count", "2",
        "-train", pathToFile,
        "-output", getTargetPath(pathToFile),
    ]


def main(argv):
    """Run word2vec for the given command line and return an exit status.

    Returns 0 on success and 1 when word2vec could not be started or exited
    with a non-zero status. Exits through printUsageAndExit() when argv is
    invalid.
    """
    arguments = parseArguments(argv)
    if arguments is None:
        printUsageAndExit()
    pathToFile, embeddingsSize = arguments

    command = buildWord2vecCommand(pathToFile, embeddingsSize)
    try:
        # stdout is discarded instead of piped: a pipe that nobody reads can
        # fill up and block the child forever. stderr is collected by run()
        # itself so it can be shown if the tool fails.
        completed = subprocess.run(
            command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
        )
    except OSError as error:
        # Typically raised when the word2vec executable is not on the PATH.
        print(
            "ERROR : could not run %s : %s" % (command[0], error),
            file=sys.stderr,
        )
        return 1

    if completed.returncode != 0:
        sys.stderr.write(completed.stderr.decode(errors="replace"))
        print(
            "ERROR : %s exited with status %d"
            % (command[0], completed.returncode),
            file=sys.stderr,
        )
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))