diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 926906cd7530888c29ec89a9203eb50533433b1f..ba996647fd7ab8eee980cea24703a76039743d66 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -12,7 +12,7 @@ TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
 THRESHOLD=10
 FPLM_FILENAME=fplm
 
-all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns
+all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns pretrain
 	rm -f col_*\.txt
 	rm -f all_no_test.conllu
 
@@ -49,6 +49,9 @@ columns: all_no_test.conllu $(MCD)
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
+pretrain: texts
+	./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64
+
 $(FPLM_FILENAME): all_no_test.conllu $(MCD)
 	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
 
diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
new file mode 100755
--- /dev/null
+++ b/UD_any/data/pretrainEmbeddings.py
@@ -0,0 +1,44 @@
+#! /usr/bin/env python3
+"""Pretrain word embeddings by running the word2vec command on a raw text file.
+
+The output path is the input path with its extension replaced by '.w2v'.
+Exits with status 1 on a usage error or when word2vec cannot be run or fails.
+"""
+
+import sys
+import os
+import subprocess
+
+def printUsageAndExit() :
+  """Print the expected command line on stderr and exit with status 1."""
+  print("USAGE : %s file.txt embeddingsSize"%sys.argv[0], file=sys.stderr)
+  sys.exit(1)
+
+if __name__ == "__main__" :
+  if len(sys.argv) != 3 :
+    printUsageAndExit()
+
+  pathToFile = sys.argv[1]
+  try :
+    embeddingsSize = int(sys.argv[2])
+  except ValueError :
+    printUsageAndExit()
+  target = os.path.splitext(pathToFile)[0] + ".w2v"
+
+  # Argument list, no shell: paths containing spaces or shell metacharacters are passed verbatim.
+  command = ["word2vec", "-cbow", "0", "-size", str(embeddingsSize), "-window", "10",
+             "-negative", "5", "-hs", "0", "-sample", "1e-1", "-threads", "2", "-binary", "0",
+             "-iter", "15", "-min-count", "2", "-train", pathToFile, "-output", target]
+
+  # stdout is discarded and stderr is drained by run(): waiting on pipes that nobody reads
+  # blocks forever once the child has filled the OS pipe buffer.
+  try :
+    result = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
+  except OSError as error :
+    print("ERROR : could not run word2vec : %s"%error, file=sys.stderr)
+    sys.exit(1)
+
+  if result.returncode != 0 :
+    sys.stderr.write(result.stderr.decode(errors="replace"))
+    print("ERROR : word2vec exited with status %d"%result.returncode, file=sys.stderr)
+    sys.exit(1)