Skip to content
Snippets Groups Projects
Commit b17524f7 authored by Franck Dary's avatar Franck Dary
Browse files

Added script to pretrain embeddings on w2v

parent d92690d4
No related branches found
No related tags found
No related merge requests found
......@@ -12,7 +12,7 @@ TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
THRESHOLD=10
FPLM_FILENAME=fplm
all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns
all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns pretrain
rm -f col_*\.txt
rm -f all_no_test.conllu
......@@ -49,6 +49,9 @@ columns: all_no_test.conllu $(MCD)
# Run getRawText.py on the train, dev and test file lists, passing $(CONLL2TXT)
# as its first argument.
# NOTE(review): presumably this writes one raw-text .txt file per CoNLL-U input
# (the `pretrain` rule later searches for '*train*.txt') — confirm in getRawText.py.
texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
# Train word2vec embeddings (vector size 64) on every '*train*.txt' file found
# under $(CORPUS). Depends on `texts`, which presumably generates those files.
# pretrainEmbeddings.py accepts exactly one input file per invocation, so the
# files are fed to it one at a time (read line by line so that paths containing
# spaces survive); the first failing run aborts the recipe with a non-zero status.
pretrain: texts
	find $(CORPUS) -type f -name '*train*.txt' | while IFS= read -r f; do ./pretrainEmbeddings.py "$$f" 64 || exit 1; done
$(FPLM_FILENAME): all_no_test.conllu $(MCD)
$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
......
#! /usr/bin/env python3
import sys
import os
import subprocess
def printUsageAndExit() :
    """Print the command-line usage on stderr and terminate with exit status 1."""
    print("USAGE : %s file.conllu embeddingsSize"%sys.argv[0], file=sys.stderr)
    # Use sys.exit rather than the bare exit() builtin: the latter is injected
    # by the site module and is absent under `python -S` or in frozen builds.
    sys.exit(1)
if __name__ == "__main__" :
    # Exactly two arguments are expected: the training file and the vector size.
    if len(sys.argv) != 3 :
        printUsageAndExit()

    pathToFile = sys.argv[1]
    try :
        embeddingsSize = int(sys.argv[2])
    except ValueError :
        # A non-numeric size is a usage error, not a reason for a traceback.
        printUsageAndExit()

    # The vectors are written next to the input, with its extension replaced by ".w2v".
    target = os.path.splitext(pathToFile)[0] + ".w2v"

    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed to word2vec verbatim.
    command = [
        "word2vec",
        "-cbow", "0",
        "-size", str(embeddingsSize),
        "-window", "10",
        "-negative", "5",
        "-hs", "0",
        "-sample", "1e-1",
        "-threads", "2",
        "-binary", "0",
        "-iter", "15",
        "-min-count", "2",
        "-train", pathToFile,
        "-output", target,
    ]

    # stdout is discarded (it was never read before) and stderr is drained by
    # subprocess.run, so the child can no longer block on a full pipe the way
    # Popen(..., stdout=PIPE, stderr=PIPE) followed by wait() could.
    try :
        completed = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    except OSError as error :
        print("ERROR : unable to run word2vec : %s"%error, file=sys.stderr)
        sys.exit(1)

    # Surface failures so that the calling Makefile stops instead of silently
    # continuing without embeddings.
    if completed.returncode != 0 :
        sys.stderr.write(completed.stderr.decode(errors="replace"))
        sys.exit(1)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment