Commit 70f6b20a authored by Franck Dary

Using GloVe by default to pretrain word embeddings

parent 19a30264
[submodule "GloVe"]
	path = GloVe
	url = https://github.com/stanfordnlp/GloVe
Subproject commit 5187fa82f35348dab5d0d9d9af1ad70449a427d2
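GloVe is vendored as a git submodule, so a fresh checkout needs the submodule fetched before pretrainEmbeddings.sh can build it. A minimal sketch using standard git commands (the clone URL is a placeholder):

    git submodule update --init GloVe
    # or, when cloning from scratch:
    git clone --recursive <repository-url>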
@@ -48,7 +48,7 @@ texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
pretrain:
./pretrainEmbeddings.py $(TRAIN_FILES) 64 pretrained.w2v
./pretrainEmbeddings.sh $(TRAIN_FILES) 64 pretrained.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 )
$(FPLM_FILENAME): all_no_test.conllu
$(SCRIPTS)/conllu2fplm.py $< > $@
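The new pretrain recipe redirects the script's stderr to pretrain_log.err, so a successful run stays quiet; the log is printed, and the target fails, only when pretrainEmbeddings.sh exits with a nonzero status.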
#! /usr/bin/env python3

import sys
import subprocess
from shutil import which

def printUsageAndExit() :
    print("USAGE : %s file.conllu embeddingsSize outputFile"%sys.argv[0], file=sys.stderr)
    exit(1)

if __name__ == "__main__" :
    if len(sys.argv) != 4 :
        printUsageAndExit()
    pathToFile = sys.argv[1]
    embeddingsSize = int(sys.argv[2])
    target = sys.argv[3]
    # Both external tools must be on the PATH; exit with status 0 so a
    # missing tool skips pretraining instead of failing the whole build.
    if which("word2vec") is None :
        print("word2vec not installed")
        exit(0)
    if which("udpipe") is None :
        print("udpipe not installed")
        exit(0)
    # Dump the CoNLL-U corpus as raw tokenized text, one sentence per line.
    p = subprocess.Popen("udpipe --output=horizontal none %s > in.text"%pathToFile, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    p.wait()
    # Train skip-gram embeddings on the raw text, then delete the
    # intermediate file (the trailing "&& rm in.text" runs in the shell).
    args = [
        "word2vec",
        "-cbow 0",
        "-size %s"%embeddingsSize,
        "-window 10",
        "-negative 5",
        "-hs 0",
        "-sample 1e-1",
        "-threads 4",
        "-binary 0",
        "-iter 15",
        "-min-count 2",
        "-train in.text",
        "-output %s"%target,
        "&& rm in.text"
    ]
    p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    p.wait()
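For reference, the pretrain target formerly invoked this word2vec-based script directly; it now calls the GloVe shell script below instead. A hedged usage sketch, with train.conllu standing in for $(TRAIN_FILES):

    ./pretrainEmbeddings.py train.conllu 64 pretrained.w2v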
#! /usr/bin/env bash

# Pretrain word embeddings with GloVe: extract raw tokenized text from the
# CoNLL-U input with udpipe, then run the GloVe pipeline
# (vocab_count -> cooccur -> shuffle -> glove) and keep the text-format vectors.
GLOVE="../../../../GloVe/"

if [ "$#" -ne 3 ]; then
    >&2 echo "USAGE : $0 input.conllu embeddingsSize output.w2v"
    exit 1
fi

CURDIR="$(pwd)"

# Build GloVe if necessary, then run every stage; the chain stops at the
# first failing command.
cd "$GLOVE" && make && cd "$CURDIR" \
&& udpipe --output=horizontal none "$1" > in.text \
&& "$GLOVE"build/vocab_count -min-count 2 < in.text > vocab.txt \
&& "$GLOVE"build/cooccur -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < in.text > cooccurrences.bin \
&& "$GLOVE"build/shuffle -memory 8.0 -seed 100 < cooccurrences.bin > cooccurrence.shuf.bin \
&& "$GLOVE"build/glove -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size "$2" -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
&& mv out.txt "$3"
STATUS=$?

# Remove intermediate files whether or not training succeeded (cooccur names
# its overflow chunks after the -overflow-file prefix, hence tempoverflow_*).
rm in.text vocab.txt cooccurrences.bin cooccurrence.shuf.bin tempoverflow_*.bin 2> /dev/null

# Propagate the pipeline's status so callers such as the Makefile's pretrain
# target can detect failure.
exit $STATUS
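A usage sketch mirroring the Makefile's pretrain target (train.conllu again stands in for $(TRAIN_FILES)):

    ./pretrainEmbeddings.sh train.conllu 64 pretrained.w2v 2> pretrain_log.err

Here 64 is the embedding dimension passed to glove's -vector-size, and -write-header 1 prepends the word2vec-style "vocab_size vector_size" line, which is why the text file GloVe writes can simply be renamed to the .w2v output.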
@@ -32,7 +32,8 @@ if [ ! -d "$EXPPATH" ]; then
fi
CURDIR=$(pwd)
cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR
cd $EXPPATH"/"data && make -s clean && make -s
cd $CURDIR
TRAIN=$EXPPATH"/data/train.conllu"
TRAINRAW=$EXPPATH"/data/train.txt"
@@ -43,6 +44,7 @@ TESTRAW=$EXPPATH"/data/test.txt"
if test ! -f $TRAIN;
then
pwd
>&2 echo "ERROR : no train file found in" $EXPPATH
>&2 echo "$TRAIN"
print_usage_and_exit
@@ -527,6 +527,8 @@ def evaluate(gold_ud, system_ud) :
result["UAS"] = alignment_score(alignment, lambda w, ga : ga(w.parent))
if "DEPREL" in col2index :
result["LAS"] = alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[col2index["DEPREL"]]))
if "DEPREL" in col2index and "UPOS" in col2index and "FEATS" in col2index :
result["MLAS"] = alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[col2index["DEPREL"]], w.columns[col2index["UPOS"]], w.columns[col2index["FEATS"]], [(ga(c), c.columns[col2index["DEPREL"]], c.columns[col2index["UPOS"]], c.columns[col2index["FEATS"]]) for c in w.functional_children]), filter_fn=lambda w: w.is_content_deprel)
if "ID" in col2index :
result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences)
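MLAS (morphology-aware labeled attachment score, from the CoNLL 2018 shared task) extends LAS: an aligned word is counted as correct only if its head, DEPREL, UPOS and FEATS all match, along with the same tuple for each of its functional children, and only words bearing a content dependency relation are scored (the filter_fn above).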