From 70f6b20ae6de0b4decced09d34f1f9e7839b623d Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Tue, 28 Jul 2020 17:56:01 +0200
Subject: [PATCH] Using GloVe by default to pretrain word embeddings

Add stanfordnlp/GloVe as a git submodule and use it, instead of
word2vec, to pretrain the word embeddings.

UD_any/data/pretrainEmbeddings.py is replaced by pretrainEmbeddings.sh,
which takes the same three arguments (input.conllu, embeddingsSize,
output file). The script builds the GloVe submodule, dumps the corpus
as horizontal text with udpipe, then chains vocab_count, cooccur,
shuffle and glove, and finally moves the resulting out.txt to the
requested output file. Intermediate files are removed whether or not
the chain succeeded. Apart from the usage error (exit 1) the script
always exits 0, so a failed pretraining run does not abort make.

The Makefile "pretrain" target now redirects the script's stderr to
pretrain_log.err and prints that log if the script reports a failure.

train.sh now runs "cd $CURDIR" as a separate command, so the working
directory is restored even when make fails, and prints the current
directory on stderr when the train file is missing.

conll18_ud_eval.py additionally reports MLAS when the DEPREL, UPOS and
FEATS columns are all present.
---
Review notes (this section is ignored by git am):
 * pretrainEmbeddings.sh: the cleanup glob now matches the prefix given
   to cooccur through "-overflow-file tempoverflow"; it used to be
   "overflow_*.bin", which never matches those files.
 * pretrainEmbeddings.sh: $GLOVE, $CURDIR, $1, $2 and $3 are quoted so
   that paths containing whitespace do not break the pipeline.
 * The "index" blob id of pretrainEmbeddings.sh predates these two
   changes.
 * This patch was recovered from a copy whose line breaks and
   indentation had been collapsed; the whitespace of context and
   removed lines is reconstructed and should be checked against the
   tree before applying.

 .gitmodules                       |  3 ++
 GloVe                             |  1 +
 UD_any/data/Makefile              |  2 +-
 UD_any/data/pretrainEmbeddings.py | 51 -------------------------------
 UD_any/data/pretrainEmbeddings.sh | 25 +++++++++++++++
 UD_any/train.sh                   |  4 ++-
 scripts/conll18_ud_eval.py        |  2 ++
 7 files changed, 35 insertions(+), 53 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 GloVe
 delete mode 100755 UD_any/data/pretrainEmbeddings.py
 create mode 100755 UD_any/data/pretrainEmbeddings.sh

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..1298720
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "GloVe"]
+	path = GloVe
+	url = https://github.com/stanfordnlp/GloVe
diff --git a/GloVe b/GloVe
new file mode 160000
index 0000000..5187fa8
--- /dev/null
+++ b/GloVe
@@ -0,0 +1 @@
+Subproject commit 5187fa82f35348dab5d0d9d9af1ad70449a427d2
diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index f8896e0..d5082f5 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -48,7 +48,7 @@ texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
 pretrain:
-	./pretrainEmbeddings.py $(TRAIN_FILES) 64 pretrained.w2v
+	./pretrainEmbeddings.sh $(TRAIN_FILES) 64 pretrained.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 )
 
 $(FPLM_FILENAME): all_no_test.conllu
 	$(SCRIPTS)/conllu2fplm.py $< > $@
diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
deleted file mode 100755
index 7e1d9f1..0000000
--- a/UD_any/data/pretrainEmbeddings.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#! /usr/bin/env python3
-
-import sys
-import os
-import subprocess
-from shutil import which
-
-def printUsageAndExit() :
-  print("USAGE : %s file.conllu embeddingsSize outputFile"%sys.argv[0], file=sys.stderr)
-  exit(1)
-
-if __name__ == "__main__" :
-  if len(sys.argv) != 4 :
-    printUsageAndExit()
-
-  pathToFile = sys.argv[1]
-  embeddingsSize = int(sys.argv[2])
-  splited = os.path.splitext(pathToFile)
-  target = sys.argv[3]
-
-  if which("word2vec") is None :
-    print("word2vec not installed")
-    exit(0)
-
-  if which("udpipe") is None :
-    print("udpipe not installed")
-    exit(0)
-
-  p = subprocess.Popen("udpipe --output=horizontal none %s > in.text"%pathToFile, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
-  p.wait()
-
-  args = [
-    "word2vec",
-    "-cbow 0",
-    "-size %s"%embeddingsSize,
-    "-window 10",
-    "-negative 5",
-    "-hs 0",
-    "-sample 1e-1",
-    "-threads 4",
-    "-binary 0",
-    "-iter 15",
-    "-min-count 2",
-    "-train in.text",
-    "-output %s"%target,
-    "&& rm in.text"
-  ]
-
-  p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
-  p.wait()
-
diff --git a/UD_any/data/pretrainEmbeddings.sh b/UD_any/data/pretrainEmbeddings.sh
new file mode 100755
index 0000000..dc84f55
--- /dev/null
+++ b/UD_any/data/pretrainEmbeddings.sh
@@ -0,0 +1,25 @@
+#! /usr/bin/env bash
+
+GLOVE="../../../../GloVe/"
+
+if [ "$#" -ne 3 ]; then
+  echo "USAGE : $0 input.conllu embeddingsSize output.w2v"
+  exit 1
+fi
+
+CURDIR="$(pwd)"
+cd "$GLOVE" && make && cd "$CURDIR" \
+&& udpipe --output=horizontal none "$1" > in.text \
+&& "${GLOVE}build/vocab_count" -min-count 2 < in.text > vocab.txt \
+&& "${GLOVE}build/cooccur" -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < in.text > cooccurrences.bin \
+&& "${GLOVE}build/shuffle" -memory 8.0 -seed 100 < cooccurrences.bin > cooccurrence.shuf.bin \
+&& "${GLOVE}build/glove" -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size "$2" -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
+&& mv out.txt "$3"
+
+rm in.text 2> /dev/null
+rm vocab.txt 2> /dev/null
+rm cooccurrences.bin 2> /dev/null
+rm cooccurrence.shuf.bin 2> /dev/null
+rm tempoverflow_*.bin 2> /dev/null
+
+exit 0
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 064bbef..da5ac4a 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -32,7 +32,8 @@ if [ ! -d "$EXPPATH" ]; then
 fi
 
 CURDIR=$(pwd)
-cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR
+cd $EXPPATH"/"data && make -s clean && make -s
+cd $CURDIR
 
 TRAIN=$EXPPATH"/data/train.conllu"
 TRAINRAW=$EXPPATH"/data/train.txt"
@@ -43,6 +44,7 @@ TESTRAW=$EXPPATH"/data/test.txt"
 
 if test ! -f $TRAIN;
 then
+  pwd >&2
   echo "ERROR : no train file found in" $EXPPATH >&2
   echo "$TRAIN"
   print_usage_and_exit
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index 7d1f1ff..d57179a 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -527,6 +527,8 @@ def evaluate(gold_ud, system_ud) :
     result["UAS"] = alignment_score(alignment, lambda w, ga : ga(w.parent))
   if "DEPREL" in col2index :
     result["LAS"] = alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[col2index["DEPREL"]]))
+  if "DEPREL" in col2index and "UPOS" in col2index and "FEATS" in col2index :
+    result["MLAS"] = alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[col2index["DEPREL"]], w.columns[col2index["UPOS"]], w.columns[col2index["FEATS"]], [(ga(c), c.columns[col2index["DEPREL"]], c.columns[col2index["UPOS"]], c.columns[col2index["FEATS"]]) for c in w.functional_children]), filter_fn=lambda w: w.is_content_deprel)
   if "ID" in col2index :
     result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences)
 
-- 
GitLab