From 70f6b20ae6de0b4decced09d34f1f9e7839b623d Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Tue, 28 Jul 2020 17:56:01 +0200
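Subject: [PATCH] Using GloVe by default to pretrain word embeddings

Replace the word2vec-based pretrainEmbeddings.py with pretrainEmbeddings.sh,
which builds the GloVe submodule and uses it to train the embeddings.
Also report the MLAS metric in conll18_ud_eval.py.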

---
 .gitmodules                       |  3 ++
 GloVe                             |  1 +
 UD_any/data/Makefile              |  2 +-
 UD_any/data/pretrainEmbeddings.py | 51 -------------------------------
 UD_any/data/pretrainEmbeddings.sh | 25 +++++++++++++++
 UD_any/train.sh                   |  4 ++-
 scripts/conll18_ud_eval.py        |  2 ++
 7 files changed, 35 insertions(+), 53 deletions(-)
 create mode 100644 .gitmodules
 create mode 160000 GloVe
 delete mode 100755 UD_any/data/pretrainEmbeddings.py
 create mode 100755 UD_any/data/pretrainEmbeddings.sh

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..1298720
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "GloVe"]
+	path = GloVe
+	url = https://github.com/stanfordnlp/GloVe
diff --git a/GloVe b/GloVe
new file mode 160000
index 0000000..5187fa8
--- /dev/null
+++ b/GloVe
@@ -0,0 +1 @@
+Subproject commit 5187fa82f35348dab5d0d9d9af1ad70449a427d2
diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index f8896e0..d5082f5 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -48,7 +48,7 @@ texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
 pretrain:
-	./pretrainEmbeddings.py $(TRAIN_FILES) 64 pretrained.w2v
+	./pretrainEmbeddings.sh $(TRAIN_FILES) 64 pretrained.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 )
 
 $(FPLM_FILENAME): all_no_test.conllu
 	$(SCRIPTS)/conllu2fplm.py $< > $@
diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
deleted file mode 100755
index 7e1d9f1..0000000
--- a/UD_any/data/pretrainEmbeddings.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#! /usr/bin/env python3
-
-import sys
-import os
-import subprocess
-from shutil import which
-
-def printUsageAndExit() :
-  print("USAGE : %s file.conllu embeddingsSize outputFile"%sys.argv[0], file=sys.stderr)
-  exit(1)
-
-if __name__ == "__main__" :
-  if len(sys.argv) != 4 :
-    printUsageAndExit()
-
-  pathToFile = sys.argv[1]
-  embeddingsSize = int(sys.argv[2])
-  splited = os.path.splitext(pathToFile)
-  target = sys.argv[3]
-
-  if which("word2vec") is None :
-    print("word2vec not installed")
-    exit(0)
-
-  if which("udpipe") is None :
-    print("udpipe not installed")
-    exit(0)
-
-  p = subprocess.Popen("udpipe --output=horizontal none %s > in.text"%pathToFile, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
-  p.wait()
-
-  args = [
-    "word2vec",
-    "-cbow 0",
-    "-size %s"%embeddingsSize,
-    "-window 10",
-    "-negative 5",
-    "-hs 0",
-    "-sample 1e-1",
-    "-threads 4",
-    "-binary 0",
-    "-iter 15",
-    "-min-count 2",
-    "-train in.text",
-    "-output %s"%target,
-    "&& rm in.text"
-  ]
-
-  p = subprocess.Popen(" ".join(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
-  p.wait()
-
diff --git a/UD_any/data/pretrainEmbeddings.sh b/UD_any/data/pretrainEmbeddings.sh
new file mode 100755
index 0000000..dc84f55
--- /dev/null
+++ b/UD_any/data/pretrainEmbeddings.sh
@@ -0,0 +1,40 @@
+#! /usr/bin/env bash
+#
+# Pretrain word embeddings on a CoNLL-U file with GloVe.
+#
+# USAGE : pretrainEmbeddings.sh input.conllu embeddingsSize output.w2v
+#
+# Steps: build the GloVe tools, dump the corpus as horizontal text with
+# udpipe, then chain vocab_count, cooccur, shuffle and glove. The resulting
+# text vectors (out.txt) are moved to output.w2v. Intermediate files are
+# removed whether or not the chain succeeded.
+#
+# NOTE(review): the script exits 0 even when a step fails, so callers cannot
+# detect a failed pretraining from the exit status - confirm this
+# best-effort behaviour is intended.
+
+# GloVe checkout, relative to the directory the script is run from.
+GLOVE="../../../../GloVe/"
+# Basename given to cooccur for its temporary overflow files.
+OVERFLOW="tempoverflow"
+
+if [ "$#" -ne 3 ]; then
+    >&2 echo "USAGE : $0 input.conllu embeddingsSize output.w2v"
+    exit 1
+fi
+
+# Build in a subshell so the working directory is unchanged afterwards, even when make fails.
+( cd "$GLOVE" && make ) \
+&& udpipe --output=horizontal none "$1" > in.text \
+&& "${GLOVE}build/vocab_count" -min-count 2 < in.text > vocab.txt \
+&& "${GLOVE}build/cooccur" -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file "$OVERFLOW" < in.text > cooccurrences.bin \
+&& "${GLOVE}build/shuffle" -memory 8.0 -seed 100 < cooccurrences.bin > cooccurrence.shuf.bin \
+&& "${GLOVE}build/glove" -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size "$2" -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
+&& mv out.txt "$3"
+
+# -f keeps rm silent for files that were never created.
+rm -f in.text vocab.txt cooccurrences.bin cooccurrence.shuf.bin
+# cooccur names its overflow files <basename>_NNNN.bin.
+rm -f "$OVERFLOW"_*.bin
+
+exit 0
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 064bbef..da5ac4a 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -32,7 +32,8 @@ if [ ! -d "$EXPPATH" ]; then
 fi
 
 CURDIR=$(pwd)
-cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR
+cd $EXPPATH"/"data && make -s clean && make -s
+cd $CURDIR
 
 TRAIN=$EXPPATH"/data/train.conllu"
 TRAINRAW=$EXPPATH"/data/train.txt"
@@ -43,6 +44,7 @@ TESTRAW=$EXPPATH"/data/test.txt"
 
 if test ! -f $TRAIN;
 then
+  pwd
   >&2 echo "ERROR : no train file found in" $EXPPATH
   >&2 echo "$TRAIN"
   print_usage_and_exit
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index 7d1f1ff..d57179a 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -527,6 +527,8 @@ def evaluate(gold_ud, system_ud) :
     result["UAS"] = alignment_score(alignment, lambda w, ga : ga(w.parent))
   if "DEPREL" in col2index :
     result["LAS"] = alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[col2index["DEPREL"]]))
+  if "DEPREL" in col2index and "UPOS" in col2index and "FEATS" in col2index :
+    result["MLAS"] = alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[col2index["DEPREL"]], w.columns[col2index["UPOS"]], w.columns[col2index["FEATS"]], [(ga(c), c.columns[col2index["DEPREL"]], c.columns[col2index["UPOS"]], c.columns[col2index["FEATS"]]) for c in w.functional_children]), filter_fn=lambda w: w.is_content_deprel)
   if "ID" in col2index :
     result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences)
 
-- 
GitLab