From 26f25b222e4ee73cfcca40952d9a7b95cfeb68b0 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 10 Jun 2020 10:49:00 +0200
Subject: [PATCH] Corpora are now copied into exp dir to avoid conflicts
 between multiple experiments

---
 UD_any/data/Makefile              | 13 +++-----
 UD_any/data/pretrainEmbeddings.py |  6 ++--
 UD_any/evaluate.sh                | 26 ++++-----------
 UD_any/prepareExperiment.sh       | 32 ++++++++++++++++---
 UD_any/train.sh                   | 53 +++++++++++++++----------------
 5 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 37640dd..5e8ee60 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -1,11 +1,9 @@
-include ../config
-
 SCRIPTS=../../../../scripts
 CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
 
-TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu')
-DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu')
-TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
+TRAIN_FILES=$(shell find . -type f -name '*train*.conllu')
+DEV_FILES=$(shell find . -type f -name '*dev*.conllu')
+TEST_FILES=$(shell find . -type f -name '*test*.conllu')
 
 #This part is for lemmatizer rules and excpetions computation
 THRESHOLD=10
@@ -45,14 +43,13 @@ texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
 pretrain: texts
-	./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64
+	./pretrainEmbeddings.py train.txt 64 pretrained.w2v
 
 $(FPLM_FILENAME): all_no_test.conllu
 	$(SCRIPTS)/conllu2fplm.py $< > $@
 
 clean:
-	- rm -f *\.txt
-	- rm -f *\.conll*
 	- rm -f *\.ts
+	- rm -f ambiguities\.txt
 	- rm -f $(FPLM_FILENAME)
 
diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py
index d9a7f6d..11a777f 100755
--- a/UD_any/data/pretrainEmbeddings.py
+++ b/UD_any/data/pretrainEmbeddings.py
@@ -6,17 +6,17 @@ import subprocess
 from shutil import which
 
 def printUsageAndExit() :
-  print("USAGE : %s file.conllu embeddingsSize"%sys.argv[0], file=sys.stderr)
+  print("USAGE : %s file.conllu embeddingsSize outputFile"%sys.argv[0], file=sys.stderr)
   exit(1)
 
 if __name__ == "__main__" :
-  if len(sys.argv) != 3 :
+  if len(sys.argv) != 4 :
     printUsageAndExit()
 
   pathToFile = sys.argv[1]
   embeddingsSize = int(sys.argv[2])
   splited = os.path.splitext(pathToFile)
-  target = splited[0] + ".w2v"
+  target = sys.argv[3]
 
   if which("word2vec") is None :
     exit(0)
diff --git a/UD_any/evaluate.sh b/UD_any/evaluate.sh
index 4fcaa38..8b8bdda 100755
--- a/UD_any/evaluate.sh
+++ b/UD_any/evaluate.sh
@@ -1,9 +1,5 @@
 #! /usr/bin/env bash
 
-function has_space {
-  [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
-}
-
 function print_usage_and_exit {
   >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
   exit 1
@@ -32,26 +28,16 @@ if [ ! -d "$EXPPATH" ]; then
   print_usage_and_exit
 fi
 
-source $EXPPATH"/config"
-
-TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
-TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
-DEV=$(find $CORPUS -type f -name '*dev*.conllu')
-DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
-TEST=$(find $CORPUS -type f -name '*test*.conllu')
-TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
+TRAIN=$EXPPATH"/data/train.conllu"
+TRAINRAW=$EXPPATH"/data/train.txt"
+DEV=$EXPPATH"/data/dev.conllu"
+DEVRAW=$EXPPATH"/data/dev.txt"
+TEST=$EXPPATH"/data/test.conllu"
+TESTRAW=$EXPPATH"/data/test.txt"
 
 REF=$TEST
 REFRAW=$TESTRAW
 
-if has_space "$REF" || has_space "$REFRAW";
-then
-  >&2 echo "ERROR : more than 1 match"
-  >&2 echo "REF : " $REF
-  >&2 echo "REFRAW : " $REFRAW
-  print_usage_and_exit
-fi
-
 if test ! -f $REF;
 then
   >&2 echo "ERROR : no ref file found in" $CORPUS
diff --git a/UD_any/prepareExperiment.sh b/UD_any/prepareExperiment.sh
index e89c54a..bcd32dd 100755
--- a/UD_any/prepareExperiment.sh
+++ b/UD_any/prepareExperiment.sh
@@ -7,6 +7,10 @@ function print_usage_and_exit {
   exit 1
 }
 
+function has_space {
+  [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
+}
+
 LANG=$1
 TEMPLATENAME=$2
 EXPNAME=$3
@@ -41,12 +45,32 @@ if [ ! -d "$CORPUS" ]; then
   print_usage_and_exit
 fi
 
-mkdir -p bin
+TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
+DEV=$(find $CORPUS -type f -name '*dev*.conllu')
+TEST=$(find $CORPUS -type f -name '*test*.conllu')
+
+if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
+then
+  >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
+  >&2 echo "TRAIN : " $TRAIN
+  >&2 echo "DEV : " $DEV
+  >&2 echo "TEST : " $TEST
+  print_usage_and_exit
+fi
 
+mkdir -p bin
 
 if [ ! -d "bin/$EXPNAME" ]; then
-cp -r $TEMPLATENAME bin/$EXPNAME
-cp -r "data" bin/$EXPNAME/.
-echo "CORPUS="$CORPUS > bin/$EXPNAME/config
+	cp -r $TEMPLATENAME bin/$EXPNAME
+	cp -r "data" bin/$EXPNAME/.
+	if [ -f "$TRAIN" ]; then
+		cp $TRAIN bin/$EXPNAME/data/train.conllu
+	fi
+	if [ -f "$DEV" ]; then
+		cp $DEV bin/$EXPNAME/data/dev.conllu
+	fi
+	if [ -f "$TEST" ]; then
+		cp $TEST bin/$EXPNAME/data/test.conllu
+	fi
 fi
 
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 3bc401b..88c9112 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -1,9 +1,5 @@
 #! /usr/bin/env bash
 
-function has_space {
-  [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
-}
-
 function print_usage_and_exit {
   >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
   exit 1
@@ -12,6 +8,9 @@ function print_usage_and_exit {
 MODE=$1
 EXPPATH=$2
 
+>&2 echo "********************************************************************************"
+>&2 echo "Training : "$EXPPATH
+
 if [ -z "$MODE" ];
 then
   >&2 echo "ERROR : missing argument 1 (mode)"
@@ -32,42 +31,40 @@ if [ ! -d "$EXPPATH" ]; then
   print_usage_and_exit
 fi
 
-source $EXPPATH"/config"
-
 CURDIR=$(pwd)
 cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR
 
-TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
-TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
-DEV=$(find $CORPUS -type f -name '*dev*.conllu')
-DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
-TEST=$(find $CORPUS -type f -name '*test*.conllu')
-TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
-W2V=$(find $CORPUS -type f -name '*.w2v')
+TRAIN=$EXPPATH"/data/train.conllu"
+TRAINRAW=$EXPPATH"/data/train.txt"
+DEV=$EXPPATH"/data/dev.conllu"
+DEVRAW=$EXPPATH"/data/dev.txt"
+TEST=$EXPPATH"/data/test.conllu"
+TESTRAW=$EXPPATH"/data/test.txt"
+W2V=$EXPPATH"/data/pretrained.w2v"
 
-if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
+if test ! -f $TRAIN;
 then
-  >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
-  >&2 echo "TRAIN : " $TRAIN
-  >&2 echo "DEV : " $DEV
-  >&2 echo "TEST : " $TEST
+  >&2 echo "ERROR : no train file found in" $EXPPATH
+  >&2 echo "$TRAIN"
   print_usage_and_exit
 fi
 
-if test -z $TRAIN;
+if test ! -f $DEV;
 then
-  >&2 echo "ERROR : no train file found in" $CORPUS
-  >&2 echo "$TRAIN"
-  print_usage_and_exit
+	DEV=""
 fi
 
 if [ "$MODE" = "txt" ]; then
-if test -z $TRAINRAW;
-then
-  >&2 echo "ERROR : no train file found in" $CORPUS
-  >&2 echo "$TRAINRAW"
-  print_usage_and_exit
-fi
+	if test ! -f $TRAINRAW;
+	then
+	  >&2 echo "ERROR : no train file found in" $EXPPATH
+	  >&2 echo "$TRAINRAW"
+	  print_usage_and_exit
+	fi
+	if test ! -f $DEVRAW;
+	then
+		DEVRAW=""
+	fi
 fi
 
 if test -f $W2V;
-- 
GitLab