From 9074e66fce7103fb43fbbdd94683cf3dcfa72dec Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Tue, 31 Mar 2020 17:20:52 +0200
Subject: [PATCH] Changed the way we train and evaluate

---
 UD_any/data/Makefile             | 26 ++++++-------
 UD_any/data/getTransitionSets.py |  2 +-
 UD_any/evaluate.sh               | 67 +++++++++++++++++---------------
 UD_any/prepareExperiment.sh      | 43 ++++++++++++++++++++
 UD_any/train.sh                  | 66 +++++++++++--------------------
 5 files changed, 116 insertions(+), 88 deletions(-)
 create mode 100755 UD_any/prepareExperiment.sh

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 48ee4c0..b15a589 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -1,12 +1,12 @@
 include ../config
 
-SCRIPTS=../../scripts
+SCRIPTS=../../../../scripts
 CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
 MCD=conllu.mcd
 
-TRAIN_FILES=$(shell find $(UD_ROOT) -type f -name '*train*.conllu')
-DEV_FILES=$(shell find $(UD_ROOT) -type f -name '*dev*.conllu')
-TEST_FILES=$(shell find $(UD_ROOT) -type f -name '*test*.conllu')
+TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu')
+DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu')
+TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
 
 #This part is for lemmatizer rules and excpetions computation
 THRESHOLD=10
@@ -15,8 +15,8 @@ RULES_FILENAME=lemmatizer_rules.ts
 EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
 
 all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME)
-	rm col_*\.txt
-	rm all_no_test.conllu
+	rm -f col_*\.txt
+	rm -f all_no_test.conllu
 
 all_no_test.conllu:
 	cat $(TRAIN_FILES) > $@
@@ -47,14 +47,14 @@ $(FPLM_FILENAME): all_no_test.conllu $(MCD)
 
 $(RULES_FILENAME): $(FPLM_FILENAME)
 	macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD)
-	rm tmp.txt
-	echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts
+	rm -f tmp.txt
+	printf 'Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA\n' > lemmatizer_case.ts # printf: 'echo -e' is not portable across /bin/sh implementations
 
 clean:
-	- rm *\.txt
-	- rm *\.conll*
-	- rm *\.ts
-	- rm $(RULES_FILENAME)
-	- rm $(EXCEPTIONS_FPLM_FILENAME)
-	- rm $(FPLM_FILENAME)
+	- rm -f *\.txt
+	- rm -f *\.conll*
+	- rm -f *\.ts
+	- rm -f $(RULES_FILENAME)
+	- rm -f $(EXCEPTIONS_FPLM_FILENAME)
+	- rm -f $(FPLM_FILENAME)
 
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index ca01d68..6d06e7b 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -2,7 +2,7 @@
 
 import sys
 
-sys.path.insert(1, '../../scripts')
+sys.path.insert(1, '../../../../scripts')
 
 from readMCD import readMCD
 
diff --git a/UD_any/evaluate.sh b/UD_any/evaluate.sh
index e298885..9ef891a 100755
--- a/UD_any/evaluate.sh
+++ b/UD_any/evaluate.sh
@@ -1,76 +1,81 @@
 #! /bin/bash
 
-source config
-
 function has_space {
   [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
 }
 
 function print_usage_and_exit {
-  >&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]"
+  >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
   exit 1
 }
 
-MCD=data/conllu.mcd
 MODE=$1
-KEYWORD=$2
-EXPNAME=$3
+EXPPATH=$2
 
-if [ -z "$KEYWORD" ];
+if [ -z "$MODE" ];
 then
-  >&2 echo "ERROR : missing argument 1 (keyword)"
+  >&2 echo "ERROR : missing argument 1 (mode)"
   print_usage_and_exit
 fi
 
-if [ -z "$EXPNAME" ];
+if [ -z "$EXPPATH" ];
 then
-  >&2 echo "ERROR : missing argument 2 (expName)"
+  >&2 echo "ERROR : missing argument 2 (expPath)"
   print_usage_and_exit
 fi
 
-shift
 shift
 shift
 
-if [ "$KEYWORD" = "." ]
-then
-  KEYWORD=""
+if [ ! -d "$EXPPATH" ]; then
+  >&2 echo "ERROR : directory $EXPPATH doesn't exist"
+  print_usage_and_exit
 fi
 
-TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu')
-TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt')
-DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu')
-DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt')
-TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu')
-TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt')
+source $EXPPATH"/config"
 
-EVALTARGET=$TEST
-EVALTARGETRAW=$TESTRAW
+TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
+TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
+DEV=$(find $CORPUS -type f -name '*dev*.conllu')
+DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
+TEST=$(find $CORPUS -type f -name '*test*.conllu')
+TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
 
-if has_space "$EVALTARGET";
+REF=$TEST
+REFRAW=$TESTRAW
+
+if has_space "$REF" || has_space "$REFRAW";
 then
-  >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
-  >&2 echo "TEST : " $EVALTARGET
+  >&2 echo "ERROR : more than 1 match"
+  >&2 echo "REF : " $REF
+  >&2 echo "REFRAW : " $REFRAW
   print_usage_and_exit
 fi
 
-if test ! -f $EVALTARGET;
+if test ! -f "$REF"; # quoted: with an empty $REF the unquoted form is 'test ! -f', which is false
+then
+  >&2 echo "ERROR : no ref file found in" $CORPUS
+  >&2 echo "$REF"
+  print_usage_and_exit
+fi
+if test ! -f "$REFRAW";
 then
-  >&2 echo "ERROR : no target file found with keyword" $KEYWORD
-  >&2 echo "$EVALTARGET"
+  >&2 echo "ERROR : no ref file found in" $CORPUS
+  >&2 echo "$REFRAW"
   print_usage_and_exit
 fi
 
+MCD=$EXPPATH"/data/*\.mcd"
 EVALCONLL="../scripts/conll18_ud_eval.py"
-OUTPUT=$EXPNAME"/predicted_eval.tsv"
+OUTPUT=$EXPPATH"/predicted_eval.tsv"
 
 if [ "$MODE" = "tsv" ]; then
-macaon decode --model $EXPNAME --mcd $MCD --inputTSV $EVALTARGET $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1
+macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon decode --model $EXPNAME --mcd $MCD --inputTXT $EVALTARGETRAW $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1
+macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
 exit 0
 fi
 
diff --git a/UD_any/prepareExperiment.sh b/UD_any/prepareExperiment.sh
new file mode 100755
index 0000000..4468aef
--- /dev/null
+++ b/UD_any/prepareExperiment.sh
@@ -0,0 +1,43 @@
+#! /bin/bash
+# Creates bin/<expName> from a template directory plus data/, and records the corpus location in its config.
+source config
+
+function print_usage_and_exit {
+  >&2 echo "USAGE : language templateName expName"
+  exit 1
+}
+# The language is not stored in LANG: that name is the locale environment variable read by every child process.
+LANGNAME=$1
+TEMPLATENAME=$2
+EXPNAME=$3
+
+if [ -z "$LANGNAME" ];
+then
+  >&2 echo "ERROR : missing argument 1 (lang)"
+  print_usage_and_exit
+fi
+
+if [ -z "$TEMPLATENAME" ];
+then
+  >&2 echo "ERROR : missing argument 2 (templateName)"
+  print_usage_and_exit
+fi
+
+if [ -z "$EXPNAME" ];
+then
+  >&2 echo "ERROR : missing argument 3 (expName)"
+  print_usage_and_exit
+fi
+
+# Check the template before touching bin/ so a bad argument never deletes an existing experiment.
+if [ ! -d "$TEMPLATENAME" ]; then
+  >&2 echo "ERROR : directory $TEMPLATENAME doesn't exist"
+  print_usage_and_exit
+fi
+
+mkdir -p bin || exit 1
+rm -rf "bin/$EXPNAME"
+cp -r "$TEMPLATENAME" "bin/$EXPNAME" || exit 1
+cp -r "data" "bin/$EXPNAME/." || exit 1
+echo "CORPUS=$UD_ROOT/$LANGNAME" > "bin/$EXPNAME/config"
+
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 0a6fbc2..265662b 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -1,56 +1,45 @@
 #! /bin/bash
 
-source config
-
 function has_space {
   [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
 }
 
 function print_usage_and_exit {
-  >&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]"
+  >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
   exit 1
 }
 
-MCD=data/conllu.mcd
 MODE=$1
-KEYWORD=$2
-TEMPLATENAME=$3
-EXPNAME=$4
-
-if [ -z "$KEYWORD" ];
-then
-  >&2 echo "ERROR : missing argument 1 (keyword)"
-  print_usage_and_exit
-fi
+EXPPATH=$2
 
-if [ -z "$TEMPLATENAME" ];
+if [ -z "$MODE" ];
 then
-  >&2 echo "ERROR : missing argument 2 (templateName)"
+  >&2 echo "ERROR : missing argument 1 (mode)"
   print_usage_and_exit
 fi
 
-if [ -z "$EXPNAME" ];
+if [ -z "$EXPPATH" ];
 then
-  >&2 echo "ERROR : missing argument 3 (expName)"
+  >&2 echo "ERROR : missing argument 2 (expPath)"
   print_usage_and_exit
 fi
 
-shift
-shift
 shift
 shift
 
-if [ "$KEYWORD" = "." ]
-then
-  KEYWORD=""
+if [ ! -d "$EXPPATH" ]; then
+  >&2 echo "ERROR : directory $EXPPATH doesn't exist"
+  print_usage_and_exit
 fi
 
-TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu')
-TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt')
-DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu')
-DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt')
-TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu')
-TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt')
+source $EXPPATH"/config"
+
+TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
+TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
+DEV=$(find $CORPUS -type f -name '*dev*.conllu')
+DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
+TEST=$(find $CORPUS -type f -name '*test*.conllu')
+TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
 
 if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
 then
@@ -63,32 +52,23 @@ fi
 
 if test ! -f $TRAIN;
 then
-  >&2 echo "ERROR : no train file found with keyword" $KEYWORD
+  >&2 echo "ERROR : no train file found in" $CORPUS
   >&2 echo "$TRAIN"
   print_usage_and_exit
 fi
 
-mkdir -p bin
-
-if [ ! -d "$TEMPLATENAME" ]; then
-  >&2 echo "ERROR : directory $TEMPLATENAME doesn't exist"
-  print_usage_and_exit
-fi
-
-rm -rf bin/$EXPNAME
-cp -r $TEMPLATENAME bin/$EXPNAME
-cp -r "data" bin/$EXPNAME/.
-
-EVALCONLL="../scripts/conll18_ud_eval.py"
+CURDIR=$(pwd) # the subshell below leaves this shell in $CURDIR whether make succeeds or fails
+(cd "$EXPPATH/data" && make -s clean && make -s) || { >&2 echo "ERROR : data generation failed in $EXPPATH/data"; exit 1; }
 
+MCD=$EXPPATH"/data/*\.mcd"
 
 if [ "$MODE" = "tsv" ]; then
-macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1
 exit 0
 fi
 
-- 
GitLab