From 9074e66fce7103fb43fbbdd94683cf3dcfa72dec Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Tue, 31 Mar 2020 17:20:52 +0200
Subject: [PATCH] Changed the way we train and evaluate

---
Review notes (this area, between the "---" line and the first "diff --git",
is not part of the commit message):

What the patch does
 * data/Makefile and data/getTransitionSets.py look for the shared scripts
   in ../../../../scripts, the Makefile searches $(CORPUS) instead of
   $(UD_ROOT), and every rm becomes rm -f.
 * prepareExperiment.sh (new): "language templateName expName". Copies the
   template directory and data/ into bin/expName and writes
   CORPUS=$UD_ROOT/language into bin/expName/config.
 * train.sh and evaluate.sh now take "(tsv | txt) expPath [arguments]",
   source expPath/config and use expPath/data/*.mcd; train.sh runs make in
   expPath/data before training.

Amendments made during review (added lines only, hunk sizes unchanged, so
the result differs from commit 9074e66)
 * evaluate.sh: "test ! -f" operands are quoted; unquoted, an empty find
   result turned the check into "test ! -f", which is false, and the
   missing-file error was never reported.
 * prepareExperiment.sh: the first argument is stored in CORPUSNAME rather
   than LANG, so the locale variable is not overwritten for the commands
   that follow; paths built from the arguments are quoted.
 * train.sh: the data build line ends with "|| exit 1" so a failed make no
   longer lets training start from inside expPath/data.
 * Not changed here: train.sh keeps the unquoted "test ! -f $TRAIN" on a
   context line; it has the same empty-operand problem.
 * Blank lines and indentation were restored from the hunk line counts.

 UD_any/data/Makefile             | 26 ++++++-------
 UD_any/data/getTransitionSets.py |  2 +-
 UD_any/evaluate.sh               | 67 +++++++++++++++++---------------
 UD_any/prepareExperiment.sh      | 43 ++++++++++++++++++++
 UD_any/train.sh                  | 66 +++++++++++--------------------
 5 files changed, 116 insertions(+), 88 deletions(-)
 create mode 100755 UD_any/prepareExperiment.sh

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 48ee4c0..b15a589 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -1,12 +1,12 @@
 include ../config
 
-SCRIPTS=../../scripts
+SCRIPTS=../../../../scripts
 CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
 MCD=conllu.mcd
 
-TRAIN_FILES=$(shell find $(UD_ROOT) -type f -name '*train*.conllu')
-DEV_FILES=$(shell find $(UD_ROOT) -type f -name '*dev*.conllu')
-TEST_FILES=$(shell find $(UD_ROOT) -type f -name '*test*.conllu')
+TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu')
+DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu')
+TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
 
 #This part is for lemmatizer rules and excpetions computation
 THRESHOLD=10
@@ -15,8 +15,8 @@ RULES_FILENAME=lemmatizer_rules.ts
 EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
 
 all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME)
-	rm col_*\.txt
-	rm all_no_test.conllu
+	rm -f col_*\.txt
+	rm -f all_no_test.conllu
 
 all_no_test.conllu:
 	cat $(TRAIN_FILES) > $@
@@ -47,14 +47,14 @@ $(FPLM_FILENAME): all_no_test.conllu $(MCD)
 
 $(RULES_FILENAME): $(FPLM_FILENAME)
 	macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD)
-	rm tmp.txt
+	rm -f tmp.txt
 	echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts
 
 clean:
-	- rm *\.txt
-	- rm *\.conll*
-	- rm *\.ts
-	- rm $(RULES_FILENAME)
-	- rm $(EXCEPTIONS_FPLM_FILENAME)
-	- rm $(FPLM_FILENAME)
+	- rm -f *\.txt
+	- rm -f *\.conll*
+	- rm -f *\.ts
+	- rm -f $(RULES_FILENAME)
+	- rm -f $(EXCEPTIONS_FPLM_FILENAME)
+	- rm -f $(FPLM_FILENAME)
 
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index ca01d68..6d06e7b 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -2,7 +2,7 @@
 
 import sys
 
-sys.path.insert(1, '../../scripts')
+sys.path.insert(1, '../../../../scripts')
 from readMCD import readMCD
 
 
diff --git a/UD_any/evaluate.sh b/UD_any/evaluate.sh
index e298885..9ef891a 100755
--- a/UD_any/evaluate.sh
+++ b/UD_any/evaluate.sh
@@ -1,76 +1,81 @@
 #! /bin/bash
 
-source config
-
 function has_space {
   [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
 }
 
 function print_usage_and_exit {
-  >&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]"
+  >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
   exit 1
 }
 
-MCD=data/conllu.mcd
 MODE=$1
-KEYWORD=$2
-EXPNAME=$3
+EXPPATH=$2
 
-if [ -z "$KEYWORD" ];
+if [ -z "$MODE" ];
 then
-  >&2 echo "ERROR : missing argument 1 (keyword)"
+  >&2 echo "ERROR : missing argument 1 (mode)"
   print_usage_and_exit
 fi
 
-if [ -z "$EXPNAME" ];
+if [ -z "$EXPPATH" ];
 then
-  >&2 echo "ERROR : missing argument 2 (expName)"
+  >&2 echo "ERROR : missing argument 2 (expPath)"
   print_usage_and_exit
 fi
 
-shift
 shift
 shift
 
-if [ "$KEYWORD" = "." ]
-then
-  KEYWORD=""
+if [ ! -d "$EXPPATH" ]; then
+  >&2 echo "ERROR : directory $EXPPATH doesn't exist"
+  print_usage_and_exit
 fi
 
-TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu')
-TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt')
-DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu')
-DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt')
-TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu')
-TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt')
+source $EXPPATH"/config"
 
-EVALTARGET=$TEST
-EVALTARGETRAW=$TESTRAW
+TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
+TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
+DEV=$(find $CORPUS -type f -name '*dev*.conllu')
+DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
+TEST=$(find $CORPUS -type f -name '*test*.conllu')
+TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
 
-if has_space "$EVALTARGET";
+REF=$TEST
+REFRAW=$TESTRAW
+
+if has_space "$REF" || has_space "$REFRAW";
 then
-  >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
-  >&2 echo "TEST : " $EVALTARGET
+  >&2 echo "ERROR : more than 1 match"
+  >&2 echo "REF : " $REF
+  >&2 echo "REFRAW : " $REFRAW
   print_usage_and_exit
 fi
 
-if test ! -f $EVALTARGET;
+if test ! -f "$REF";
+then
+  >&2 echo "ERROR : no ref file found in" $CORPUS
+  >&2 echo "$REF"
+  print_usage_and_exit
+fi
+if test ! -f "$REFRAW";
 then
-  >&2 echo "ERROR : no target file found with keyword" $KEYWORD
-  >&2 echo "$EVALTARGET"
+  >&2 echo "ERROR : no ref file found in" $CORPUS
+  >&2 echo "$REFRAW"
   print_usage_and_exit
 fi
 
+MCD=$EXPPATH"/data/*\.mcd"
 EVALCONLL="../scripts/conll18_ud_eval.py"
-OUTPUT=$EXPNAME"/predicted_eval.tsv"
+OUTPUT=$EXPPATH"/predicted_eval.tsv"
 
 if [ "$MODE" = "tsv" ]; then
-macaon decode --model $EXPNAME --mcd $MCD --inputTSV $EVALTARGET $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1
+macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon decode --model $EXPNAME --mcd $MCD --inputTXT $EVALTARGETRAW $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1
+macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
 exit 0
 fi
 
diff --git a/UD_any/prepareExperiment.sh b/UD_any/prepareExperiment.sh
new file mode 100755
index 0000000..4468aef
--- /dev/null
+++ b/UD_any/prepareExperiment.sh
@@ -0,0 +1,43 @@
+#! /bin/bash
+
+source config
+
+function print_usage_and_exit {
+  >&2 echo "USAGE : language templateName expName"
+  exit 1
+}
+
+CORPUSNAME=$1
+TEMPLATENAME=$2
+EXPNAME=$3
+
+if [ -z "$CORPUSNAME" ];
+then
+  >&2 echo "ERROR : missing argument 1 (lang)"
+  print_usage_and_exit
+fi
+
+if [ -z "$TEMPLATENAME" ];
+then
+  >&2 echo "ERROR : missing argument 2 (templateName)"
+  print_usage_and_exit
+fi
+
+if [ -z "$EXPNAME" ];
+then
+  >&2 echo "ERROR : missing argument 3 (expName)"
+  print_usage_and_exit
+fi
+
+
+if [ ! -d "$TEMPLATENAME" ]; then
+  >&2 echo "ERROR : directory $TEMPLATENAME doesn't exist"
+  print_usage_and_exit
+fi
+
+mkdir -p bin
+rm -rf "bin/$EXPNAME"
+cp -r "$TEMPLATENAME" "bin/$EXPNAME"
+cp -r "data" "bin/$EXPNAME/."
+echo "CORPUS=$UD_ROOT/$CORPUSNAME" > "bin/$EXPNAME/config"
+
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 0a6fbc2..265662b 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -1,56 +1,45 @@
 #! /bin/bash
 
-source config
-
 function has_space {
   [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
 }
 
 function print_usage_and_exit {
-  >&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]"
+  >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
   exit 1
 }
 
-MCD=data/conllu.mcd
 MODE=$1
-KEYWORD=$2
-TEMPLATENAME=$3
-EXPNAME=$4
-
-if [ -z "$KEYWORD" ];
-then
-  >&2 echo "ERROR : missing argument 1 (keyword)"
-  print_usage_and_exit
-fi
+EXPPATH=$2
 
-if [ -z "$TEMPLATENAME" ];
+if [ -z "$MODE" ];
 then
-  >&2 echo "ERROR : missing argument 2 (templateName)"
+  >&2 echo "ERROR : missing argument 1 (mode)"
   print_usage_and_exit
 fi
 
-if [ -z "$EXPNAME" ];
+if [ -z "$EXPPATH" ];
 then
-  >&2 echo "ERROR : missing argument 3 (expName)"
+  >&2 echo "ERROR : missing argument 2 (expPath)"
   print_usage_and_exit
 fi
 
-shift
-shift
 shift
 shift
 
-if [ "$KEYWORD" = "." ]
-then
-  KEYWORD=""
+if [ ! -d "$EXPPATH" ]; then
+  >&2 echo "ERROR : directory $EXPPATH doesn't exist"
+  print_usage_and_exit
 fi
 
-TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu')
-TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt')
-DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu')
-DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt')
-TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu')
-TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt')
+source $EXPPATH"/config"
+
+TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
+TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
+DEV=$(find $CORPUS -type f -name '*dev*.conllu')
+DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
+TEST=$(find $CORPUS -type f -name '*test*.conllu')
+TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
 
 if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
 then
@@ -63,32 +52,23 @@ fi
 
 if test ! -f $TRAIN;
 then
-  >&2 echo "ERROR : no train file found with keyword" $KEYWORD
+  >&2 echo "ERROR : no train file found in" $CORPUS
   >&2 echo "$TRAIN"
   print_usage_and_exit
 fi
 
 
-mkdir -p bin
-
-if [ ! -d "$TEMPLATENAME" ]; then
-  >&2 echo "ERROR : directory $TEMPLATENAME doesn't exist"
-  print_usage_and_exit
-fi
-
-rm -rf bin/$EXPNAME
-cp -r $TEMPLATENAME bin/$EXPNAME
-cp -r "data" bin/$EXPNAME/.
-
-EVALCONLL="../scripts/conll18_ud_eval.py"
+CURDIR=$(pwd)
+cd "$EXPPATH/data" && make -s clean && make -s && cd "$CURDIR" || exit 1
+MCD=$EXPPATH"/data/*\.mcd"
 
 if [ "$MODE" = "tsv" ]; then
-macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1
+macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1
 exit 0
 fi
 
-- 
GitLab