diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile index 48ee4c02c39197d054b7b08cbb66b887172caa2f..b15a589833ef964cc34d302da4e3163e68c31e76 100644 --- a/UD_any/data/Makefile +++ b/UD_any/data/Makefile @@ -1,12 +1,12 @@ include ../config -SCRIPTS=../../scripts +SCRIPTS=../../../../scripts CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl MCD=conllu.mcd -TRAIN_FILES=$(shell find $(UD_ROOT) -type f -name '*train*.conllu') -DEV_FILES=$(shell find $(UD_ROOT) -type f -name '*dev*.conllu') -TEST_FILES=$(shell find $(UD_ROOT) -type f -name '*test*.conllu') +TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu') +DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu') +TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu') #This part is for lemmatizer rules and excpetions computation THRESHOLD=10 @@ -15,8 +15,8 @@ RULES_FILENAME=lemmatizer_rules.ts EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME) - rm col_*\.txt - rm all_no_test.conllu + rm -f col_*\.txt + rm -f all_no_test.conllu all_no_test.conllu: cat $(TRAIN_FILES) > $@ @@ -47,14 +47,14 @@ $(FPLM_FILENAME): all_no_test.conllu $(MCD) $(RULES_FILENAME): $(FPLM_FILENAME) macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD) - rm tmp.txt + rm -f tmp.txt echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts clean: - - rm *\.txt - - rm *\.conll* - - rm *\.ts - - rm $(RULES_FILENAME) - - rm $(EXCEPTIONS_FPLM_FILENAME) - - rm $(FPLM_FILENAME) + - rm -f *\.txt + - rm -f *\.conll* + - rm -f *\.ts + - rm -f $(RULES_FILENAME) + - rm -f $(EXCEPTIONS_FPLM_FILENAME) + - rm -f $(FPLM_FILENAME) diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py index ca01d68dfc8b1f47a9d6326bd5c5da916089210b..6d06e7b78a346caca5cf0dce725affecfa2ef582 100755 --- a/UD_any/data/getTransitionSets.py +++ b/UD_any/data/getTransitionSets.py @@ -2,7 +2,7 @@ import sys -sys.path.insert(1, '../../scripts') +sys.path.insert(1, '../../../../scripts') from readMCD import readMCD diff --git a/UD_any/evaluate.sh b/UD_any/evaluate.sh index e2988856b4f50788ec0e9feca8cd0b81859a8c5b..9ef891ae11e046010bf0ec77d7a027883ff2818b 100755 --- a/UD_any/evaluate.sh +++ b/UD_any/evaluate.sh @@ -1,76 +1,81 @@ #! /bin/bash -source config - function has_space { [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1 } function print_usage_and_exit { - >&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]" + >&2 echo "USAGE : (tsv | txt) expPath [arguments]" exit 1 } -MCD=data/conllu.mcd MODE=$1 -KEYWORD=$2 -EXPNAME=$3 +EXPPATH=$2 -if [ -z "$KEYWORD" ]; +if [ -z "$MODE" ]; then - >&2 echo "ERROR : missing argument 1 (keyword)" + >&2 echo "ERROR : missing argument 1 (mode)" print_usage_and_exit fi -if [ -z "$EXPNAME" ]; +if [ -z "$EXPPATH" ]; then - >&2 echo "ERROR : missing argument 2 (expName)" + >&2 echo "ERROR : missing argument 2 (expPath)" print_usage_and_exit fi -shift shift shift -if [ "$KEYWORD" = "." ] -then - KEYWORD="" +if [ ! -d "$EXPPATH" ]; then + >&2 echo "ERROR : directory $EXPPATH doesn't exist" + print_usage_and_exit fi -TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu') -TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt') -DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu') -DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt') -TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu') -TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt') +source $EXPPATH"/config" -EVALTARGET=$TEST -EVALTARGETRAW=$TESTRAW +TRAIN=$(find $CORPUS -type f -name '*train*.conllu') +TRAINRAW=$(find $CORPUS -type f -name '*train*.txt') +DEV=$(find $CORPUS -type f -name '*dev*.conllu') +DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') +TEST=$(find $CORPUS -type f -name '*test*.conllu') +TESTRAW=$(find $CORPUS -type f -name '*test*.txt') -if has_space "$EVALTARGET"; +REF=$TEST +REFRAW=$TESTRAW + +if has_space "$REF" || has_space "$REFRAW"; then - >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD - >&2 echo "TEST : " $EVALTARGET + >&2 echo "ERROR : more than 1 match" + >&2 echo "REF : " $REF + >&2 echo "REFRAW : " $REFRAW print_usage_and_exit fi -if test ! -f $EVALTARGET; +if test ! -f $REF; +then + >&2 echo "ERROR : no ref file found in" $CORPUS + >&2 echo "$REF" + print_usage_and_exit +fi +if test ! -f $REFRAW; then - >&2 echo "ERROR : no target file found with keyword" $KEYWORD - >&2 echo "$EVALTARGET" + >&2 echo "ERROR : no ref file found in" $CORPUS + >&2 echo "$REFRAW" print_usage_and_exit fi +MCD=$EXPPATH"/data/*\.mcd" EVALCONLL="../scripts/conll18_ud_eval.py" -OUTPUT=$EXPNAME"/predicted_eval.tsv" +OUTPUT=$EXPPATH"/predicted_eval.tsv" if [ "$MODE" = "tsv" ]; then -macaon decode --model $EXPNAME --mcd $MCD --inputTSV $EVALTARGET $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1 +macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1 exit 0 fi if [ "$MODE" = "txt" ]; then -macaon decode --model $EXPNAME --mcd $MCD --inputTXT $EVALTARGETRAW $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1 +macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1 exit 0 fi diff --git a/UD_any/prepareExperiment.sh b/UD_any/prepareExperiment.sh new file mode 100755 index 0000000000000000000000000000000000000000..4468aef8e1f34eb75613d2fb13d7cb1874e581d9 --- /dev/null +++ b/UD_any/prepareExperiment.sh @@ -0,0 +1,43 @@ +#! /bin/bash + +source config + +function print_usage_and_exit { + >&2 echo "USAGE : language templateName expName" + exit 1 +} + +LANG=$1 +TEMPLATENAME=$2 +EXPNAME=$3 + +if [ -z "$LANG" ]; +then + >&2 echo "ERROR : missing argument 1 (lang)" + print_usage_and_exit +fi + +if [ -z "$TEMPLATENAME" ]; +then + >&2 echo "ERROR : missing argument 2 (templateName)" + print_usage_and_exit +fi + +if [ -z "$EXPNAME" ]; +then + >&2 echo "ERROR : missing argument 3 (expName)" + print_usage_and_exit +fi + + +if [ ! -d "$TEMPLATENAME" ]; then + >&2 echo "ERROR : directory $TEMPLATENAME doesn't exist" + print_usage_and_exit +fi + +mkdir -p bin +rm -rf bin/$EXPNAME +cp -r $TEMPLATENAME bin/$EXPNAME +cp -r "data" bin/$EXPNAME/. +echo "CORPUS="$UD_ROOT"/"$LANG > bin/$EXPNAME/config + diff --git a/UD_any/train.sh b/UD_any/train.sh index 0a6fbc2bb776083bd85921927384ad312b415880..265662b3884225d1e8f0517a8925cc7122261d5a 100755 --- a/UD_any/train.sh +++ b/UD_any/train.sh @@ -1,56 +1,45 @@ #! /bin/bash -source config - function has_space { [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1 } function print_usage_and_exit { - >&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]" + >&2 echo "USAGE : (tsv | txt) expPath [arguments]" exit 1 } -MCD=data/conllu.mcd MODE=$1 -KEYWORD=$2 -TEMPLATENAME=$3 -EXPNAME=$4 - -if [ -z "$KEYWORD" ]; -then - >&2 echo "ERROR : missing argument 1 (keyword)" - print_usage_and_exit -fi +EXPPATH=$2 -if [ -z "$TEMPLATENAME" ]; +if [ -z "$MODE" ]; then - >&2 echo "ERROR : missing argument 2 (templateName)" + >&2 echo "ERROR : missing argument 1 (mode)" print_usage_and_exit fi -if [ -z "$EXPNAME" ]; +if [ -z "$EXPPATH" ]; then - >&2 echo "ERROR : missing argument 3 (expName)" + >&2 echo "ERROR : missing argument 2 (expPath)" print_usage_and_exit fi -shift -shift shift shift -if [ "$KEYWORD" = "." ] -then - KEYWORD="" +if [ ! -d "$EXPPATH" ]; then + >&2 echo "ERROR : directory $EXPPATH doesn't exist" + print_usage_and_exit fi -TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu') -TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt') -DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu') -DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt') -TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu') -TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt') +source $EXPPATH"/config" + +TRAIN=$(find $CORPUS -type f -name '*train*.conllu') +TRAINRAW=$(find $CORPUS -type f -name '*train*.txt') +DEV=$(find $CORPUS -type f -name '*dev*.conllu') +DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') +TEST=$(find $CORPUS -type f -name '*test*.conllu') +TESTRAW=$(find $CORPUS -type f -name '*test*.txt') if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; then @@ -63,32 +52,23 @@ fi if test ! -f $TRAIN; then - >&2 echo "ERROR : no train file found with keyword" $KEYWORD + >&2 echo "ERROR : no train file found in" $CORPUS >&2 echo "$TRAIN" print_usage_and_exit fi -mkdir -p bin - -if [ ! -d "$TEMPLATENAME" ]; then - >&2 echo "ERROR : directory $TEMPLATENAME doesn't exist" - print_usage_and_exit -fi - -rm -rf bin/$EXPNAME -cp -r $TEMPLATENAME bin/$EXPNAME -cp -r "data" bin/$EXPNAME/. - -EVALCONLL="../scripts/conll18_ud_eval.py" +CURDIR=$(pwd) +cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR +MCD=$EXPPATH"/data/*\.mcd" if [ "$MODE" = "tsv" ]; then -macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1 +macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1 exit 0 fi if [ "$MODE" = "txt" ]; then -macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1 +macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1 exit 0 fi