Skip to content
Snippets Groups Projects
Commit 9074e66f authored by Franck Dary's avatar Franck Dary
Browse files

Changed the way we train and evaluate

parent d0acd064
No related branches found
No related tags found
No related merge requests found
include ../config include ../config
SCRIPTS=../../scripts SCRIPTS=../../../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
MCD=conllu.mcd MCD=conllu.mcd
TRAIN_FILES=$(shell find $(UD_ROOT) -type f -name '*train*.conllu') TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu')
DEV_FILES=$(shell find $(UD_ROOT) -type f -name '*dev*.conllu') DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu')
TEST_FILES=$(shell find $(UD_ROOT) -type f -name '*test*.conllu') TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
#This part is for lemmatizer rules and exceptions computation #This part is for lemmatizer rules and exceptions computation
THRESHOLD=10 THRESHOLD=10
...@@ -15,8 +15,8 @@ RULES_FILENAME=lemmatizer_rules.ts ...@@ -15,8 +15,8 @@ RULES_FILENAME=lemmatizer_rules.ts
EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm EXCEPTIONS_FPLM_FILENAME=maca_trans_lemmatizer_exceptions.fplm
all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME) all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns $(FPLM_FILENAME) $(RULES_FILENAME)
rm col_*\.txt rm -f col_*\.txt
rm all_no_test.conllu rm -f all_no_test.conllu
all_no_test.conllu: all_no_test.conllu:
cat $(TRAIN_FILES) > $@ cat $(TRAIN_FILES) > $@
...@@ -47,14 +47,14 @@ $(FPLM_FILENAME): all_no_test.conllu $(MCD) ...@@ -47,14 +47,14 @@ $(FPLM_FILENAME): all_no_test.conllu $(MCD)
$(RULES_FILENAME): $(FPLM_FILENAME) $(RULES_FILENAME): $(FPLM_FILENAME)
macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD) macaon_compute_l_rules -f $(FPLM_FILENAME) -e $(EXCEPTIONS_FPLM_FILENAME) -r tmp.txt -t $(THRESHOLD)
rm tmp.txt rm -f tmp.txt
echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts echo -e "Default : NOTHING\nTOLOWER b.0 LEMMA\nTOUPPER b.0 LEMMA" > lemmatizer_case.ts
clean: clean:
- rm *\.txt - rm -f *\.txt
- rm *\.conll* - rm -f *\.conll*
- rm *\.ts - rm -f *\.ts
- rm $(RULES_FILENAME) - rm -f $(RULES_FILENAME)
- rm $(EXCEPTIONS_FPLM_FILENAME) - rm -f $(EXCEPTIONS_FPLM_FILENAME)
- rm $(FPLM_FILENAME) - rm -f $(FPLM_FILENAME)
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import sys import sys
sys.path.insert(1, '../../scripts') sys.path.insert(1, '../../../../scripts')
from readMCD import readMCD from readMCD import readMCD
......
#! /bin/bash #! /bin/bash
source config
function has_space { function has_space {
[[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1 [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
} }
function print_usage_and_exit { function print_usage_and_exit {
>&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]" >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
exit 1 exit 1
} }
MCD=data/conllu.mcd
MODE=$1 MODE=$1
KEYWORD=$2 EXPPATH=$2
EXPNAME=$3
if [ -z "$KEYWORD" ]; if [ -z "$MODE" ];
then then
>&2 echo "ERROR : missing argument 1 (keyword)" >&2 echo "ERROR : missing argument 1 (mode)"
print_usage_and_exit print_usage_and_exit
fi fi
if [ -z "$EXPNAME" ]; if [ -z "$EXPPATH" ];
then then
>&2 echo "ERROR : missing argument 2 (expName)" >&2 echo "ERROR : missing argument 2 (expPath)"
print_usage_and_exit print_usage_and_exit
fi fi
shift
shift shift
shift shift
if [ "$KEYWORD" = "." ] if [ ! -d "$EXPPATH" ]; then
then >&2 echo "ERROR : directory $EXPPATH doesn't exist"
KEYWORD="" print_usage_and_exit
fi fi
TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu') source $EXPPATH"/config"
TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt')
DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu')
DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt')
TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu')
TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt')
EVALTARGET=$TEST TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
EVALTARGETRAW=$TESTRAW TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
DEV=$(find $CORPUS -type f -name '*dev*.conllu')
DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
TEST=$(find $CORPUS -type f -name '*test*.conllu')
TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
if has_space "$EVALTARGET"; REF=$TEST
REFRAW=$TESTRAW
if has_space "$REF" || has_space "$REFRAW";
then then
>&2 echo "ERROR : more than 1 match with keyword" $KEYWORD >&2 echo "ERROR : more than 1 match"
>&2 echo "TEST : " $EVALTARGET >&2 echo "REF : " $REF
>&2 echo "REFRAW : " $REFRAW
print_usage_and_exit print_usage_and_exit
fi fi
if test ! -f $EVALTARGET; if test ! -f $REF;
then
>&2 echo "ERROR : no ref file found in" $CORPUS
>&2 echo "$REF"
print_usage_and_exit
fi
if test ! -f $REFRAW;
then then
>&2 echo "ERROR : no target file found with keyword" $KEYWORD >&2 echo "ERROR : no ref file found in" $CORPUS
>&2 echo "$EVALTARGET" >&2 echo "$REFRAW"
print_usage_and_exit print_usage_and_exit
fi fi
MCD=$EXPPATH"/data/*\.mcd"
EVALCONLL="../scripts/conll18_ud_eval.py" EVALCONLL="../scripts/conll18_ud_eval.py"
OUTPUT=$EXPNAME"/predicted_eval.tsv" OUTPUT=$EXPPATH"/predicted_eval.tsv"
if [ "$MODE" = "tsv" ]; then if [ "$MODE" = "tsv" ]; then
macaon decode --model $EXPNAME --mcd $MCD --inputTSV $EVALTARGET $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1 macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
exit 0 exit 0
fi fi
if [ "$MODE" = "txt" ]; then if [ "$MODE" = "txt" ]; then
macaon decode --model $EXPNAME --mcd $MCD --inputTXT $EVALTARGETRAW $@ > $OUTPUT && $EVALCONLL $EVALTARGET $OUTPUT -v || exit 1 macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT -v || exit 1
exit 0 exit 0
fi fi
......
#! /bin/bash
#
# Create a fresh experiment directory bin/<expName> from a template directory.
#
# Arguments:
#   1. language     : appended to $UD_ROOT (after a "/") to form the corpus path
#   2. templateName : existing directory that is copied to bin/<expName>
#   3. expName      : name of the experiment directory created under bin/
#
# Effects: any previous bin/<expName> is deleted, the template and the local
# "data" directory are copied into it, and bin/<expName>/config is written
# with a single CORPUS=<path> line.

# UD_ROOT is read below; presumably it is defined by this config file
# (its content is not visible here -- TODO confirm).
source config

# Print the expected command line on stderr and abort with status 1.
function print_usage_and_exit {
    >&2 echo "USAGE : language templateName expName"
    exit 1
}

# Deliberately NOT stored in a variable called LANG: LANG is the locale
# environment variable, so assigning the language name to it would change
# the locale of every command run afterwards (mkdir, rm, cp) and can make
# bash emit "setlocale: cannot change locale" warnings.
CORPUSLANG=$1
TEMPLATENAME=$2
EXPNAME=$3

if [ -z "$CORPUSLANG" ]; then
    >&2 echo "ERROR : missing argument 1 (lang)"
    print_usage_and_exit
fi

if [ -z "$TEMPLATENAME" ]; then
    >&2 echo "ERROR : missing argument 2 (templateName)"
    print_usage_and_exit
fi

if [ -z "$EXPNAME" ]; then
    >&2 echo "ERROR : missing argument 3 (expName)"
    print_usage_and_exit
fi

if [ ! -d "$TEMPLATENAME" ]; then
    >&2 echo "ERROR : directory $TEMPLATENAME doesn't exist"
    print_usage_and_exit
fi

# All expansions are quoted so that names containing whitespace or glob
# characters are neither split nor expanded (this matters most for rm -rf).
mkdir -p bin
rm -rf "bin/$EXPNAME"
cp -r "$TEMPLATENAME" "bin/$EXPNAME"
cp -r "data" "bin/$EXPNAME/."

# The value is written without surrounding quotes, exactly as before, so
# that existing readers of this config file keep parsing it the same way.
echo "CORPUS=$UD_ROOT/$CORPUSLANG" > "bin/$EXPNAME/config"
#! /bin/bash #! /bin/bash
source config
function has_space { function has_space {
[[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1 [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1
} }
function print_usage_and_exit { function print_usage_and_exit {
>&2 echo "USAGE : (tsv | txt) language_keyword templateName expName [arguments]" >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
exit 1 exit 1
} }
MCD=data/conllu.mcd
MODE=$1 MODE=$1
KEYWORD=$2 EXPPATH=$2
TEMPLATENAME=$3
EXPNAME=$4
if [ -z "$KEYWORD" ];
then
>&2 echo "ERROR : missing argument 1 (keyword)"
print_usage_and_exit
fi
if [ -z "$TEMPLATENAME" ]; if [ -z "$MODE" ];
then then
>&2 echo "ERROR : missing argument 2 (templateName)" >&2 echo "ERROR : missing argument 1 (mode)"
print_usage_and_exit print_usage_and_exit
fi fi
if [ -z "$EXPNAME" ]; if [ -z "$EXPPATH" ];
then then
>&2 echo "ERROR : missing argument 3 (expName)" >&2 echo "ERROR : missing argument 2 (expPath)"
print_usage_and_exit print_usage_and_exit
fi fi
shift
shift
shift shift
shift shift
if [ "$KEYWORD" = "." ] if [ ! -d "$EXPPATH" ]; then
then >&2 echo "ERROR : directory $EXPPATH doesn't exist"
KEYWORD="" print_usage_and_exit
fi fi
TRAIN=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.conllu') source $EXPPATH"/config"
TRAINRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*train*.txt')
DEV=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.conllu') TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
DEVRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*dev*.txt') TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
TEST=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.conllu') DEV=$(find $CORPUS -type f -name '*dev*.conllu')
TESTRAW=$(find $UD_ROOT*$KEYWORD -type f -name '*test*.txt') DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
TEST=$(find $CORPUS -type f -name '*test*.conllu')
TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
then then
...@@ -63,32 +52,23 @@ fi ...@@ -63,32 +52,23 @@ fi
if test ! -f $TRAIN; if test ! -f $TRAIN;
then then
>&2 echo "ERROR : no train file found with keyword" $KEYWORD >&2 echo "ERROR : no train file found in" $CORPUS
>&2 echo "$TRAIN" >&2 echo "$TRAIN"
print_usage_and_exit print_usage_and_exit
fi fi
mkdir -p bin CURDIR=$(pwd)
cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR
if [ ! -d "$TEMPLATENAME" ]; then
>&2 echo "ERROR : directory $TEMPLATENAME doesn't exist"
print_usage_and_exit
fi
rm -rf bin/$EXPNAME
cp -r $TEMPLATENAME bin/$EXPNAME
cp -r "data" bin/$EXPNAME/.
EVALCONLL="../scripts/conll18_ud_eval.py"
MCD=$EXPPATH"/data/*\.mcd"
if [ "$MODE" = "tsv" ]; then if [ "$MODE" = "tsv" ]; then
macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1 macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $@ || exit 1
exit 0 exit 0
fi fi
if [ "$MODE" = "txt" ]; then if [ "$MODE" = "txt" ]; then
macaon train --model bin/$EXPNAME --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1 macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $@ || exit 1
exit 0 exit 0
fi fi
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment