diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile index 37640dd6decadc9cfb2a574296369dc4da234803..5e8ee609c87b3f1b8047f737594d53c2be6d95fd 100644 --- a/UD_any/data/Makefile +++ b/UD_any/data/Makefile @@ -1,11 +1,9 @@ -include ../config - SCRIPTS=../../../../scripts CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl -TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu') -DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu') -TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu') +TRAIN_FILES=$(shell find . -type f -name '*train*.conllu') +DEV_FILES=$(shell find . -type f -name '*dev*.conllu') +TEST_FILES=$(shell find . -type f -name '*test*.conllu') #This part is for lemmatizer rules and excpetions computation THRESHOLD=10 @@ -45,14 +43,13 @@ texts: ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) pretrain: texts - ./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64 + ./pretrainEmbeddings.py train.txt 64 pretrained.w2v $(FPLM_FILENAME): all_no_test.conllu $(SCRIPTS)/conllu2fplm.py $< > $@ clean: - - rm -f *\.txt - - rm -f *\.conll* - rm -f *\.ts + - rm -f ambiguities\.txt - rm -f $(FPLM_FILENAME) diff --git a/UD_any/data/pretrainEmbeddings.py b/UD_any/data/pretrainEmbeddings.py index d9a7f6dc92e2d62dbab341d8dba6b636eb0f6dd4..11a777ffd80a4a4cecb605a5cc66b208a5410e5c 100755 --- a/UD_any/data/pretrainEmbeddings.py +++ b/UD_any/data/pretrainEmbeddings.py @@ -6,17 +6,17 @@ import subprocess from shutil import which def printUsageAndExit() : - print("USAGE : %s file.conllu embeddingsSize"%sys.argv[0], file=sys.stderr) + print("USAGE : %s file.conllu embeddingsSize outputFile"%sys.argv[0], file=sys.stderr) exit(1) if __name__ == "__main__" : - if len(sys.argv) != 3 : + if len(sys.argv) != 4 : printUsageAndExit() pathToFile = sys.argv[1] embeddingsSize = int(sys.argv[2]) splited = os.path.splitext(pathToFile) - target = splited[0] + ".w2v" + target = sys.argv[3] if which("word2vec") is None : exit(0) diff --git a/UD_any/evaluate.sh b/UD_any/evaluate.sh index 4fcaa3823119bd83ef4199a6276179efbaf7ca35..8b8bddab82c2e518edd45a79c8e9dd3d70c13b33 100755 --- a/UD_any/evaluate.sh +++ b/UD_any/evaluate.sh @@ -1,9 +1,5 @@ #! /usr/bin/env bash -function has_space { - [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1 -} - function print_usage_and_exit { >&2 echo "USAGE : (tsv | txt) expPath [arguments]" exit 1 @@ -32,26 +28,16 @@ if [ ! -d "$EXPPATH" ]; then print_usage_and_exit fi -source $EXPPATH"/config" - -TRAIN=$(find $CORPUS -type f -name '*train*.conllu') -TRAINRAW=$(find $CORPUS -type f -name '*train*.txt') -DEV=$(find $CORPUS -type f -name '*dev*.conllu') -DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') -TEST=$(find $CORPUS -type f -name '*test*.conllu') -TESTRAW=$(find $CORPUS -type f -name '*test*.txt') +TRAIN=$EXPPATH"/data/train.conllu" +TRAINRAW=$EXPPATH"/data/train.txt" +DEV=$EXPPATH"/data/dev.conllu" +DEVRAW=$EXPPATH"/data/dev.txt" +TEST=$EXPPATH"/data/test.conllu" +TESTRAW=$EXPPATH"/data/test.txt" REF=$TEST REFRAW=$TESTRAW -if has_space "$REF" || has_space "$REFRAW"; -then - >&2 echo "ERROR : more than 1 match" - >&2 echo "REF : " $REF - >&2 echo "REFRAW : " $REFRAW - print_usage_and_exit -fi - if test ! -f $REF; then >&2 echo "ERROR : no ref file found in" $CORPUS diff --git a/UD_any/prepareExperiment.sh b/UD_any/prepareExperiment.sh index e89c54afde4c8e3072218afbc766b350962ea0a9..bcd32dd7e587ac9d3ec9652004b3689cfa0eee34 100755 --- a/UD_any/prepareExperiment.sh +++ b/UD_any/prepareExperiment.sh @@ -7,6 +7,10 @@ function print_usage_and_exit { exit 1 } +function has_space { + [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1 +} + LANG=$1 TEMPLATENAME=$2 EXPNAME=$3 @@ -41,12 +45,32 @@ if [ ! -d "$CORPUS" ]; then print_usage_and_exit fi -mkdir -p bin +TRAIN=$(find $CORPUS -type f -name '*train*.conllu') +DEV=$(find $CORPUS -type f -name '*dev*.conllu') +TEST=$(find $CORPUS -type f -name '*test*.conllu') + +if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; +then + >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD + >&2 echo "TRAIN : " $TRAIN + >&2 echo "DEV : " $DEV + >&2 echo "TEST : " $TEST + print_usage_and_exit +fi +mkdir -p bin if [ ! -d "bin/$EXPNAME" ]; then -cp -r $TEMPLATENAME bin/$EXPNAME -cp -r "data" bin/$EXPNAME/. -echo "CORPUS="$CORPUS > bin/$EXPNAME/config + cp -r $TEMPLATENAME bin/$EXPNAME + cp -r "data" bin/$EXPNAME/. + if [ -f "$TRAIN" ]; then + cp $TRAIN bin/$EXPNAME/data/train.conllu + fi + if [ -f "$DEV" ]; then + cp $DEV bin/$EXPNAME/data/dev.conllu + fi + if [ -f "$TEST" ]; then + cp $TEST bin/$EXPNAME/data/test.conllu + fi fi diff --git a/UD_any/train.sh b/UD_any/train.sh index 3bc401baa918c82e11d2cb5a97f5df8f59473d55..88c91126a42501e8b81bd74be625d40f3abd76b9 100755 --- a/UD_any/train.sh +++ b/UD_any/train.sh @@ -1,9 +1,5 @@ #! /usr/bin/env bash -function has_space { - [[ "$1" != "${1%[[:space:]]*}" ]] && return 0 || return 1 -} - function print_usage_and_exit { >&2 echo "USAGE : (tsv | txt) expPath [arguments]" exit 1 @@ -12,6 +8,9 @@ function print_usage_and_exit { MODE=$1 EXPPATH=$2 +>&2 echo "********************************************************************************" +>&2 echo "Training : "$EXPPATH + if [ -z "$MODE" ]; then >&2 echo "ERROR : missing argument 1 (mode)" @@ -32,42 +31,40 @@ if [ ! -d "$EXPPATH" ]; then print_usage_and_exit fi -source $EXPPATH"/config" - CURDIR=$(pwd) cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR -TRAIN=$(find $CORPUS -type f -name '*train*.conllu') -TRAINRAW=$(find $CORPUS -type f -name '*train*.txt') -DEV=$(find $CORPUS -type f -name '*dev*.conllu') -DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') -TEST=$(find $CORPUS -type f -name '*test*.conllu') -TESTRAW=$(find $CORPUS -type f -name '*test*.txt') -W2V=$(find $CORPUS -type f -name '*.w2v') +TRAIN=$EXPPATH"/data/train.conllu" +TRAINRAW=$EXPPATH"/data/train.txt" +DEV=$EXPPATH"/data/dev.conllu" +DEVRAW=$EXPPATH"/data/dev.txt" +TEST=$EXPPATH"/data/test.conllu" +TESTRAW=$EXPPATH"/data/test.txt" +W2V=$EXPPATH"/data/pretrained.w2v" -if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; +if test ! -f $TRAIN; then - >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD - >&2 echo "TRAIN : " $TRAIN - >&2 echo "DEV : " $DEV - >&2 echo "TEST : " $TEST + >&2 echo "ERROR : no train file found in" $EXPPATH + >&2 echo "$TRAIN" print_usage_and_exit fi -if test -z $TRAIN; +if test ! -f $DEV; then - >&2 echo "ERROR : no train file found in" $CORPUS - >&2 echo "$TRAIN" - print_usage_and_exit + DEV="" fi if [ "$MODE" = "txt" ]; then -if test -z $TRAINRAW; -then - >&2 echo "ERROR : no train file found in" $CORPUS - >&2 echo "$TRAINRAW" - print_usage_and_exit -fi + if test ! -f $TRAINRAW; + then + >&2 echo "ERROR : no train file found in" $EXPPATH + >&2 echo "$TRAINRAW" + print_usage_and_exit + fi + if test ! -f $DEVRAW; + then + DEVRAW="" + fi fi if test -f $W2V;