Skip to content
Snippets Groups Projects
Commit 26f25b22 authored by Franck Dary
Browse files

Corpora are now copied into the experiment directory to avoid conflicts between multiple experiments

parent a301e461
No related branches found
No related tags found
No related merge requests found
include ../config
SCRIPTS=../../../../scripts SCRIPTS=../../../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu') TRAIN_FILES=$(shell find . -type f -name '*train*.conllu')
DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu') DEV_FILES=$(shell find . -type f -name '*dev*.conllu')
TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu') TEST_FILES=$(shell find . -type f -name '*test*.conllu')
#This part is for lemmatizer rules and exceptions computation #This part is for lemmatizer rules and exceptions computation
THRESHOLD=10 THRESHOLD=10
...@@ -45,14 +43,13 @@ texts: ...@@ -45,14 +43,13 @@ texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES) ./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
pretrain: texts pretrain: texts
./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64 ./pretrainEmbeddings.py train.txt 64 pretrained.w2v
$(FPLM_FILENAME): all_no_test.conllu $(FPLM_FILENAME): all_no_test.conllu
$(SCRIPTS)/conllu2fplm.py $< > $@ $(SCRIPTS)/conllu2fplm.py $< > $@
clean: clean:
- rm -f *\.txt
- rm -f *\.conll*
- rm -f *\.ts - rm -f *\.ts
- rm -f ambiguities\.txt
- rm -f $(FPLM_FILENAME) - rm -f $(FPLM_FILENAME)
...@@ -6,17 +6,17 @@ import subprocess ...@@ -6,17 +6,17 @@ import subprocess
from shutil import which from shutil import which
def printUsageAndExit() : def printUsageAndExit() :
print("USAGE : %s file.conllu embeddingsSize"%sys.argv[0], file=sys.stderr) print("USAGE : %s file.conllu embeddingsSize outputFile"%sys.argv[0], file=sys.stderr)
exit(1) exit(1)
if __name__ == "__main__" : if __name__ == "__main__" :
if len(sys.argv) != 3 : if len(sys.argv) != 4 :
printUsageAndExit() printUsageAndExit()
pathToFile = sys.argv[1] pathToFile = sys.argv[1]
embeddingsSize = int(sys.argv[2]) embeddingsSize = int(sys.argv[2])
splited = os.path.splitext(pathToFile) splited = os.path.splitext(pathToFile)
target = splited[0] + ".w2v" target = sys.argv[3]
if which("word2vec") is None : if which("word2vec") is None :
exit(0) exit(0)
......
#! /usr/bin/env bash #! /usr/bin/env bash
# has_space STRING
# Exit status 0 when STRING contains at least one whitespace character
# (space, tab, newline, ...), exit status 1 otherwise.
function has_space {
  # Stripping the shortest suffix that begins with a whitespace character
  # leaves the string untouched only when it holds no whitespace at all.
  if [[ "$1" == "${1%[[:space:]]*}" ]]; then
    return 1
  fi
  return 0
}
function print_usage_and_exit { function print_usage_and_exit {
>&2 echo "USAGE : (tsv | txt) expPath [arguments]" >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
exit 1 exit 1
...@@ -32,26 +28,16 @@ if [ ! -d "$EXPPATH" ]; then ...@@ -32,26 +28,16 @@ if [ ! -d "$EXPPATH" ]; then
print_usage_and_exit print_usage_and_exit
fi fi
source $EXPPATH"/config" TRAIN=$EXPPATH"/data/train.conllu"
TRAINRAW=$EXPPATH"/data/train.txt"
TRAIN=$(find $CORPUS -type f -name '*train*.conllu') DEV=$EXPPATH"/data/dev.conllu"
TRAINRAW=$(find $CORPUS -type f -name '*train*.txt') DEVRAW=$EXPPATH"/data/dev.txt"
DEV=$(find $CORPUS -type f -name '*dev*.conllu') TEST=$EXPPATH"/data/test.conllu"
DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') TESTRAW=$EXPPATH"/data/test.txt"
TEST=$(find $CORPUS -type f -name '*test*.conllu')
TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
REF=$TEST REF=$TEST
REFRAW=$TESTRAW REFRAW=$TESTRAW
if has_space "$REF" || has_space "$REFRAW";
then
>&2 echo "ERROR : more than 1 match"
>&2 echo "REF : " $REF
>&2 echo "REFRAW : " $REFRAW
print_usage_and_exit
fi
if test ! -f $REF; if test ! -f $REF;
then then
>&2 echo "ERROR : no ref file found in" $CORPUS >&2 echo "ERROR : no ref file found in" $CORPUS
......
...@@ -7,6 +7,10 @@ function print_usage_and_exit { ...@@ -7,6 +7,10 @@ function print_usage_and_exit {
exit 1 exit 1
} }
# has_space STRING
# Returns 0 if STRING contains any whitespace character, 1 if it does not.
function has_space {
  # Drop everything from the last whitespace character onward; the result
  # differs from the input exactly when such a character exists.
  local stripped="${1%[[:space:]]*}"
  if [[ "$stripped" != "$1" ]]; then
    return 0
  fi
  return 1
}
LANG=$1 LANG=$1
TEMPLATENAME=$2 TEMPLATENAME=$2
EXPNAME=$3 EXPNAME=$3
...@@ -41,12 +45,32 @@ if [ ! -d "$CORPUS" ]; then ...@@ -41,12 +45,32 @@ if [ ! -d "$CORPUS" ]; then
print_usage_and_exit print_usage_and_exit
fi fi
mkdir -p bin TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
DEV=$(find $CORPUS -type f -name '*dev*.conllu')
TEST=$(find $CORPUS -type f -name '*test*.conllu')
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
then
>&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
>&2 echo "TRAIN : " $TRAIN
>&2 echo "DEV : " $DEV
>&2 echo "TEST : " $TEST
print_usage_and_exit
fi
mkdir -p bin
if [ ! -d "bin/$EXPNAME" ]; then if [ ! -d "bin/$EXPNAME" ]; then
cp -r $TEMPLATENAME bin/$EXPNAME cp -r $TEMPLATENAME bin/$EXPNAME
cp -r "data" bin/$EXPNAME/. cp -r "data" bin/$EXPNAME/.
echo "CORPUS="$CORPUS > bin/$EXPNAME/config if [ -f "$TRAIN" ]; then
cp $TRAIN bin/$EXPNAME/data/train.conllu
fi
if [ -f "$DEV" ]; then
cp $DEV bin/$EXPNAME/data/dev.conllu
fi
if [ -f "$TEST" ]; then
cp $TEST bin/$EXPNAME/data/test.conllu
fi
fi fi
#! /usr/bin/env bash #! /usr/bin/env bash
# has_space STRING
# Status 0 when STRING holds at least one whitespace character, else status 1.
function has_space {
  # If removing the shortest suffix that starts at a whitespace character
  # changes nothing, the string has no whitespace.
  [[ "$1" == "${1%[[:space:]]*}" ]] || return 0
  return 1
}
function print_usage_and_exit { function print_usage_and_exit {
>&2 echo "USAGE : (tsv | txt) expPath [arguments]" >&2 echo "USAGE : (tsv | txt) expPath [arguments]"
exit 1 exit 1
...@@ -12,6 +8,9 @@ function print_usage_and_exit { ...@@ -12,6 +8,9 @@ function print_usage_and_exit {
MODE=$1 MODE=$1
EXPPATH=$2 EXPPATH=$2
>&2 echo "********************************************************************************"
>&2 echo "Training : "$EXPPATH
if [ -z "$MODE" ]; if [ -z "$MODE" ];
then then
>&2 echo "ERROR : missing argument 1 (mode)" >&2 echo "ERROR : missing argument 1 (mode)"
...@@ -32,42 +31,40 @@ if [ ! -d "$EXPPATH" ]; then ...@@ -32,42 +31,40 @@ if [ ! -d "$EXPPATH" ]; then
print_usage_and_exit print_usage_and_exit
fi fi
source $EXPPATH"/config"
CURDIR=$(pwd) CURDIR=$(pwd)
cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR cd $EXPPATH"/"data && make -s clean && make -s && cd $CURDIR
TRAIN=$(find $CORPUS -type f -name '*train*.conllu') TRAIN=$EXPPATH"/data/train.conllu"
TRAINRAW=$(find $CORPUS -type f -name '*train*.txt') TRAINRAW=$EXPPATH"/data/train.txt"
DEV=$(find $CORPUS -type f -name '*dev*.conllu') DEV=$EXPPATH"/data/dev.conllu"
DEVRAW=$(find $CORPUS -type f -name '*dev*.txt') DEVRAW=$EXPPATH"/data/dev.txt"
TEST=$(find $CORPUS -type f -name '*test*.conllu') TEST=$EXPPATH"/data/test.conllu"
TESTRAW=$(find $CORPUS -type f -name '*test*.txt') TESTRAW=$EXPPATH"/data/test.txt"
W2V=$(find $CORPUS -type f -name '*.w2v') W2V=$EXPPATH"/data/pretrained.w2v"
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST"; if test ! -f $TRAIN;
then then
>&2 echo "ERROR : more than 1 match with keyword" $KEYWORD >&2 echo "ERROR : no train file found in" $EXPPATH
>&2 echo "TRAIN : " $TRAIN >&2 echo "$TRAIN"
>&2 echo "DEV : " $DEV
>&2 echo "TEST : " $TEST
print_usage_and_exit print_usage_and_exit
fi fi
if test -z $TRAIN; if test ! -f $DEV;
then then
>&2 echo "ERROR : no train file found in" $CORPUS DEV=""
>&2 echo "$TRAIN"
print_usage_and_exit
fi fi
if [ "$MODE" = "txt" ]; then if [ "$MODE" = "txt" ]; then
if test -z $TRAINRAW; if test ! -f $TRAINRAW;
then then
>&2 echo "ERROR : no train file found in" $CORPUS >&2 echo "ERROR : no train file found in" $EXPPATH
>&2 echo "$TRAINRAW" >&2 echo "$TRAINRAW"
print_usage_and_exit print_usage_and_exit
fi fi
if test ! -f $DEVRAW;
then
DEVRAW=""
fi
fi fi
if test -f $W2V; if test -f $W2V;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment