Skip to content
Snippets Groups Projects
Commit 26f25b22 authored by Franck Dary's avatar Franck Dary
Browse files

Corpora are now copied into the exp dir to avoid conflicts between multiple experiments

parent a301e461
No related branches found
No related tags found
No related merge requests found
include ../config
SCRIPTS=../../../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu')
DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu')
TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
TRAIN_FILES=$(shell find . -type f -name '*train*.conllu')
DEV_FILES=$(shell find . -type f -name '*dev*.conllu')
TEST_FILES=$(shell find . -type f -name '*test*.conllu')
# This part is for lemmatizer rules and exceptions computation
THRESHOLD=10
......@@ -45,14 +43,13 @@ texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
pretrain: texts
./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64
./pretrainEmbeddings.py train.txt 64 pretrained.w2v
$(FPLM_FILENAME): all_no_test.conllu
$(SCRIPTS)/conllu2fplm.py $< > $@
clean:
- rm -f *\.txt
- rm -f *\.conll*
- rm -f *\.ts
- rm -f ambiguities\.txt
- rm -f $(FPLM_FILENAME)
......@@ -6,17 +6,17 @@ import subprocess
from shutil import which
def printUsageAndExit() :
print("USAGE : %s file.conllu embeddingsSize"%sys.argv[0], file=sys.stderr)
print("USAGE : %s file.conllu embeddingsSize outputFile"%sys.argv[0], file=sys.stderr)
exit(1)
if __name__ == "__main__" :
if len(sys.argv) != 3 :
if len(sys.argv) != 4 :
printUsageAndExit()
pathToFile = sys.argv[1]
embeddingsSize = int(sys.argv[2])
splited = os.path.splitext(pathToFile)
target = splited[0] + ".w2v"
target = sys.argv[3]
if which("word2vec") is None :
exit(0)
......
#! /usr/bin/env bash
# has_space STRING
# Succeeds (status 0) when STRING contains at least one whitespace character,
# fails (status 1) otherwise. The check strips the shortest suffix that begins
# with a whitespace character and compares the result against the original.
function has_space {
  local candidate="$1"
  if [[ "$candidate" != "${candidate%[[:space:]]*}" ]]; then
    return 0
  fi
  return 1
}
function print_usage_and_exit {
>&2 echo "USAGE : (tsv | txt) expPath [arguments]"
exit 1
......@@ -32,26 +28,16 @@ if [ ! -d "$EXPPATH" ]; then
print_usage_and_exit
fi
source $EXPPATH"/config"
TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
DEV=$(find $CORPUS -type f -name '*dev*.conllu')
DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
TEST=$(find $CORPUS -type f -name '*test*.conllu')
TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
TRAIN=$EXPPATH"/data/train.conllu"
TRAINRAW=$EXPPATH"/data/train.txt"
DEV=$EXPPATH"/data/dev.conllu"
DEVRAW=$EXPPATH"/data/dev.txt"
TEST=$EXPPATH"/data/test.conllu"
TESTRAW=$EXPPATH"/data/test.txt"
REF=$TEST
REFRAW=$TESTRAW
if has_space "$REF" || has_space "$REFRAW";
then
>&2 echo "ERROR : more than 1 match"
>&2 echo "REF : " $REF
>&2 echo "REFRAW : " $REFRAW
print_usage_and_exit
fi
if test ! -f $REF;
then
>&2 echo "ERROR : no ref file found in" $CORPUS
......
......@@ -7,6 +7,10 @@ function print_usage_and_exit {
exit 1
}
# has_space STRING
# Returns 0 if STRING holds any whitespace character and 1 if it holds none.
# Removing the shortest suffix that starts with whitespace changes the string
# only when such a character is present.
function has_space {
  local value="$1"
  if [[ "$value" != "${value%[[:space:]]*}" ]]; then
    return 0
  else
    return 1
  fi
}
LANG=$1
TEMPLATENAME=$2
EXPNAME=$3
......@@ -41,12 +45,32 @@ if [ ! -d "$CORPUS" ]; then
print_usage_and_exit
fi
mkdir -p bin
TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
DEV=$(find $CORPUS -type f -name '*dev*.conllu')
TEST=$(find $CORPUS -type f -name '*test*.conllu')
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
then
>&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
>&2 echo "TRAIN : " $TRAIN
>&2 echo "DEV : " $DEV
>&2 echo "TEST : " $TEST
print_usage_and_exit
fi
mkdir -p bin
if [ ! -d "bin/$EXPNAME" ]; then
cp -r $TEMPLATENAME bin/$EXPNAME
cp -r "data" bin/$EXPNAME/.
echo "CORPUS="$CORPUS > bin/$EXPNAME/config
if [ -f "$TRAIN" ]; then
cp $TRAIN bin/$EXPNAME/data/train.conllu
fi
if [ -f "$DEV" ]; then
cp $DEV bin/$EXPNAME/data/dev.conllu
fi
if [ -f "$TEST" ]; then
cp $TEST bin/$EXPNAME/data/test.conllu
fi
fi
#! /usr/bin/env bash
# has_space STRING
# Exit status 0 when STRING contains whitespace, 1 when it does not.
# Implemented by comparing STRING with itself minus the shortest trailing
# part that begins with a whitespace character.
function has_space {
  local text="$1"
  local trimmed="${text%[[:space:]]*}"
  if [[ "$text" != "$trimmed" ]]; then
    return 0
  fi
  return 1
}
function print_usage_and_exit {
>&2 echo "USAGE : (tsv | txt) expPath [arguments]"
exit 1
......@@ -12,6 +8,9 @@ function print_usage_and_exit {
MODE=$1
EXPPATH=$2
>&2 echo "********************************************************************************"
>&2 echo "Training : "$EXPPATH
if [ -z "$MODE" ];
then
>&2 echo "ERROR : missing argument 1 (mode)"
......@@ -32,42 +31,40 @@ if [ ! -d "$EXPPATH" ]; then
print_usage_and_exit
fi
source $EXPPATH"/config"
# Remember the starting directory (kept for any later code that reads CURDIR).
CURDIR=$(pwd)
# Regenerate the experiment's data files. Running the chain in a subshell
# guarantees the script's own working directory is unchanged afterwards, even
# when `make clean` or `make` fails; the previous `... && cd $CURDIR` form left
# the script stranded inside the data directory on failure, which broke every
# later relative $EXPPATH lookup. The path is quoted so that an experiment
# path containing spaces is handled correctly.
(cd "$EXPPATH/data" && make -s clean && make -s)
TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
TRAINRAW=$(find $CORPUS -type f -name '*train*.txt')
DEV=$(find $CORPUS -type f -name '*dev*.conllu')
DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
TEST=$(find $CORPUS -type f -name '*test*.conllu')
TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
W2V=$(find $CORPUS -type f -name '*.w2v')
TRAIN=$EXPPATH"/data/train.conllu"
TRAINRAW=$EXPPATH"/data/train.txt"
DEV=$EXPPATH"/data/dev.conllu"
DEVRAW=$EXPPATH"/data/dev.txt"
TEST=$EXPPATH"/data/test.conllu"
TESTRAW=$EXPPATH"/data/test.txt"
W2V=$EXPPATH"/data/pretrained.w2v"
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
if test ! -f $TRAIN;
then
>&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
>&2 echo "TRAIN : " $TRAIN
>&2 echo "DEV : " $DEV
>&2 echo "TEST : " $TEST
>&2 echo "ERROR : no train file found in" $EXPPATH
>&2 echo "$TRAIN"
print_usage_and_exit
fi
if test -z $TRAIN;
if test ! -f $DEV;
then
>&2 echo "ERROR : no train file found in" $CORPUS
>&2 echo "$TRAIN"
print_usage_and_exit
DEV=""
fi
if [ "$MODE" = "txt" ]; then
if test -z $TRAINRAW;
if test ! -f $TRAINRAW;
then
>&2 echo "ERROR : no train file found in" $CORPUS
>&2 echo "ERROR : no train file found in" $EXPPATH
>&2 echo "$TRAINRAW"
print_usage_and_exit
fi
if test ! -f $DEVRAW;
then
DEVRAW=""
fi
fi
if test -f $W2V;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment