Skip to content
Snippets Groups Projects
Commit 53f196fc authored by Franck Dary's avatar Franck Dary
Browse files

Allowing to pass mcd to training

parent 1e001f9b
No related branches found
No related tags found
No related merge requests found
# TODO give mcd argument to all scripts
SCRIPTS=../../../../scripts
CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
CONLL2LINES=$(SCRIPTS)/conllu_to_lines.sh
TRAIN_FILES=$(shell find . -type f -name '*train*.conllu')
DEV_FILES=$(shell find . -type f -name '*dev*.conllu')
TEST_FILES=$(shell find . -type f -name '*test*.conllu')
TRAIN_FILES=$(shell find . -name '*train*.conllu')
DEV_FILES=$(shell find . -name '*dev*.conllu')
TEST_FILES=$(shell find . -name '*test*.conllu')
# This part is for lemmatizer rules and exceptions computation
THRESHOLD=10
FPLM_FILENAME=fplm
all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
all_text: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts transitions pretrain
rm -f all_no_test.conllu
all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain
all_lines: writescore_NFIX.ts writescore_FFD.ts writescore_GPT.ts writescore_TRT.ts writescore_FIXPROP.ts tokenizer.ts segmenter.ts texts_lines transitions pretrain
rm -f all_no_test.conllu
all_no_test.conllu:
cat $(TRAIN_FILES) $(DEV_FILES) > $@
tokenizer.ts: all_no_test.conllu
tokenizer.ts: train.conllu
echo "ENDWORD" > $@
$(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt
$(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt
echo "SPLIT 0" >> $@
echo "SPLIT 1" >> $@
echo "SPLIT 2" >> $@
......@@ -59,8 +57,8 @@ writescore_TRT.ts:
writescore_FIXPROP.ts:
echo "WRITESCORE b.0 FIXPROP" > $@
transitions: all_no_test.conllu
./getTransitionSets.py $<
transitions: train.conllu
./getTransitionSets.py $< $(MCD)
texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
......@@ -73,7 +71,7 @@ pretrain:
./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 128 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \
done
$(FPLM_FILENAME): all_no_test.conllu
$(FPLM_FILENAME): train.conllu
$(SCRIPTS)/conllu2fplm.py $< > $@
clean:
......
......@@ -7,17 +7,20 @@ sys.path.insert(1, '../../../../scripts')
from readMCD import readMCD
def printUsageAndExit() :
print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr)
exit(1)
if __name__ == "__main__" :
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
if len(sys.argv) != 2 :
if len(sys.argv) < 2 :
printUsageAndExit()
mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
if len(sys.argv) == 3 :
mcd = sys.argv[2].replace(",", " ")
col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
col2index, index2col = readMCD(mcd)
fileContent = []
......
......@@ -45,9 +45,9 @@ if [ ! -d "$CORPUS" ]; then
print_usage_and_exit
fi
TRAIN=$(find $CORPUS -type f -name '*train*.conllu')
DEV=$(find $CORPUS -type f -name '*dev*.conllu')
TEST=$(find $CORPUS -type f -name '*test*.conllu')
TRAIN=$(find $CORPUS -name '*train*.conllu')
DEV=$(find $CORPUS -name '*dev*.conllu')
TEST=$(find $CORPUS -name '*test*.conllu')
W2V=$(find $CORPUS -name '*.w2v')
if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
......@@ -65,17 +65,17 @@ if [ ! -d "bin/$EXPNAME" ]; then
cp -r $TEMPLATENAME bin/$EXPNAME
cp -r "data" bin/$EXPNAME/.
if [ -f "$TRAIN" ]; then
cp -P $TRAIN bin/$EXPNAME/data/train.conllu
ln -s $(readlink -f $TRAIN) bin/$EXPNAME/data/train.conllu
fi
if [ -f "$DEV" ]; then
cp -P $DEV bin/$EXPNAME/data/dev.conllu
ln -s $(readlink -f $DEV) bin/$EXPNAME/data/dev.conllu
fi
if [ -f "$TEST" ]; then
cp -P $TEST bin/$EXPNAME/data/test.conllu
ln -s $(readlink -f $TEST) bin/$EXPNAME/data/test.conllu
fi
if [ ! -z "$W2V" ]; then
mkdir -p bin/$EXPNAME/data/W2V/
cp -P $W2V bin/$EXPNAME/data/W2V/
ln -s $(readlink -f $W2V) bin/$EXPNAME/data/W2V/
fi
fi
......@@ -36,6 +36,21 @@ shift
shift
shift
# Default comma-separated list of column names, used unless --mcd overrides it.
MCD="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL"
# NO is a one-shot flag: "1" means the previous argument was --mcd, so the
# current argument is taken as its value.
NO=""
# Scan the current positional arguments for "--mcd <value>" without shifting
# them. If --mcd appears several times the last value wins; a trailing --mcd
# with no value after it leaves MCD unchanged.
for arg in "$@"
do
if [ "$NO" = "1" ]
then
MCD="$arg"
NO=""
fi
if [ "$arg" = "--mcd" ]
then
NO="1"
fi
done
if [ ! -d "$EXPPATH" ]; then
>&2 echo "ERROR : directory $EXPPATH doesn't exist"
print_usage_and_exit
......@@ -48,7 +63,7 @@ then
fi
CURDIR=$(pwd)
cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED make $TARGET -s
cd $EXPPATH"/"data && make -s clean && PRETRAINED_COLS=$PRETRAINED MCD=$MCD make $TARGET -s
cd $CURDIR
TRAIN=$EXPPATH"/data/train.conllu"
......
......@@ -731,13 +731,15 @@ def main() :
help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").")
parser.add_argument("--extra", "-x", default="",
help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").")
parser.add_argument("--mcd", default="ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC",
help="Comma separated list of column.")
args = parser.parse_args()
errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')
global col2index
global index2col
col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
col2index, index2col = readMCD(args.mcd.replace(",", " "))
# Evaluate
gold_ud, evaluations = evaluate_wrapper(args)
......
......@@ -7,7 +7,7 @@ rules = {}
prefix = "SPLITWORD "
def printUsageAndExit() :
print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
print("USAGE : %s file.conllu [mcd]"%sys.argv[0], file=sys.stderr)
exit(1)
def computeRules(sentence) :
......@@ -36,10 +36,14 @@ def computeRules(sentence) :
def main() :
sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
if len(sys.argv) != 2 :
if len(sys.argv) < 2 :
printUsageAndExit()
if len(sys.argv) == 3 :
mcd = sys.argv[2].replace(",", " ")
else :
mcd = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"
col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
col2index, index2col = readMCD(mcd)
sentence = []
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment