diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 36c4a9cdb22ccd7592f1988b18663345118ce970..7f76b6509954ac1cca4d0945fc510be2ef0f27ec 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -1,5 +1,6 @@
 SCRIPTS=../../../../scripts
 CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
+CONLL2LINES=$(SCRIPTS)/conllu_to_lines.sh
 
 TRAIN_FILES=$(shell find . -type f -name '*train*.conllu')
 DEV_FILES=$(shell find . -type f -name '*dev*.conllu')
@@ -9,7 +10,9 @@ TEST_FILES=$(shell find . -type f -name '*test*.conllu')
 THRESHOLD=10
 FPLM_FILENAME=fplm
 
-all: writescore_TIME.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
+all_text: writescore_TIME.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
+	rm -f all_no_test.conllu
+all_lines: writescore_TIME.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain
 	rm -f all_no_test.conllu
 
 all_no_test.conllu:
@@ -50,6 +53,9 @@ transitions: all_no_test.conllu
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
+texts_lines:
+	./getRawText.py $(CONLL2LINES) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
+
 pretrain:
 	for col in FORM UPOS LEMMA FEATS DEPREL LETTERS ; do \
 		./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 64 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \
diff --git a/UD_any/train.sh b/UD_any/train.sh
index da5ac4aa5f4f2fe0949f1d42c49baa163345b1d2..c79962c38eae8c2fba18d8c1b12e9f836a8c9f65 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -31,8 +31,14 @@ if [ ! -d "$EXPPATH" ]; then
 	print_usage_and_exit
 fi
 
+TARGET="all_text"
+if [[ "$*" == *--lineByLine* ]]
+then
+	TARGET="all_lines"
+fi
+
 CURDIR=$(pwd)
-cd $EXPPATH"/"data && make -s clean && make -s
+cd $EXPPATH"/"data && make -s clean && make $TARGET -s
 cd $CURDIR
 
 TRAIN=$EXPPATH"/data/train.conllu"
diff --git a/scripts/conllu_to_lines.sh b/scripts/conllu_to_lines.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ad98780f547176ebe06a5b90533ee42f65edd9f8
--- /dev/null
+++ b/scripts/conllu_to_lines.sh
@@ -0,0 +1,3 @@
+#! /usr/bin/env bash
+
+grep "# text =" $1 | cut -c '10-'