From b7aefa899742ec1c7260253ae0252edbbdbb2122 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 11 Feb 2021 08:39:45 +0100
Subject: [PATCH] Added support for option lineByLine

---
Review notes (text between the '---' line and the first 'diff --git' line is
not part of the commit message):

 * train.sh builds the Makefile goal 'all_lines' when its argument list
   contains '--lineByLine', and 'all_text' otherwise.
 * 'all_lines' has the same prerequisites and recipe as 'all_text', except that
   it depends on 'texts_lines' instead of 'texts'; 'texts_lines' hands
   conllu_to_lines.sh (instead of conllu_to_text.pl) to getRawText.py.
 * conllu_to_lines.sh prints, for every '# text =' line of the file given as
   $1, the characters from column 10 onwards.
 * Changed during review, on added lines only (hunk sizes are unchanged):
   - conllu_to_lines.sh: "$1" is quoted and the grep pattern is anchored at the
     start of the line, so only real '# text =' comment lines are cut;
   - train.sh: the data directory and the make goal are quoted, and '-s' is
     placed before the goal like in the neighbouring 'make -s clean'.
 * NOTE(review): the leading whitespace of context lines was reconstructed; if
   a hunk is rejected for whitespace reasons, retry with --ignore-whitespace.
   The blob ids on the 'index' lines are those of the original submission.

 UD_any/data/Makefile       | 8 +++++++-
 UD_any/train.sh            | 8 +++++++-
 scripts/conllu_to_lines.sh | 3 +++
 3 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100755 scripts/conllu_to_lines.sh

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 36c4a9c..7f76b65 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -1,5 +1,6 @@
 SCRIPTS=../../../../scripts
 CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
+CONLL2LINES=$(SCRIPTS)/conllu_to_lines.sh
 
 TRAIN_FILES=$(shell find . -type f -name '*train*.conllu')
 DEV_FILES=$(shell find . -type f -name '*dev*.conllu')
@@ -9,7 +10,9 @@ TEST_FILES=$(shell find . -type f -name '*test*.conllu')
 THRESHOLD=10
 FPLM_FILENAME=fplm
 
-all: writescore_TIME.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
+all_text: writescore_TIME.ts tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
+	rm -f all_no_test.conllu
+all_lines: writescore_TIME.ts tokenizer.ts segmenter.ts texts_lines all_no_test.conllu transitions pretrain
 	rm -f all_no_test.conllu
 
 all_no_test.conllu:
@@ -50,6 +53,9 @@ transitions: all_no_test.conllu
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
+texts_lines:
+	./getRawText.py $(CONLL2LINES) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
+
 pretrain:
 	for col in FORM UPOS LEMMA FEATS DEPREL LETTERS ; do \
 	./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 64 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \
diff --git a/UD_any/train.sh b/UD_any/train.sh
index da5ac4a..c79962c 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -31,8 +31,14 @@ if [ ! -d "$EXPPATH" ]; then
   print_usage_and_exit
 fi
 
+TARGET="all_text"
+if [[ "$*" == *--lineByLine* ]]
+then
+  TARGET="all_lines"
+fi
+
 CURDIR=$(pwd)
-cd $EXPPATH"/"data && make -s clean && make -s
+cd "$EXPPATH/data" && make -s clean && make -s "$TARGET"
 cd $CURDIR
 
 TRAIN=$EXPPATH"/data/train.conllu"
diff --git a/scripts/conllu_to_lines.sh b/scripts/conllu_to_lines.sh
new file mode 100755
index 0000000..ad98780
--- /dev/null
+++ b/scripts/conllu_to_lines.sh
@@ -0,0 +1,3 @@
+#! /usr/bin/env bash
+# Print what follows "# text = " on each such comment line of the file $1.
+grep '^# text =' "$1" | cut -c 10-
-- 
GitLab
