From f1f18db2f1d9777e01b7a18d2f43333d330f5d9e Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Sat, 7 Mar 2020 18:18:05 +0100
Subject: [PATCH] Updated for tokenizer

---
 UD_any/data/Makefile        | 6 +++---
 UD_any/tokenizer/machine.rm | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 0db1822..29a25b1 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -22,9 +22,9 @@ all_no_test.conllu:
 	cat $(TRAIN_FILES) > $@
 
 tokenizer.ts: all_no_test.conllu $(MCD)
-	echo "IGNORECHAR" > $@
+	echo "ENDWORD" > $@
+	echo "IGNORECHAR" >> $@
 	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
-	echo "ENDWORD" >> $@
 	echo "ADDCHARTOWORD" >> $@
 	sed -i -e 's/^/<tokenizer> /' $@
 
@@ -40,7 +40,7 @@ columns: all_no_test.conllu $(MCD)
 	cat tagger.ts parser.ts > taggerparser.ts
 
 texts:
-	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILEs) $(TEST_FILES)
+	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
 $(FPLM_FILENAME): all_no_test.conllu $(MCD)
 	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
diff --git a/UD_any/tokenizer/machine.rm b/UD_any/tokenizer/machine.rm
index 49d69ac..165e3c1 100644
--- a/UD_any/tokenizer/machine.rm
+++ b/UD_any/tokenizer/machine.rm
@@ -1,6 +1,6 @@
 Name : Tokenizer Machine
 Classifier : tokenizer CNN(4,0,0,{FORM},{-1,0},{},{FORM},{10}) data/tokenizer.ts
-Predictions : FORM
+Predictions : ID FORM EOS
 Strategy : sequential
 	tokenizer tokenizer ENDWORD 1
 	tokenizer tokenizer SPLITWORD 1
-- 
GitLab