From ac4b5d0a5ac5ecfc24ddf71da1833d91cbbe67a9 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 2 Apr 2020 22:47:14 +0200
Subject: [PATCH] Adapted to new split transition

---
Notes (ignored by git am):
 * UD_any/data/Makefile: the tokenizer.ts recipe now redirects the output of
   conllu2splits.py to splitwords.ts instead of appending it to tokenizer.ts,
   appends the lines "SPLIT 0", "SPLIT 1" and "SPLIT 2" to tokenizer.ts, and
   prefixes every line of both files with "<tokenizer> ". splitwords.ts is
   written as a side effect of the tokenizer.ts rule; it has no rule of its
   own in this hunk.
 * UD_any/tokenizer/machine.rm: the classifier line gains ID next to FORM
   (sizes {1,10}), a "Splitwords : data/splitwords.ts" entry is added, and the
   strategy transition SPLITWORD is renamed SPLIT.
 * NOTE(review): line breaks and leading whitespace were reconstructed from
   the hunk line counts and the diffstat. Blank context lines and the
   indentation of the machine.rm strategy lines (TAB assumed) should be
   confirmed with `git apply --check` against the parent commit.

 UD_any/data/Makefile        | 6 +++++-
 UD_any/tokenizer/machine.rm | 5 +++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index b15a589..904e3fa 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -23,10 +23,14 @@ all_no_test.conllu:
 
 tokenizer.ts: all_no_test.conllu $(MCD)
 	echo "ENDWORD" > $@
-	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
+	$(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt
+	echo "SPLIT 0" >> $@
+	echo "SPLIT 1" >> $@
+	echo "SPLIT 2" >> $@
 	echo "ADDCHARTOWORD" >> $@
 	echo "IGNORECHAR" >> $@
 	sed -i -e 's/^/<tokenizer> /' $@
+	sed -i -e 's/^/<tokenizer> /' splitwords.ts
 
 segmenter.ts:
 	echo "EOS b.0" > $@
diff --git a/UD_any/tokenizer/machine.rm b/UD_any/tokenizer/machine.rm
index 093a9bb..5ff7eb2 100644
--- a/UD_any/tokenizer/machine.rm
+++ b/UD_any/tokenizer/machine.rm
@@ -1,7 +1,8 @@
 Name : Tokenizer Machine
-Classifier : tokenizer LSTM(-1,{-3,-2,-1},{},{FORM},{-1,0},{},{FORM},{10},5,5) data/tokenizer.ts
+Classifier : tokenizer LSTM(-1,{-3,-2,-1},{},{FORM},{-1,0},{},{ID,FORM},{1,10},5,5) data/tokenizer.ts
+Splitwords : data/splitwords.ts
 Predictions : ID FORM EOS
 Strategy : sequential
 	tokenizer tokenizer ENDWORD 1
-	tokenizer tokenizer SPLITWORD 1
+	tokenizer tokenizer SPLIT 1
 	tokenizer tokenizer 0
-- 
GitLab