diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index b15a589833ef964cc34d302da4e3163e68c31e76..904e3fa0edfe7f91651537a7d25bf3f1b525f729 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -23,10 +23,14 @@ all_no_test.conllu:
 
 tokenizer.ts: all_no_test.conllu $(MCD)
 	echo "ENDWORD" > $@
-	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
+	$(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt
+	echo "SPLIT 0" >> $@
+	echo "SPLIT 1" >> $@
+	echo "SPLIT 2" >> $@
 	echo "ADDCHARTOWORD" >> $@
 	echo "IGNORECHAR" >> $@
 	sed -i -e 's/^/<tokenizer> /' $@
+	sed -i -e 's/^/<tokenizer> /' splitwords.ts
 
 segmenter.ts:
 	echo "EOS b.0" > $@
diff --git a/UD_any/tokenizer/machine.rm b/UD_any/tokenizer/machine.rm
index 093a9bb89b4256010ae42ba0293bc50039c3cd90..5ff7eb25676c4a17bebc430e926b743e638835e1 100644
--- a/UD_any/tokenizer/machine.rm
+++ b/UD_any/tokenizer/machine.rm
@@ -1,7 +1,8 @@
 Name : Tokenizer Machine
-Classifier : tokenizer LSTM(-1,{-3,-2,-1},{},{FORM},{-1,0},{},{FORM},{10},5,5) data/tokenizer.ts
+Classifier : tokenizer LSTM(-1,{-3,-2,-1},{},{FORM},{-1,0},{},{ID,FORM},{1,10},5,5) data/tokenizer.ts
+Splitwords : data/splitwords.ts
 Predictions : ID FORM EOS
 Strategy : sequential
 	tokenizer tokenizer ENDWORD 1
-	tokenizer tokenizer SPLITWORD 1
+	tokenizer tokenizer SPLIT 1
 	tokenizer tokenizer 0