diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 0db18227704ad7136b5e4094ce187af518747821..29a25b103b0d8ca9dbb482ec039c7fcf8c754b07 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -22,9 +22,9 @@ all_no_test.conllu:
 	cat $(TRAIN_FILES) > $@
 
 tokenizer.ts: all_no_test.conllu $(MCD)
-	echo "IGNORECHAR" > $@
+	echo "ENDWORD" > $@
+	echo "IGNORECHAR" >> $@
 	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
-	echo "ENDWORD" >> $@
 	echo "ADDCHARTOWORD" >> $@
 	sed -i -e 's/^/<tokenizer> /' $@
 
@@ -40,7 +40,7 @@ columns: all_no_test.conllu $(MCD)
 	cat tagger.ts parser.ts > taggerparser.ts
 
 texts:
-	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILEs) $(TEST_FILES)
+	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
 $(FPLM_FILENAME): all_no_test.conllu $(MCD)
 	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
diff --git a/UD_any/tokenizer/machine.rm b/UD_any/tokenizer/machine.rm
index 49d69acc3d9542c7039f5ea423c738ac4536fd4a..165e3c1f1c3e3659e4352b1f4a310ae818a7a16e 100644
--- a/UD_any/tokenizer/machine.rm
+++ b/UD_any/tokenizer/machine.rm
@@ -1,6 +1,6 @@
 Name : Tokenizer Machine
 Classifier : tokenizer CNN(4,0,0,{FORM},{-1,0},{},{FORM},{10}) data/tokenizer.ts
-Predictions : FORM
+Predictions : ID FORM EOS
 Strategy : sequential
 	tokenizer tokenizer ENDWORD 1
 	tokenizer tokenizer SPLITWORD 1