diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile index 0216349fa428f09def44962c68c0e23c6a6508ec..0db18227704ad7136b5e4094ce187af518747821 100644 --- a/UD_any/data/Makefile +++ b/UD_any/data/Makefile @@ -26,6 +26,7 @@ tokenizer.ts: all_no_test.conllu $(MCD) $(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt echo "ENDWORD" >> $@ echo "ADDCHARTOWORD" >> $@ + sed -i -e 's/^/<tokenizer> /' $@ segmenter.ts: echo "EOS b.0" > $@ diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py index 7a98d7cf0b37b91639085db646c29dd2b41be5fc..ca01d68dfc8b1f47a9d6326bd5c5da916089210b 100755 --- a/UD_any/data/getTransitionSets.py +++ b/UD_any/data/getTransitionSets.py @@ -49,7 +49,7 @@ if __name__ == "__main__" : striped = line.strip() if len(striped) == 0 : continue - print("WRITE b.0 FEATS " + striped, file=output) + print("<morpho> WRITE b.0 FEATS " + striped, file=output) output.close() output = open("morpho_parts.ts", 'w', encoding='utf-8') allParts = set() @@ -65,8 +65,8 @@ if __name__ == "__main__" : allPartsList.append(part) allPartsList.sort() for part in allPartsList : - print("ADD b.0 FEATS " + part, file=output) - print("NOTHING", file=output) + print("<morpho> ADD b.0 FEATS " + part, file=output) + print("<morpho> NOTHING", file=output) output.close() elif nameCol == "DEPREL" : diff --git a/UD_any/tokenizer/machine.rm b/UD_any/tokenizer/machine.rm new file mode 100644 index 0000000000000000000000000000000000000000..49d69acc3d9542c7039f5ea423c738ac4536fd4a --- /dev/null +++ b/UD_any/tokenizer/machine.rm @@ -0,0 +1,7 @@ +Name : Tokenizer Machine +Classifier : tokenizer CNN(4,0,0,{FORM},{-1,0},{},{FORM},{10}) data/tokenizer.ts +Predictions : FORM +Strategy : sequential + tokenizer tokenizer ENDWORD 1 + tokenizer tokenizer SPLITWORD 1 + tokenizer tokenizer 0