From 4ed5c4e09911ab426e725c3e044eddc3b4534ab9 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 5 Mar 2020 16:20:31 +0100
Subject: [PATCH] Added tokenizer machine

---
 UD_any/data/Makefile             | 1 +
 UD_any/data/getTransitionSets.py | 6 +++---
 UD_any/tokenizer/machine.rm      | 7 +++++++
 3 files changed, 11 insertions(+), 3 deletions(-)
 create mode 100644 UD_any/tokenizer/machine.rm

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 0216349..0db1822 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -26,6 +26,7 @@ tokenizer.ts: all_no_test.conllu $(MCD)
 	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
 	echo "ENDWORD" >> $@
 	echo "ADDCHARTOWORD" >> $@
+	sed -i -e 's/^/<tokenizer> /' $@
 
 segmenter.ts:
 	echo "EOS b.0" > $@
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index 7a98d7c..ca01d68 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -49,7 +49,7 @@ if __name__ == "__main__" :
       striped = line.strip()
       if len(striped) == 0 :
         continue
-      print("WRITE b.0 FEATS " + striped, file=output)
+      print("<morpho> WRITE b.0 FEATS " + striped, file=output)
     output.close()
   output = open("morpho_parts.ts", 'w', encoding='utf-8')
   allParts = set()
@@ -65,8 +65,8 @@ if __name__ == "__main__" :
       allPartsList.append(part)
     allPartsList.sort()
     for part in allPartsList :
-      print("ADD b.0 FEATS " + part, file=output)
-      print("NOTHING", file=output)
+      print("<morpho> ADD b.0 FEATS " + part, file=output)
+      print("<morpho> NOTHING", file=output)
     output.close()
 
 elif nameCol == "DEPREL" :
diff --git a/UD_any/tokenizer/machine.rm b/UD_any/tokenizer/machine.rm
new file mode 100644
index 0000000..49d69ac
--- /dev/null
+++ b/UD_any/tokenizer/machine.rm
@@ -0,0 +1,7 @@
+Name : Tokenizer Machine
+Classifier : tokenizer CNN(4,0,0,{FORM},{-1,0},{},{FORM},{10}) data/tokenizer.ts
+Predictions : FORM
+Strategy : sequential
+	tokenizer tokenizer ENDWORD 1
+	tokenizer tokenizer SPLITWORD 1
+	tokenizer tokenizer 0
-- 
GitLab