From 4ed5c4e09911ab426e725c3e044eddc3b4534ab9 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 5 Mar 2020 16:20:31 +0100
Subject: [PATCH] Add tokenizer machine

---
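Note (outside the commit message): the sed line added to the tokenizer.ts rule
prefixes every generated transition with the <tokenizer> state name, in the same
way getTransitionSets.py now tags morphological transitions with <morpho>. As a
minimal illustration of the sed effect only, using the two transitions echoed in
the same Makefile rule (the additional lines emitted by conllu2splits.py depend
on the treebank and are not shown here):

    $ printf 'ENDWORD\nADDCHARTOWORD\n' | sed -e 's/^/<tokenizer> /'
    <tokenizer> ENDWORD
    <tokenizer> ADDCHARTOWORD
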
 UD_any/data/Makefile             | 1 +
 UD_any/data/getTransitionSets.py | 6 +++---
 UD_any/tokenizer/machine.rm      | 7 +++++++
 3 files changed, 11 insertions(+), 3 deletions(-)
 create mode 100644 UD_any/tokenizer/machine.rm

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 0216349..0db1822 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -26,6 +26,7 @@ tokenizer.ts: all_no_test.conllu $(MCD)
 	$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
 	echo "ENDWORD" >> $@
 	echo "ADDCHARTOWORD" >> $@
+	sed -i -e 's/^/<tokenizer> /' $@
 
 segmenter.ts:
 	echo "EOS b.0" > $@
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index 7a98d7c..ca01d68 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -49,7 +49,7 @@ if __name__ == "__main__" :
         striped = line.strip()
         if len(striped) == 0 :
           continue
-        print("WRITE b.0 FEATS " + striped, file=output)
+        print("<morpho> WRITE b.0 FEATS " + striped, file=output)
       output.close()
       output = open("morpho_parts.ts", 'w', encoding='utf-8')
       allParts = set()
@@ -65,8 +65,8 @@ if __name__ == "__main__" :
         allPartsList.append(part)
       allPartsList.sort()
       for part in allPartsList :
-        print("ADD b.0 FEATS " + part, file=output)
-      print("NOTHING", file=output)
+        print("<morpho> ADD b.0 FEATS " + part, file=output)
+      print("<morpho> NOTHING", file=output)
       output.close()
 
     elif nameCol == "DEPREL" :
diff --git a/UD_any/tokenizer/machine.rm b/UD_any/tokenizer/machine.rm
new file mode 100644
index 0000000..49d69ac
--- /dev/null
+++ b/UD_any/tokenizer/machine.rm
@@ -0,0 +1,7 @@
+Name : Tokenizer Machine
+Classifier : tokenizer CNN(4,0,0,{FORM},{-1,0},{},{FORM},{10}) data/tokenizer.ts
+Predictions : FORM
+Strategy : sequential
+  tokenizer tokenizer ENDWORD 1
+  tokenizer tokenizer SPLITWORD 1
+  tokenizer tokenizer 0
-- 
GitLab