Skip to content
Snippets Groups Projects
Commit 4ed5c4e0 authored by Franck Dary
Browse files

Added tokenizer machine

parent a32c777f
No related branches found
No related tags found
No related merge requests found
......@@ -26,6 +26,7 @@ tokenizer.ts: all_no_test.conllu $(MCD)
$(SCRIPTS)/conllu2splits.py $< $(MCD) >> $@ 2> ambiguities.txt
echo "ENDWORD" >> $@
echo "ADDCHARTOWORD" >> $@
sed -i -e 's/^/<tokenizer> /' $@
segmenter.ts:
echo "EOS b.0" > $@
......
......@@ -49,7 +49,7 @@ if __name__ == "__main__" :
striped = line.strip()
if len(striped) == 0 :
continue
print("WRITE b.0 FEATS " + striped, file=output)
print("<morpho> WRITE b.0 FEATS " + striped, file=output)
output.close()
output = open("morpho_parts.ts", 'w', encoding='utf-8')
allParts = set()
......@@ -65,8 +65,8 @@ if __name__ == "__main__" :
allPartsList.append(part)
allPartsList.sort()
for part in allPartsList :
print("ADD b.0 FEATS " + part, file=output)
print("NOTHING", file=output)
print("<morpho> ADD b.0 FEATS " + part, file=output)
print("<morpho> NOTHING", file=output)
output.close()
elif nameCol == "DEPREL" :
......
Name : Tokenizer Machine
Classifier : tokenizer CNN(4,0,0,{FORM},{-1,0},{},{FORM},{10}) data/tokenizer.ts
Predictions : FORM
Strategy : sequential
tokenizer tokenizer ENDWORD 1
tokenizer tokenizer SPLITWORD 1
tokenizer tokenizer 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment