From dc820bb342927cf5aaa0707dbd9127aa07df6127 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Fri, 10 Apr 2020 21:57:35 +0200 Subject: [PATCH] added tokeparser_incr --- UD_any/data/getTransitionSets.py | 2 +- UD_any/tokeparser_incr/machine.rm | 42 +++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 UD_any/tokeparser_incr/machine.rm diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py index 6d06e7b..473ec73 100755 --- a/UD_any/data/getTransitionSets.py +++ b/UD_any/data/getTransitionSets.py @@ -97,7 +97,7 @@ if __name__ == "__main__" : striped = line.strip() if len(striped) == 0 or striped == "root" or striped == "_" : continue - label = striped.split(':')[0] + label = striped if label not in labels : labels.add(striped) labelsList.append(striped) diff --git a/UD_any/tokeparser_incr/machine.rm b/UD_any/tokeparser_incr/machine.rm new file mode 100644 index 0000000..e4edaf3 --- /dev/null +++ b/UD_any/tokeparser_incr/machine.rm @@ -0,0 +1,42 @@ +Name : Tokenizer, Tagger and Morpho Machine +Classifier : tokemorpho +{ + Transitions : {data/tokenizer.ts data/tagger.ts data/morpho_parts.ts data/parser.ts} + Network type : LSTM + Unknown value threshold : -1 + Buffer context : {-5 -4 -3 -2 -1} + Stack context : {} + Columns : {FORM UPOS} + Focused buffer : {-1 0} + Focused stack : {} + Focused columns : {ID FORM FEATS} + Max nb elements : {1 10 10} + Raw input left window : 5 + Raw input right window : 5 + Embeddings size : 256 + MLP : {2048 0.3 2048 0.3} + Context LSTM size : 512 + Focused LSTM size : 256 + Rawinput LSTM size : 64 + Split trans LSTM size : 256 + Num layers : 3 + BiLSTM : true + LSTM dropout : 0.3 + Tree embedding columns : {DEPREL} + Tree embedding buffer : {-1} + Tree embedding stack : {0} + Tree embedding nb : {5 10} + Tree embedding size : 128 +} +Splitwords : data/splitwords.ts +Predictions : ID FORM UPOS FEATS HEAD DEPREL EOS +Strategy : incremental + tokenizer tagger ENDWORD 0 + tokenizer tagger SPLIT 0 + tokenizer tokenizer 0 + tagger morpho 0 + morpho parser NOTHING 0 + morpho morpho 0 + parser tokenizer SHIFT 1 + parser tokenizer RIGHT 1 + parser parser 0 -- GitLab