From 1078b0cee54cb0174d3ff38150bcf970dd1f4e49 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Sun, 19 Apr 2020 21:01:45 +0200 Subject: [PATCH] updated tokeparser machines --- UD_any/tokeparser_base/machine.rm | 46 +++++++++++++++++++++++++++++++ UD_any/tokeparser_incr/machine.rm | 34 +++++++++++++---------- UD_any/tokeparser_seq/machine.rm | 36 +++++++++++++----------- 3 files changed, 85 insertions(+), 31 deletions(-) create mode 100644 UD_any/tokeparser_base/machine.rm diff --git a/UD_any/tokeparser_base/machine.rm b/UD_any/tokeparser_base/machine.rm new file mode 100644 index 0000000..5c4a1d0 --- /dev/null +++ b/UD_any/tokeparser_base/machine.rm @@ -0,0 +1,46 @@ +Name : Tokenizer, Tagger, Morpho and Parser Machine +Classifier : tokeparser +{ + Transitions : {data/tokenizer.ts data/tagger.ts data/morpho_parts.ts data/parser.ts} + Network type : LSTM + Unknown value threshold : 1 + Buffer context : {-3 -2 -1} + Stack context : {2 1 0} + Columns : {FORM UPOS} + Focused buffer : {-1 0} + Focused stack : {2 1 0} + Focused columns : {ID EOS FORM FEATS DEPREL} + Max nb elements : {1 1 10 10 1} + Raw input left window : 5 + Raw input right window : 5 + Embeddings size : 128 + MLP : {2048 0.3} + Context LSTM size : 512 + Focused LSTM size : 256 + Rawinput LSTM size : 32 + Split trans LSTM size : 256 + Num layers : 3 + BiLSTM : true + LSTM dropout : 0.1 + Total input dropout : 0.3 + Embeddings dropout : 0.3 + Dropout 2d : false + Tree embedding columns : {DEPREL} + Tree embedding buffer : {-1} + Tree embedding stack : {0 1 2} + Tree embedding nb : {6} + Tree embedding size : 128 + Optimizer : Adam {0.0005 0.9 0.999 0.00000001 0.00001 true} +} +Splitwords : data/splitwords.ts +Predictions : ID FORM UPOS FEATS HEAD DEPREL EOS +Strategy : sequential + tokenizer tagger ENDWORD 1 + tokenizer tagger SPLIT 1 + tokenizer tagger 0 + tagger morpho 1 + morpho parser NOTHING 1 + morpho parser 0 + parser tokenizer SHIFT 1 + parser tokenizer RIGHT 1 + parser tokenizer 0 diff --git a/UD_any/tokeparser_incr/machine.rm b/UD_any/tokeparser_incr/machine.rm index abe911d..5ec3600 100644 --- a/UD_any/tokeparser_incr/machine.rm +++ b/UD_any/tokeparser_incr/machine.rm @@ -1,32 +1,36 @@ -Name : Tokenizer, Tagger and Morpho Machine -Classifier : tokemorpho +Name : Tokenizer, Tagger, Morpho and Parser Machine +Classifier : tokeparser { Transitions : {data/tokenizer.ts data/tagger.ts data/morpho_parts.ts data/parser.ts} Network type : LSTM - Unknown value threshold : -1 - Buffer context : {-5 -4 -3 -2 -1} - Stack context : {3 2 1 0} + Unknown value threshold : 1 + Buffer context : {-3 -2 -1} + Stack context : {2 1 0} Columns : {FORM UPOS} Focused buffer : {-1 0} - Focused stack : {1 0} - Focused columns : {ID EOS FORM FEATS} - Max nb elements : {1 1 10 10} + Focused stack : {2 1 0} + Focused columns : {ID EOS FORM FEATS DEPREL} + Max nb elements : {1 1 10 10 1} Raw input left window : 5 Raw input right window : 5 - Embeddings size : 256 - MLP : {2048 0.3 2048 0.3} + Embeddings size : 128 + MLP : {2048 0.3} Context LSTM size : 512 Focused LSTM size : 256 - Rawinput LSTM size : 64 + Rawinput LSTM size : 32 Split trans LSTM size : 256 Num layers : 3 BiLSTM : true - LSTM dropout : 0.3 - Tree embedding columns : {FORM POS DEPREL} + LSTM dropout : 0.1 + Total input dropout : 0.3 + Embeddings dropout : 0.3 + Dropout 2d : false + Tree embedding columns : {DEPREL} Tree embedding buffer : {-1} - Tree embedding stack : {0 1} - Tree embedding nb : {6 10} + Tree embedding stack : {0 1 2} + Tree embedding nb : {6} Tree embedding size : 128 + Optimizer : Adam {0.0005 0.9 0.999 0.00000001 0.00001 true} } Splitwords : data/splitwords.ts Predictions : ID FORM UPOS FEATS HEAD DEPREL EOS diff --git a/UD_any/tokeparser_seq/machine.rm b/UD_any/tokeparser_seq/machine.rm index e2b2ab0..77947cd 100644 --- a/UD_any/tokeparser_seq/machine.rm +++ b/UD_any/tokeparser_seq/machine.rm @@ -1,32 +1,36 @@ -Name : Tokenizer, Tagger and Morpho Machine -Classifier : tokemorpho +Name : Tokenizer, Tagger, Morpho and Parser Machine +Classifier : tokeparser { Transitions : {data/tokenizer.ts data/tagger.ts data/morpho_parts.ts data/parser.ts} Network type : LSTM - Unknown value threshold : -1 - Buffer context : {-5 -4 -3 -2 -1 1 2} - Stack context : {3 2 1 0} + Unknown value threshold : 1 + Buffer context : {-3 -2 -1 1 2 3} + Stack context : {2 1 0} Columns : {FORM UPOS} - Focused buffer : {-1 0} - Focused stack : {1 0} - Focused columns : {ID EOS FORM FEATS} - Max nb elements : {1 1 10 10} + Focused buffer : {-1 0 1 2} + Focused stack : {2 1 0} + Focused columns : {ID EOS FORM FEATS DEPREL} + Max nb elements : {1 1 10 10 1} Raw input left window : 5 Raw input right window : 5 - Embeddings size : 256 - MLP : {2048 0.3 2048 0.3} + Embeddings size : 128 + MLP : {2048 0.3} Context LSTM size : 512 Focused LSTM size : 256 - Rawinput LSTM size : 64 + Rawinput LSTM size : 32 Split trans LSTM size : 256 Num layers : 3 BiLSTM : true - LSTM dropout : 0.3 - Tree embedding columns : {FORM POS DEPREL} + LSTM dropout : 0.1 + Total input dropout : 0.3 + Embeddings dropout : 0.3 + Dropout 2d : false + Tree embedding columns : {DEPREL} Tree embedding buffer : {-1} - Tree embedding stack : {0 1} - Tree embedding nb : {6 10} + Tree embedding stack : {0 1 2} + Tree embedding nb : {6} Tree embedding size : 128 + Optimizer : Adam {0.0005 0.9 0.999 0.00000001 0.00001 true} } Splitwords : data/splitwords.ts Predictions : ID FORM UPOS FEATS HEAD DEPREL EOS -- GitLab