diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py index 6d06e7b78a346caca5cf0dce725affecfa2ef582..473ec73599c3df02dd608302d7f613a1d64b2830 100755 --- a/UD_any/data/getTransitionSets.py +++ b/UD_any/data/getTransitionSets.py @@ -97,7 +97,7 @@ if __name__ == "__main__" : striped = line.strip() if len(striped) == 0 or striped == "root" or striped == "_" : continue - label = striped.split(':')[0] + label = striped if label not in labels : labels.add(striped) labelsList.append(striped) diff --git a/UD_any/tokeparser_incr/machine.rm b/UD_any/tokeparser_incr/machine.rm new file mode 100644 index 0000000000000000000000000000000000000000..e4edaf3387f42ab1071d5bc74d2b0f9fc8fb6fcb --- /dev/null +++ b/UD_any/tokeparser_incr/machine.rm @@ -0,0 +1,42 @@ +Name : Tokenizer, Tagger and Morpho Machine +Classifier : tokemorpho +{ + Transitions : {data/tokenizer.ts data/tagger.ts data/morpho_parts.ts data/parser.ts} + Network type : LSTM + Unknown value threshold : -1 + Buffer context : {-5 -4 -3 -2 -1} + Stack context : {} + Columns : {FORM UPOS} + Focused buffer : {-1 0} + Focused stack : {} + Focused columns : {ID FORM FEATS} + Max nb elements : {1 10 10} + Raw input left window : 5 + Raw input right window : 5 + Embeddings size : 256 + MLP : {2048 0.3 2048 0.3} + Context LSTM size : 512 + Focused LSTM size : 256 + Rawinput LSTM size : 64 + Split trans LSTM size : 256 + Num layers : 3 + BiLSTM : true + LSTM dropout : 0.3 + Tree embedding columns : {DEPREL} + Tree embedding buffer : {-1} + Tree embedding stack : {0} + Tree embedding nb : {5 10} + Tree embedding size : 128 +} +Splitwords : data/splitwords.ts +Predictions : ID FORM UPOS FEATS HEAD DEPREL EOS +Strategy : incremental + tokenizer tagger ENDWORD 0 + tokenizer tagger SPLIT 0 + tokenizer tokenizer 0 + tagger morpho 0 + morpho parser NOTHING 0 + morpho morpho 0 + parser tokenizer SHIFT 1 + parser tokenizer RIGHT 1 + parser parser 0