# Train one GloVe embedding table per column of the training corpus (FORM,
# UPOS, FEATS, DEPREL and the LETTERS pseudo-column). Each run is asked to
# write <column>.w2v with 64-dimensional vectors.
#
# stderr of every run is captured in pretrain_log.err and only shown when the
# run fails. The failure branch is a brace group, not a subshell: `exit 1`
# inside ( ... ) would only leave the subshell, the loop would carry on, and
# the recipe would report the status of the last column only.
pretrain:
	for col in FORM UPOS FEATS DEPREL LETTERS ; do \
		./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 64 $$col.w2v 2> pretrain_log.err || { cat pretrain_log.err ; exit 1 ; } ; \
	done
#! /usr/bin/env bash

# Train GloVe embeddings for one column of a CoNLL-U file.
#
# USAGE : pretrainEmbeddings.sh input.conllu colName embeddingsSize output.w2v
#   input.conllu    file handed to conllu2horizontal.py
#   colName         column to extract, or LETTERS
#   embeddingsSize  forwarded to glove as -vector-size
#   output.w2v      where the trained vectors (out.txt) are moved
#
# Exit status: 1 on a usage error, otherwise the status of the training
# pipeline, so callers can detect a failed run.
# Intermediate files are created in, and removed from, the current directory.

GLOVE="../../../../GloVe/"
HORIZONTAL="../../../../scripts/conllu2horizontal.py"

if [ "$#" -ne 4 ]; then
    echo "USAGE : $0 input.conllu colName embeddingsSize output.w2v"
    exit 1
fi

# Value given to vocab_count -min-count; LETTERS uses a higher threshold.
MINCOUNT=2
if [ "$2" == "LETTERS" ]; then
    MINCOUNT=10
fi

# GloVe is built in a subshell so this script never leaves its working
# directory, even when the build fails (the cleanup below relies on that).
( cd "$GLOVE" && make ) \
&& "$HORIZONTAL" "$1" "$2" > in.text \
&& "${GLOVE}build/vocab_count" -min-count "$MINCOUNT" < in.text > vocab.txt \
&& "${GLOVE}build/cooccur" -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < in.text > cooccurrences.bin \
&& "${GLOVE}build/shuffle" -memory 8.0 -seed 100 < cooccurrences.bin > cooccurrence.shuf.bin \
&& "${GLOVE}build/glove" -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size "$3" -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
&& mv out.txt "$4"
# Remember the pipeline result before the cleanup commands overwrite $?.
STATUS=$?

# Best-effort cleanup; missing files are not an error.
# NOTE(review): cooccur is run with -overflow-file tempoverflow, so its
# temporary files presumably carry the tempoverflow_ prefix; both patterns are
# removed to stay compatible with the previous cleanup. Confirm against GloVe.
rm -f in.text vocab.txt cooccurrences.bin cooccurrence.shuf.bin gradsq.txt 2> /dev/null
rm -f overflow_*.bin tempoverflow_*.bin 2> /dev/null

exit $STATUS
{tagger,data/tagger.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2} + Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2} Context : Targets{b.-2 b.-1 b.0 b.1 b.2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{} - Context : Targets{b.-3 b.-2 b.-1} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{} + Context : Targets{b.-3 b.-2 b.-1} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{data/UPOS.w2v} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} InputDropout : 0.3 diff --git a/UD_any/templates/taggerparser_incr_one/machine.rm b/UD_any/templates/taggerparser_incr_one/machine.rm index 682e736350db6ff2ad4f5fdd97b57a5303dc1e6f..54888fcf8d65c46ddd8a80bfa99404aa443ece8e 100644 --- a/UD_any/templates/taggerparser_incr_one/machine.rm +++ b/UD_any/templates/taggerparser_incr_one/machine.rm @@ -4,7 +4,7 @@ Classifier : taggerparser Transitions : {tagger,data/tagger.ts parser,data/parser_eager_rel_strict.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} + Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{} Context : Targets{b.-2 b.-1 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{} Context : 
Targets{s.0 s.1 s.2 s.0.0 b.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} diff --git a/UD_any/templates/taggerparser_incr_two/machine.rm b/UD_any/templates/taggerparser_incr_two/machine.rm index e69d67d096ed18da74e0cc159db0b679d63e597f..9d2b181920267dae6d5bbbbde5a7c4c80981ac0f 100644 --- a/UD_any/templates/taggerparser_incr_two/machine.rm +++ b/UD_any/templates/taggerparser_incr_two/machine.rm @@ -4,7 +4,7 @@ Classifier : tagger Transitions : {tagger,data/tagger.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} + Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{} Context : Targets{b.-2 b.-1 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{} Context : Targets{s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} @@ -20,7 +20,7 @@ Classifier : parser Transitions : {parser,data/parser_eager_rel_strict.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} + Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 
s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{} Context : Targets{b.-2 b.-1 b.0 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{} Context : Targets{s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} diff --git a/UD_any/templates/tokenizer/machine.rm b/UD_any/templates/tokenizer/machine.rm index 8b1305ccfa79bf85a47cc0f6388e33e30335779d..59ac28094f095638466d4a5c8f675098a6f286c9 100644 --- a/UD_any/templates/tokenizer/machine.rm +++ b/UD_any/templates/tokenizer/machine.rm @@ -4,7 +4,7 @@ Classifier : tokenizer Transitions : {tokenizer,data/tokenizer.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2} Context : Targets{b.-2 b.-1 b.0} Columns{ID} LSTM{1 1 0 1} In{64} Out{64} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git a/UD_any/templates/tokeparser_base/machine.rm b/UD_any/templates/tokeparser_base/machine.rm index 3de9ff2178b9a18cea97d312aa51ddf91e591200..3b2548c7040e10f00cd95d7ed8d94888c2186c1a 100644 --- a/UD_any/templates/tokeparser_base/machine.rm +++ b/UD_any/templates/tokeparser_base/machine.rm @@ -4,7 +4,7 @@ Classifier : tokeparser Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} 
w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git a/UD_any/templates/tokeparser_base_big/machine.rm b/UD_any/templates/tokeparser_base_big/machine.rm index d46cbf4b0a6887f24e9d3ff4dfea1fd282dac0ca..daa5c57afd92fc89d6884d12ee476a69608f1930 100644 --- a/UD_any/templates/tokeparser_base_big/machine.rm +++ b/UD_any/templates/tokeparser_base_big/machine.rm @@ -4,7 +4,7 @@ Classifier : tokeparser Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git 
a/UD_any/templates/tokeparser_base_two/machine.rm b/UD_any/templates/tokeparser_base_two/machine.rm index 7468c8800316b2b9fd3322b595cdf970b122345d..c8bb14506ee5578e40d0bdfc5480b506f5e1e266 100644 --- a/UD_any/templates/tokeparser_base_two/machine.rm +++ b/UD_any/templates/tokeparser_base_two/machine.rm @@ -4,7 +4,7 @@ Classifier : tokelemmatizer Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} @@ -21,7 +21,7 @@ Classifier : parser Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} History : NbElem{10} LSTM{1 1 0 1} In{32} 
Out{32} InputDropout : 0.3 diff --git a/UD_any/templates/tokeparser_incr/machine.rm b/UD_any/templates/tokeparser_incr/machine.rm index 051f6054454611c2c82172414ed552099025233c..e8e54248443704469543fae5fd5ab2f5c3fb7e68 100644 --- a/UD_any/templates/tokeparser_incr/machine.rm +++ b/UD_any/templates/tokeparser_incr/machine.rm @@ -4,7 +4,7 @@ Classifier : tokeparser Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git a/UD_any/templates/tokeparser_incr_big/machine.rm b/UD_any/templates/tokeparser_incr_big/machine.rm index 095831c61ce3e6c358a451a7be62e07a6beaf148..25ac3326790cabd3204c76c0ca45df7f33bd1839 100644 --- a/UD_any/templates/tokeparser_incr_big/machine.rm +++ b/UD_any/templates/tokeparser_incr_big/machine.rm @@ -4,7 +4,7 @@ Classifier : tokeparser Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - 
Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git a/UD_any/templates/tokeparser_incr_two/machine.rm b/UD_any/templates/tokeparser_incr_two/machine.rm index cfc2e58d5102b55dc5bfdfa911e5dcc8bcbf3075..a20f4ac605ebd1ad721ead955abdf4b129787f9a 100644 --- a/UD_any/templates/tokeparser_incr_two/machine.rm +++ b/UD_any/templates/tokeparser_incr_two/machine.rm @@ -4,7 +4,7 @@ Classifier : tokelemmatizer Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} @@ -21,7 +21,7 @@ 
Classifier : parser Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32} InputDropout : 0.3 diff --git a/UD_any/templates/tokeparser_seq/machine.rm b/UD_any/templates/tokeparser_seq/machine.rm index dd7708f67a70dbb743b6ade36a03a682d916ccbd..a8cc9b8608bcba6e924759fd8778496ffc92a2b5 100644 --- a/UD_any/templates/tokeparser_seq/machine.rm +++ b/UD_any/templates/tokeparser_seq/machine.rm @@ -4,7 +4,7 @@ Classifier : tokeparser Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} 
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git a/UD_any/templates/tokeparser_seq_big/machine.rm b/UD_any/templates/tokeparser_seq_big/machine.rm index 4740571b68fb8d636af9caae0b44c7360862dce4..118a6ceecdc6cdd55261f95ed7225dd8976ade62 100644 --- a/UD_any/templates/tokeparser_seq_big/machine.rm +++ b/UD_any/templates/tokeparser_seq_big/machine.rm @@ -4,7 +4,7 @@ Classifier : tokeparser Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git a/UD_any/templates/tokeparser_seq_transformer/machine.rm b/UD_any/templates/tokeparser_seq_transformer/machine.rm index 08c31005b34097e51ca9ec7036139972b96684f8..08c1996a4c264c09bdf8a19359d433885c18db79 100644 --- a/UD_any/templates/tokeparser_seq_transformer/machine.rm +++ b/UD_any/templates/tokeparser_seq_transformer/machine.rm @@ -5,7 +5,7 @@ Classifier : tokeparser LossMultiplier : {} Network type : Modular StateName : Out{256} - Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 
s.2.-1} Columns{FORM} Transformer{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} + Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{FORM} Transformer{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} Transformer{1 1 0 1} In{64} Out{256} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} diff --git a/UD_any/templates/tokeparser_seq_two/machine.rm b/UD_any/templates/tokeparser_seq_two/machine.rm index 3ab1eb2a2a35e6e99bc8224e46aa8ce73a204f9c..d338fd3c17d5b5a150e49410bcdf68941ef25d33 100644 --- a/UD_any/templates/tokeparser_seq_two/machine.rm +++ b/UD_any/templates/tokeparser_seq_two/machine.rm @@ -4,7 +4,7 @@ Classifier : tokelemmatizer Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts} LossMultiplier : {} Network type : Modular - Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} + Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{} Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} @@ -21,7 +21,7 @@ Classifier : parser Transitions : 
#! /usr/bin/env python3

"""Convert a CoNLL-U file to "horizontal" text, one sentence per line.

USAGE : conllu2horizontal.py file.conllu (columnName | LETTERS)

columnName : for every token line, print the value of that column followed by
             a space; an empty input line ends the current output line.
LETTERS    : for every '# text = ...' comment, print the sentence text as
             space-separated characters.

In both modes a space inside a value is written as "◌" so that it is not
mistaken for a separator. The result goes to stdout; diagnostics go to stderr.
"""

import sys
from readMCD import readMCD

def printUsageAndExit() :
  """Print the command-line synopsis on stderr and exit with status 1."""
  print("USAGE : %s file.conllu (columnName | LETTERS)"%sys.argv[0], file=sys.stderr)
  sys.exit(1)

def printErrorAndExit(message) :
  """Report a fatal error on stderr and exit with status 1.

  stdout carries the converted text (callers redirect it to a file), so error
  messages must not be written there.
  """
  print(message, file=sys.stderr)
  sys.exit(1)

if __name__ == "__main__" :
  if len(sys.argv) != 3 :
    printUsageAndExit()

  # Column layout assumed until a '# global.columns =' comment overrides it.
  # NOTE(review): readMCD is project code; it is used here as returning a
  # (name -> index, index -> name) pair of mappings — confirm.
  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL")
  col = sys.argv[2]
  lettersMode = col == "LETTERS"

  # CoNLL-U files are UTF-8; do not depend on the locale's default encoding.
  with open(sys.argv[1], "r", encoding="utf8") as conllu :
    for line in conllu :
      if line.startswith("#") :
        splited = line.split("global.columns =")
        if len(splited) > 1 :
          col2index, index2col = readMCD(splited[-1].strip())

        if lettersMode :
          splited = line.split("text =")
          if len(splited) > 1 :
            # The trailing newline becomes a space, then every space becomes "◌".
            text = splited[-1].replace("\n", " ").replace(" ", "◌")
            print(" ".join(list(text)))
        continue

      # In LETTERS mode only the comments matter; token lines are skipped.
      if lettersMode :
        continue

      # A blank line separates sentences: terminate the current output line.
      if len(line.strip()) == 0 :
        print("")
        continue

      splited = line.strip().split("\t")

      # Checked per line because the column layout can change mid-file.
      if col not in col2index :
        printErrorAndExit("ERROR : invalid columnName '%s'"%col)
      index = col2index[col]
      if index not in range(len(splited)) :
        printErrorAndExit("ERROR : column %s not found in line '%s'"%(index, line.strip()))

      print(splited[index].replace(" ", "◌"), end=" ")