From 42de4d6d043cd0d47cd69b2e72fcd8c2e58c9c75 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Tue, 28 Jul 2020 22:13:40 +0200
Subject: [PATCH] Produce pretrained glove embeddings for multiple columns and
 for letters

---
 UD_any/data/Makefile                          |  4 +-
 UD_any/data/pretrainEmbeddings.sh             | 19 ++++---
 UD_any/templates/parser/machine.rm            |  2 +-
 UD_any/templates/tagger/machine.rm            |  4 +-
 .../taggerparser_incr_one/machine.rm          |  2 +-
 .../taggerparser_incr_two/machine.rm          |  4 +-
 UD_any/templates/tokenizer/machine.rm         |  2 +-
 UD_any/templates/tokeparser_base/machine.rm   |  2 +-
 .../templates/tokeparser_base_big/machine.rm  |  2 +-
 .../templates/tokeparser_base_two/machine.rm  |  4 +-
 UD_any/templates/tokeparser_incr/machine.rm   |  2 +-
 .../templates/tokeparser_incr_big/machine.rm  |  2 +-
 .../templates/tokeparser_incr_two/machine.rm  |  4 +-
 UD_any/templates/tokeparser_seq/machine.rm    |  2 +-
 .../templates/tokeparser_seq_big/machine.rm   |  2 +-
 .../tokeparser_seq_transformer/machine.rm     |  2 +-
 .../templates/tokeparser_seq_two/machine.rm   |  4 +-
 scripts/conllu2horizontal.py                  | 50 +++++++++++++++++++
 18 files changed, 86 insertions(+), 27 deletions(-)
 create mode 100755 scripts/conllu2horizontal.py

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index d5082f5..4f12bef 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -48,7 +48,9 @@ texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
 
 pretrain:
-	./pretrainEmbeddings.sh $(TRAIN_FILES) 64 pretrained.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 )
+	for col in FORM UPOS FEATS DEPREL LETTERS ; do \
+		./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 64 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \
+	done
 
 $(FPLM_FILENAME): all_no_test.conllu
 	$(SCRIPTS)/conllu2fplm.py $< > $@
diff --git a/UD_any/data/pretrainEmbeddings.sh b/UD_any/data/pretrainEmbeddings.sh
index dc84f55..cfdc6ad 100755
--- a/UD_any/data/pretrainEmbeddings.sh
+++ b/UD_any/data/pretrainEmbeddings.sh
@@ -1,25 +1,32 @@
 #! /usr/bin/env bash
 
 GLOVE="../../../../GloVe/"
+HORIZONTAL="../../../../scripts/conllu2horizontal.py"
 
-if [ "$#" -ne 3 ]; then
-  echo "USAGE : $0 input.conllu embeddingsSize output.w2v"
+if [ "$#" -ne 4 ]; then
+  echo "USAGE : $0 input.conllu colName embeddingsSize output.w2v"
   exit 1
 fi
 
+MINCOUNT=2
+if [ $2 == "LETTERS" ]; then
+  MINCOUNT=10
+fi
+
 CURDIR="$(pwd)"
 
 cd $GLOVE && make && cd $CURDIR \
-&& udpipe --output=horizontal none $1 > in.text \
-&& $GLOVE"build/vocab_count" -min-count 2 < in.text > vocab.txt \
+&& $HORIZONTAL $1 $2 > in.text \
+&& $GLOVE"build/vocab_count" -min-count $MINCOUNT < in.text > vocab.txt \
 && $GLOVE"build/cooccur" -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < in.text > cooccurrences.bin \
 && $GLOVE"build/shuffle" -memory 8.0 -seed 100 < cooccurrences.bin > cooccurrence.shuf.bin \
-&& $GLOVE"build/glove" -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size $2 -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
-&& mv out.txt $3
+&& $GLOVE"build/glove" -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size $3 -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
+&& mv out.txt $4
 
 rm in.text 2> /dev/null
 rm vocab.txt 2> /dev/null
 rm cooccurrences.bin 2> /dev/null
 rm cooccurrence.shuf.bin 2> /dev/null
 rm overflow_*\.bin 2> /dev/null
+rm gradsq.txt 2> /dev/null
 
 exit 0
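
For reference, the per-column pretraining above can also be run by hand. A minimal sketch, assuming a GloVe checkout and scripts/conllu2horizontal.py at the relative paths hard-coded in pretrainEmbeddings.sh; the corpus and output names are illustrative only:

  # Same calls the Makefile loop now makes for FORM UPOS FEATS DEPREL LETTERS,
  # shown here for two of the columns (train.conllu is a placeholder corpus).
  ./pretrainEmbeddings.sh train.conllu FORM 64 FORM.w2v 2> pretrain_log.err
  ./pretrainEmbeddings.sh train.conllu LETTERS 64 LETTERS.w2v 2> pretrain_log.err
  # Each run horizontalizes the requested column, trains GloVe on it and, since
  # -write-header 1 is passed, should save a text model whose first line is
  # "<vocabSize> <embeddingsSize>".
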
diff --git a/UD_any/templates/parser/machine.rm b/UD_any/templates/parser/machine.rm
index cf0d5de..a8ac0ac 100644
--- a/UD_any/templates/parser/machine.rm
+++ b/UD_any/templates/parser/machine.rm
@@ -4,7 +4,7 @@ Classifier : parser
  Transitions : {parser,data/parser_eager_rel_strict.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
  Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS FEATS EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Context : Targets{s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
diff --git a/UD_any/templates/tagger/machine.rm b/UD_any/templates/tagger/machine.rm
index 730dc86..026d380 100644
--- a/UD_any/templates/tagger/machine.rm
+++ b/UD_any/templates/tagger/machine.rm
@@ -4,9 +4,9 @@ Classifier : tagger
  Transitions : {tagger,data/tagger.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
  Context : Targets{b.-2 b.-1 b.0 b.1 b.2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
- Context : Targets{b.-3 b.-2 b.-1} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+ Context : Targets{b.-3 b.-2 b.-1} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{data/UPOS.w2v}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  InputDropout : 0.3
diff --git a/UD_any/templates/taggerparser_incr_one/machine.rm b/UD_any/templates/taggerparser_incr_one/machine.rm
index 682e736..54888fc 100644
--- a/UD_any/templates/taggerparser_incr_one/machine.rm
+++ b/UD_any/templates/taggerparser_incr_one/machine.rm
@@ -4,7 +4,7 @@ Classifier : taggerparser
  Transitions : {tagger,data/tagger.ts parser,data/parser_eager_rel_strict.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
  Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Context : Targets{b.-2 b.-1 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Context : Targets{s.0 s.1 s.2 s.0.0 b.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
diff --git a/UD_any/templates/taggerparser_incr_two/machine.rm b/UD_any/templates/taggerparser_incr_two/machine.rm
index e69d67d..9d2b181 100644
--- a/UD_any/templates/taggerparser_incr_two/machine.rm
+++ b/UD_any/templates/taggerparser_incr_two/machine.rm
@@ -4,7 +4,7 @@ Classifier : tagger
  Transitions : {tagger,data/tagger.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
  Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Context : Targets{b.-2 b.-1 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Context : Targets{s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
@@ -20,7 +20,7 @@ Classifier : parser
  Transitions : {parser,data/parser_eager_rel_strict.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
  Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Context : Targets{b.-2 b.-1 b.0 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Context : Targets{s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
diff --git a/UD_any/templates/tokenizer/machine.rm b/UD_any/templates/tokenizer/machine.rm
index 8b1305c..59ac280 100644
--- a/UD_any/templates/tokenizer/machine.rm
+++ b/UD_any/templates/tokenizer/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokenizer
  Transitions : {tokenizer,data/tokenizer.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
  Context : Targets{b.-2 b.-1 b.0} Columns{ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_base/machine.rm b/UD_any/templates/tokeparser_base/machine.rm
index 3de9ff2..3b2548c 100644
--- a/UD_any/templates/tokeparser_base/machine.rm
+++ b/UD_any/templates/tokeparser_base/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokeparser
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_base_big/machine.rm b/UD_any/templates/tokeparser_base_big/machine.rm
index d46cbf4..daa5c57 100644
--- a/UD_any/templates/tokeparser_base_big/machine.rm
+++ b/UD_any/templates/tokeparser_base_big/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokeparser
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_base_two/machine.rm b/UD_any/templates/tokeparser_base_two/machine.rm
index 7468c88..c8bb145 100644
--- a/UD_any/templates/tokeparser_base_two/machine.rm
+++ b/UD_any/templates/tokeparser_base_two/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokelemmatizer
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -21,7 +21,7 @@ Classifier : parser
  Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
  InputDropout : 0.3
diff --git a/UD_any/templates/tokeparser_incr/machine.rm b/UD_any/templates/tokeparser_incr/machine.rm
index 051f605..e8e5424 100644
--- a/UD_any/templates/tokeparser_incr/machine.rm
+++ b/UD_any/templates/tokeparser_incr/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokeparser
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_incr_big/machine.rm b/UD_any/templates/tokeparser_incr_big/machine.rm
index 095831c..25ac332 100644
--- a/UD_any/templates/tokeparser_incr_big/machine.rm
+++ b/UD_any/templates/tokeparser_incr_big/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokeparser
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_incr_two/machine.rm b/UD_any/templates/tokeparser_incr_two/machine.rm
index cfc2e58..a20f4ac 100644
--- a/UD_any/templates/tokeparser_incr_two/machine.rm
+++ b/UD_any/templates/tokeparser_incr_two/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokelemmatizer
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -21,7 +21,7 @@ Classifier : parser
  Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
  InputDropout : 0.3
diff --git a/UD_any/templates/tokeparser_seq/machine.rm b/UD_any/templates/tokeparser_seq/machine.rm
index dd7708f..a8cc9b8 100644
--- a/UD_any/templates/tokeparser_seq/machine.rm
+++ b/UD_any/templates/tokeparser_seq/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokeparser
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_seq_big/machine.rm b/UD_any/templates/tokeparser_seq_big/machine.rm
index 4740571..118a6ce 100644
--- a/UD_any/templates/tokeparser_seq_big/machine.rm
+++ b/UD_any/templates/tokeparser_seq_big/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokeparser
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_seq_transformer/machine.rm b/UD_any/templates/tokeparser_seq_transformer/machine.rm
index 08c3100..08c1996 100644
--- a/UD_any/templates/tokeparser_seq_transformer/machine.rm
+++ b/UD_any/templates/tokeparser_seq_transformer/machine.rm
@@ -5,7 +5,7 @@ Classifier : tokeparser
  LossMultiplier : {}
  Network type : Modular
  StateName : Out{256}
- Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{FORM} Transformer{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v}
+ Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{FORM} Transformer{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v}
  Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} Transformer{1 1 0 1} In{64} Out{256} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
diff --git a/UD_any/templates/tokeparser_seq_two/machine.rm b/UD_any/templates/tokeparser_seq_two/machine.rm
index 3ab1eb2..d338fd3 100644
--- a/UD_any/templates/tokeparser_seq_two/machine.rm
+++ b/UD_any/templates/tokeparser_seq_two/machine.rm
@@ -4,7 +4,7 @@ Classifier : tokelemmatizer
  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -21,7 +21,7 @@ Classifier : parser
  Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
  LossMultiplier : {}
  Network type : Modular
- Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+ Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
  Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
  History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
  InputDropout : 0.3
diff --git a/scripts/conllu2horizontal.py b/scripts/conllu2horizontal.py
new file mode 100755
index 0000000..b0997c0
--- /dev/null
+++ b/scripts/conllu2horizontal.py
@@ -0,0 +1,50 @@
+#! /usr/bin/env python3
+
+import sys
+from readMCD import readMCD
+
+def printUsageAndExit() :
+  print("USAGE : %s file.conllu (columnName | LETTERS)"%sys.argv[0], file=sys.stderr)
+  sys.exit(1)
+
+if __name__ == "__main__" :
+  if len(sys.argv) != 3 :
+    printUsageAndExit()
+
+  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL")
+  col = sys.argv[2]
+
+  if col != "LETTERS" :
+    for line in open(sys.argv[1], "r") :
+      if line.startswith("#") :
+        splited = line.split("global.columns =")
+        if len(splited) > 1 :
+          col2index, index2col = readMCD(splited[-1].strip())
+        continue
+
+      if len(line.strip()) == 0 :
+        print("")
+        continue
+
+      splited = line.strip().split("\t")
+
+      if col not in col2index :
+        print("ERROR : invalid columnName '%s'"%col, file=sys.stderr)
+        exit(1)
+      index = col2index[col]
+      if index not in range(len(splited)) :
+        print("ERROR : column %s not found in line '%s'"%(index, line.strip()), file=sys.stderr)
+        exit(1)
+
+      print(splited[index].replace(" ", "◌"), end=" ")
+  else :
+    for line in open(sys.argv[1], "r") :
+      if line.startswith("#") :
+        splited = line.split("global.columns =")
+        if len(splited) > 1 :
+          col2index, index2col = readMCD(splited[-1].strip())
+        splited = line.split("text =")
+        if len(splited) > 1 :
+          text = splited[-1].replace("\n", " ").replace(" ", "◌")
+          print(" ".join(list(text)))
+
--
GitLab
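
For reference, a minimal sketch of the output scripts/conllu2horizontal.py is expected to produce, assuming readMCD.py is importable from the same directory; mini.conllu and its three tokens are made up for illustration:

  # mini.conllu (illustrative, columns are tab-separated):
  #   # text = Le chat dort
  #   1   Le    le      DET   ...
  #   2   chat  chat    NOUN  ...
  #   3   dort  dormir  VERB  ...
  #
  # Column mode: one value of the requested column per token, one sentence per line.
  scripts/conllu2horizontal.py mini.conllu UPOS
  # -> DET NOUN VERB
  #
  # LETTERS mode: the "# text =" comment split into characters, spaces shown as ◌.
  scripts/conllu2horizontal.py mini.conllu LETTERS
  # -> ◌ L e ◌ c h a t ◌ d o r t ◌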