Commit 42de4d6d authored by Franck Dary

Produce pretrained GloVe embeddings for multiple columns and for letters

parent 70f6b20a
Showing 86 additions and 27 deletions
@@ -48,7 +48,9 @@ texts:
./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
pretrain:
-./pretrainEmbeddings.sh $(TRAIN_FILES) 64 pretrained.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 )
+for col in FORM UPOS FEATS DEPREL LETTERS ; do \
+	./pretrainEmbeddings.sh $(TRAIN_FILES) $$col 64 $$col.w2v 2> pretrain_log.err || ( cat pretrain_log.err && exit 1 ) ; \
+done
$(FPLM_FILENAME): all_no_test.conllu
$(SCRIPTS)/conllu2fplm.py $< > $@
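
For reference, the new rule runs one GloVe pretraining pass per column instead of the single FORM-only pass. A minimal sketch of what the recipe amounts to once make expands it, assuming a single training file train.conllu as an illustrative stand-in for $(TRAIN_FILES):

# Sketch of the expanded `pretrain' recipe: one GloVe run per input column.
# train.conllu is an illustrative stand-in for $(TRAIN_FILES).
for col in FORM UPOS FEATS DEPREL LETTERS; do
    ./pretrainEmbeddings.sh train.conllu "$col" 64 "$col.w2v" 2> pretrain_log.err \
        || { cat pretrain_log.err; exit 1; }
done
# Produces FORM.w2v, UPOS.w2v, FEATS.w2v, DEPREL.w2v and LETTERS.w2v,
# each holding 64-dimensional vectors for that column's vocabulary.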
pretrainEmbeddings.sh
#! /usr/bin/env bash
GLOVE="../../../../GloVe/"
+HORIZONTAL="../../../../scripts/conllu2horizontal.py"
if [ "$#" -ne 3 ]; then
echo "USAGE : $0 input.conllu embeddingsSize output.w2v"
if [ "$#" -ne 4 ]; then
echo "USAGE : $0 input.conllu colName embeddingsSize output.w2v"
exit 1
fi
+MINCOUNT=2
+if [ $2 == "LETTERS" ]; then
+	MINCOUNT=10
+fi
CURDIR="$(pwd)"
cd $GLOVE && make && cd $CURDIR \
&& udpipe --output=horizontal none $1 > in.text \
&& $GLOVE"build/vocab_count" -min-count 2 < in.text > vocab.txt \
&& $HORIZONTAL $1 $2 > in.text \
&& $GLOVE"build/vocab_count" -min-count $MINCOUNT < in.text > vocab.txt \
&& $GLOVE"build/cooccur" -symmetric 0 -window-size 10 -vocab-file vocab.txt -memory 8.0 -overflow-file tempoverflow < in.text > cooccurrences.bin \
&& $GLOVE"build/shuffle" -memory 8.0 -seed 100 < cooccurrences.bin > cooccurrence.shuf.bin \
&& $GLOVE"build/glove" -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size $2 -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
&& mv out.txt $3
&& $GLOVE"build/glove" -iter 50 -save_gradsq 0 -write-header 1 -input-file cooccurrence.shuf.bin -vocab-file vocab.txt -save-file out -gradsq-file gradsq -vector-size $3 -seed 100 -threads 1 -alpha 0.75 -x-max 100.0 -eta 0.05 -binary 0 -model 1 \
&& mv out.txt $4
rm in.text 2> /dev/null
rm vocab.txt 2> /dev/null
rm cooccurrences.bin 2> /dev/null
rm cooccurrence.shuf.bin 2> /dev/null
rm overflow_*\.bin 2> /dev/null
rm gradsq.txt 2> /dev/null
exit 0
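
Two details worth noting in the updated script: MINCOUNT is raised from 2 to 10 for LETTERS (presumably because character tokens are so frequent that characters seen fewer than 10 times are noise), and -write-header 1 makes GloVe prepend a word2vec-style "vocabSize dimension" line to its text output. That header allows a cheap sanity check that each produced file carries the 64 dimensions requested by the Makefile; a minimal sketch, with illustrative file names:

# Read the word2vec-style header line of each embeddings file.
for f in FORM.w2v UPOS.w2v FEATS.w2v DEPREL.w2v LETTERS.w2v; do
    read -r vocab dim < "$f"
    echo "$f: $vocab entries, $dim dimensions"   # dim should be 64
done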
@@ -4,7 +4,7 @@ Classifier : parser
Transitions : {parser,data/parser_eager_rel_strict.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS FEATS EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Context : Targets{s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
@@ -4,9 +4,9 @@ Classifier : tagger
Transitions : {tagger,data/tagger.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
Context : Targets{b.-2 b.-1 b.0 b.1 b.2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
-Context : Targets{b.-3 b.-2 b.-1} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+Context : Targets{b.-3 b.-2 b.-1} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{data/UPOS.w2v}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
InputDropout : 0.3
@@ -4,7 +4,7 @@ Classifier : taggerparser
Transitions : {tagger,data/tagger.ts parser,data/parser_eager_rel_strict.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Context : Targets{b.-2 b.-1 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Context : Targets{s.0 s.1 s.2 s.0.0 b.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
@@ -4,7 +4,7 @@ Classifier : tagger
Transitions : {tagger,data/tagger.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Context : Targets{b.-2 b.-1 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Context : Targets{s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
@@ -20,7 +20,7 @@ Classifier : parser
Transitions : {parser,data/parser_eager_rel_strict.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2}
Context : Targets{b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{EOS ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Context : Targets{b.-2 b.-1 b.0 s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Context : Targets{s.0 s.1 s.2 s.0.0 s.0.-1 s.1.0 s.1.-1 s.0.1 s.0.-2 s.1.1 s.1.-2} Columns{DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
@@ -4,7 +4,7 @@ Classifier : tokenizer
Transitions : {tokenizer,data/tokenizer.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-2 b.-1 b.0 b.1 b.2}
Context : Targets{b.-2 b.-1 b.0} Columns{ID} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -4,7 +4,7 @@ Classifier : tokeparser
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -4,7 +4,7 @@ Classifier : tokeparser
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -4,7 +4,7 @@ Classifier : tokelemmatizer
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -21,7 +21,7 @@ Classifier : parser
Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
InputDropout : 0.3
@@ -4,7 +4,7 @@ Classifier : tokeparser
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -4,7 +4,7 @@ Classifier : tokeparser
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -4,7 +4,7 @@ Classifier : tokelemmatizer
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -21,7 +21,7 @@ Classifier : parser
Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 0} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
InputDropout : 0.3
@@ -4,7 +4,7 @@ Classifier : tokeparser
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -4,7 +4,7 @@ Classifier : tokeparser
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{128} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -5,7 +5,7 @@ Classifier : tokeparser
LossMultiplier : {}
Network type : Modular
StateName : Out{256}
-Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{FORM} Transformer{1 1 0 1} In{64} Out{256} w2v{data/pretrained.w2v}
+Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{FORM} Transformer{1 1 0 1} In{64} Out{256} w2v{data/FORM.w2v}
Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} Transformer{1 1 0 1} In{64} Out{256} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -4,7 +4,7 @@ Classifier : tokelemmatizer
Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
@@ -21,7 +21,7 @@ Classifier : parser
Transitions : {parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
LossMultiplier : {}
Network type : Modular
-Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/pretrained.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
InputDropout : 0.3
scripts/conllu2horizontal.py (new file)
#! /usr/bin/env python3

import sys

from readMCD import readMCD

def printUsageAndExit() :
    print("USAGE : %s file.conllu (columnName | LETTERS)"%sys.argv[0], file=sys.stderr)
    sys.exit(1)

if __name__ == "__main__" :
    if len(sys.argv) != 3 :
        printUsageAndExit()

    # Default column layout; overridden by any '# global.columns =' header.
    col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL")
    col = sys.argv[2]

    if col != "LETTERS" :
        # Column mode: one sentence per output line, one token per column value.
        for line in open(sys.argv[1], "r") :
            if line.startswith("#") :
                splited = line.split("global.columns =")
                if len(splited) > 1 :
                    col2index, index2col = readMCD(splited[-1].strip())
                continue
            if len(line.strip()) == 0 :
                print("")
                continue
            splited = line.strip().split("\t")
            if col not in col2index :
                print("ERROR : invalid columnName '%s'"%col, file=sys.stderr)
                sys.exit(1)
            index = col2index[col]
            if index not in range(len(splited)) :
                print("ERROR : column %s not found in line '%s'"%(index, line.strip()), file=sys.stderr)
                sys.exit(1)
            print(splited[index].replace(" ", ""), end=" ")
    else :
        # LETTERS mode: print the characters of each '# text =' comment,
        # space-separated, one sentence per line.
        for line in open(sys.argv[1], "r") :
            if line.startswith("#") :
                splited = line.split("global.columns =")
                if len(splited) > 1 :
                    col2index, index2col = readMCD(splited[-1].strip())
                splited = line.split("text =")
                if len(splited) > 1 :
                    text = splited[-1].replace("\n", " ").replace(" ", "")
                    print(" ".join(list(text)))