From 10107218cb0733f3a04d6a5c7589dce8a84e90aa Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Thu, 14 Nov 2019 21:01:39 +0100 Subject: [PATCH] Created first version of tokeparser_incremental --- .../data/feature_models/morpho_incremental.fm | 28 +++++++++ .../data/feature_models/tagger_incremental.fm | 25 ++++++++ .../feature_models/tokenizer_incremental.fm | 42 +++++++++++++ .../lemmatizer_case.cla | 7 +++ .../lemmatizer_lookup.cla | 4 ++ .../lemmatizer_rules.cla | 7 +++ UD_any/tokeparser_incremental/machine.tm | 35 +++++++++++ UD_any/tokeparser_incremental/morpho.cla | 7 +++ UD_any/tokeparser_incremental/normal.tm | 31 ++++++++++ UD_any/tokeparser_incremental/parser.cla | 7 +++ UD_any/tokeparser_incremental/strategy.cla | 4 ++ UD_any/tokeparser_incremental/tagger.cla | 7 +++ UD_any/tokeparser_incremental/test.bd | 11 ++++ UD_any/tokeparser_incremental/tokenizer.cla | 7 +++ .../tokeparser_incremental/tokeparser.dicts | 62 +++++++++++++++++++ UD_any/tokeparser_incremental/train.bd | 11 ++++ 16 files changed, 295 insertions(+) create mode 100644 UD_any/data/feature_models/morpho_incremental.fm create mode 100644 UD_any/data/feature_models/tagger_incremental.fm create mode 100644 UD_any/data/feature_models/tokenizer_incremental.fm create mode 100644 UD_any/tokeparser_incremental/lemmatizer_case.cla create mode 100644 UD_any/tokeparser_incremental/lemmatizer_lookup.cla create mode 100644 UD_any/tokeparser_incremental/lemmatizer_rules.cla create mode 100644 UD_any/tokeparser_incremental/machine.tm create mode 100644 UD_any/tokeparser_incremental/morpho.cla create mode 100644 UD_any/tokeparser_incremental/normal.tm create mode 100644 UD_any/tokeparser_incremental/parser.cla create mode 100644 UD_any/tokeparser_incremental/strategy.cla create mode 100644 UD_any/tokeparser_incremental/tagger.cla create mode 100644 UD_any/tokeparser_incremental/test.bd create mode 100644 UD_any/tokeparser_incremental/tokenizer.cla create mode 100644 UD_any/tokeparser_incremental/tokeparser.dicts create mode 100644 UD_any/tokeparser_incremental/train.bd diff --git a/UD_any/data/feature_models/morpho_incremental.fm b/UD_any/data/feature_models/morpho_incremental.fm new file mode 100644 index 0000000..2086605 --- /dev/null +++ b/UD_any/data/feature_models/morpho_incremental.fm @@ -0,0 +1,28 @@ +# FORM +b.-2#FORM.fasttext +b.-1#FORM.fasttext +b.0#FORM.fasttext +# POS +b.-3#POS +b.-2#POS +b.-1#POS +b.0#POS +# MORPHO +b.-2#MORPHO +b.-1#MORPHO +b.0#MORPHO +# UPPERCASE +b.0#FORM.U +# LENGTH +b.0#FORM.LEN +# SUFFIXES +b.0#FORM.PART.-4.-4 +b.0#FORM.PART.-3.-3 +b.0#FORM.PART.-2.-2 +b.0#FORM.PART.-1.-1 +# PREFIXES +b.0#FORM.PART.0.0 +b.0#FORM.PART.1.1 +b.0#FORM.PART.2.2 +b.0#FORM.PART.3.3 + diff --git a/UD_any/data/feature_models/tagger_incremental.fm b/UD_any/data/feature_models/tagger_incremental.fm new file mode 100644 index 0000000..1bd749b --- /dev/null +++ b/UD_any/data/feature_models/tagger_incremental.fm @@ -0,0 +1,25 @@ +# FORM +b.-2#FORM.fasttext +b.-1#FORM.fasttext +b.0#FORM.fasttext +# POS +b.-3#POS +b.-2#POS +b.-1#POS +# UPPERCASE +b.0#FORM.U +# LENGTH +b.0#FORM.LEN +# SUFFIXES +b.0#FORM.PART.-4.-4 +b.0#FORM.PART.-3.-3 +b.0#FORM.PART.-2.-2 +b.0#FORM.PART.-1.-1 +# PREFIXES +b.0#FORM.PART.0.0 +b.0#FORM.PART.1.1 +b.0#FORM.PART.2.2 +b.0#FORM.PART.3.3 +# MORPHO +b.-2#MORPHO +b.-1#MORPHO diff --git a/UD_any/data/feature_models/tokenizer_incremental.fm b/UD_any/data/feature_models/tokenizer_incremental.fm new file mode 100644 index 0000000..d10a62e --- /dev/null +++ b/UD_any/data/feature_models/tokenizer_incremental.fm @@ -0,0 +1,42 @@ +# FORM +b.-2#FORM.fasttext +b.-1#FORM.fasttext +b.0#FORM.fasttext +# UPPERCASE +b.-1#FORM.U +# LENGTH +b.-1#FORM.LEN +b.0#FORM.LEN +# SUFFIXES +b.0#FORM.PART.-4.-4 +b.0#FORM.PART.-3.-3 +b.0#FORM.PART.-2.-2 +b.0#FORM.PART.-1.-1 +# PREFIXES +b.0#FORM.PART.0.0 +b.0#FORM.PART.1.1 +b.0#FORM.PART.2.2 +b.0#FORM.PART.3.3 +# RAW INPUT +raw.-5 +raw.-4 +raw.-3 +raw.-2 +raw.-1 +raw.0 +raw.2 +raw.3 +raw.4 +raw.5 +raw.6 +# POS +b.-2#POS +b.-1#POS +b.0#POS +# MORPHO +b.-2#MORPHO +b.-1#MORPHO +b.0#MORPHO +# EOS +b.-1#EOS +b.-2#EOS diff --git a/UD_any/tokeparser_incremental/lemmatizer_case.cla b/UD_any/tokeparser_incremental/lemmatizer_case.cla new file mode 100644 index 0000000..0994d40 --- /dev/null +++ b/UD_any/tokeparser_incremental/lemmatizer_case.cla @@ -0,0 +1,7 @@ +Name : Lemmatizer_Case +Type : Prediction +Oracle : lemma_case +Feature Model : data/feature_models/lemmatizer_rules.fm +Action Set : data/lemmatizer_case.as +Topology : (100,RELU,0.1) +Dynamic : yes diff --git a/UD_any/tokeparser_incremental/lemmatizer_lookup.cla b/UD_any/tokeparser_incremental/lemmatizer_lookup.cla new file mode 100644 index 0000000..41f63b2 --- /dev/null +++ b/UD_any/tokeparser_incremental/lemmatizer_lookup.cla @@ -0,0 +1,4 @@ +Name : Lemmatizer_Lookup +Type : Information +Oracle : lemma_lookup +Oracle Filename : data/maca_trans_lemmatizer_exceptions.fplm diff --git a/UD_any/tokeparser_incremental/lemmatizer_rules.cla b/UD_any/tokeparser_incremental/lemmatizer_rules.cla new file mode 100644 index 0000000..9b59b62 --- /dev/null +++ b/UD_any/tokeparser_incremental/lemmatizer_rules.cla @@ -0,0 +1,7 @@ +Name : Lemmatizer_Rules +Type : Prediction +Oracle : lemma_rules +Feature Model : data/feature_models/lemmatizer_rules.fm +Action Set : data/lemmatizer_rules.as +Topology : (500,RELU,0.3) +Dynamic : yes diff --git a/UD_any/tokeparser_incremental/machine.tm b/UD_any/tokeparser_incremental/machine.tm new file mode 100644 index 0000000..0e1b043 --- /dev/null +++ b/UD_any/tokeparser_incremental/machine.tm @@ -0,0 +1,35 @@ +Name : Tokenizer, Tagger, Morpho, Lemmatizer and Parser Machine +Dicts : tokeparser.dicts +%CLASSIFIERS +strategy strategy.cla +tokenizer tokenizer.cla +tagger tagger.cla +morpho morpho.cla +lemmatizer_lookup lemmatizer_lookup.cla +lemmatizer_rules lemmatizer_rules.cla +lemmatizer_case lemmatizer_case.cla +parser parser.cla +%STATES +strategy strategy +tokenizer tokenizer +tagger tagger +morpho morpho +lemmatizer_lookup lemmatizer_lookup +lemmatizer_rules lemmatizer_rules +lemmatizer_case lemmatizer_case +parser parser +%TRANSITIONS +strategy tokenizer MOVE tokenizer +strategy tagger MOVE tagger +strategy morpho MOVE morpho +strategy lemmatizer_lookup MOVE lemmatizer_lookup +strategy lemmatizer_rules MOVE lemmatizer_rules +strategy lemmatizer_case MOVE lemmatizer_case +strategy parser MOVE parser +tokenizer strategy * +tagger strategy * +morpho strategy * +lemmatizer_lookup strategy * +lemmatizer_case strategy * +lemmatizer_rules strategy * +parser strategy * diff --git a/UD_any/tokeparser_incremental/morpho.cla b/UD_any/tokeparser_incremental/morpho.cla new file mode 100644 index 0000000..63340a0 --- /dev/null +++ b/UD_any/tokeparser_incremental/morpho.cla @@ -0,0 +1,7 @@ +Name : Morpho +Type : Prediction +Oracle : morpho +Feature Model : data/feature_models/morpho_incremental.fm +Action Set : data/morpho_parts.as +Topology : (500,RELU,0.3) +Dynamic : yes diff --git a/UD_any/tokeparser_incremental/normal.tm b/UD_any/tokeparser_incremental/normal.tm new file mode 100644 index 0000000..77ebff3 --- /dev/null +++ b/UD_any/tokeparser_incremental/normal.tm @@ -0,0 +1,31 @@ +Name : Tagger, Morpho, Lemmatizer and Parser Machine +Dicts : tagparser.dicts +%CLASSIFIERS +strategy strategy.cla +tagger tagger.cla +morpho morpho.cla +lemmatizer_lookup lemmatizer_lookup.cla +lemmatizer_rules lemmatizer_rules.cla +lemmatizer_case lemmatizer_case.cla +parser parser.cla +%STATES +strategy strategy +tagger tagger +morpho morpho +lemmatizer_lookup lemmatizer_lookup +lemmatizer_rules lemmatizer_rules +lemmatizer_case lemmatizer_case +parser parser +%TRANSITIONS +strategy tagger MOVE tagger +strategy morpho MOVE morpho +strategy lemmatizer_lookup MOVE lemmatizer_lookup +strategy lemmatizer_rules MOVE lemmatizer_rules +strategy lemmatizer_case MOVE lemmatizer_case +strategy parser MOVE parser +tagger strategy * +morpho strategy * +lemmatizer_lookup strategy * +lemmatizer_case strategy * +lemmatizer_rules strategy * +parser strategy * diff --git a/UD_any/tokeparser_incremental/parser.cla b/UD_any/tokeparser_incremental/parser.cla new file mode 100644 index 0000000..77714be --- /dev/null +++ b/UD_any/tokeparser_incremental/parser.cla @@ -0,0 +1,7 @@ +Name : Parser +Type : Prediction +Oracle : parser +Feature Model : data/feature_models/parser_nofuture.fm +Action Set : data/parser.as +Topology : (500,RELU,0.3) +Dynamic : yes diff --git a/UD_any/tokeparser_incremental/strategy.cla b/UD_any/tokeparser_incremental/strategy.cla new file mode 100644 index 0000000..fcf66b5 --- /dev/null +++ b/UD_any/tokeparser_incremental/strategy.cla @@ -0,0 +1,4 @@ +Name : Strategy +Type : Information +Oracle : strategy_tokenizer,tagger,morpho,lemmatizer,parser +Oracle Filename : none diff --git a/UD_any/tokeparser_incremental/tagger.cla b/UD_any/tokeparser_incremental/tagger.cla new file mode 100644 index 0000000..9aa35ed --- /dev/null +++ b/UD_any/tokeparser_incremental/tagger.cla @@ -0,0 +1,7 @@ +Name : Tagger +Type : Prediction +Oracle : tagger +Feature Model : data/feature_models/tagger_incremental.fm +Action Set : data/tagger.as +Topology : (500,RELU,0.3) +Dynamic : yes diff --git a/UD_any/tokeparser_incremental/test.bd b/UD_any/tokeparser_incremental/test.bd new file mode 100644 index 0000000..b2ef996 --- /dev/null +++ b/UD_any/tokeparser_incremental/test.bd @@ -0,0 +1,11 @@ +#Index Name ref/hyp dict Policy Must print?# +################################################### +0 ID hyp none Final 1 +1 FORM hyp form Final 1 +3 POS hyp pos Final 1 +4 XPOS hyp pos Final 1 +5 MORPHO hyp morpho Final 1 +2 LEMMA hyp form Final 1 +6 GOV hyp int Final 1 +7 LABEL hyp labels Final 1 +0 EOS hyp eos Final 0 diff --git a/UD_any/tokeparser_incremental/tokenizer.cla b/UD_any/tokeparser_incremental/tokenizer.cla new file mode 100644 index 0000000..127a62e --- /dev/null +++ b/UD_any/tokeparser_incremental/tokenizer.cla @@ -0,0 +1,7 @@ +Name : Tokenizer +Type : Prediction +Oracle : tokenizer +Feature Model : data/feature_models/tokenizer_incremental.fm +Action Set : data/tokenizer.as +Topology : (500,RELU,0.3) +Dynamic : no diff --git a/UD_any/tokeparser_incremental/tokeparser.dicts b/UD_any/tokeparser_incremental/tokeparser.dicts new file mode 100644 index 0000000..01ff2b9 --- /dev/null +++ b/UD_any/tokeparser_incremental/tokeparser.dicts @@ -0,0 +1,62 @@ +#Name Dimension Mode # +############################ +# TOKENIZER +Tokenizer_bool 02 Embeddings +Tokenizer_int 05 Embeddings +Tokenizer_letters 30 Embeddings +Tokenizer_form 30 Embeddings +Tokenizer_form.f 30 Embeddings +Tokenizer_actions 05 Embeddings +Tokenizer_entropy 05 Embeddings +Tokenizer_pos 18 Embeddings +Tokenizer_morpho 22 Embeddings +Tokenizer_eos 16 Embeddings +# TAGGER +Tagger_actions 18 Embeddings _ +Tagger_bool 16 Embeddings _ +Tagger_int 16 Embeddings _ +Tagger_eos 16 Embeddings _ +Tagger_gov 16 Embeddings _ +Tagger_pos 18 Embeddings _ +Tagger_form 30 Embeddings _ +Tagger_form.f 30 Embeddings _ +Tagger_lemma 30 Embeddings _ +Tagger_letters 30 Embeddings _ +Tagger_labels 18 Embeddings _ +Tagger_morpho 22 Embeddings _ +# MORPHO +Morpho_actions 18 Embeddings _ +Morpho_bool 16 Embeddings _ +Morpho_int 16 Embeddings _ +Morpho_eos 16 Embeddings _ +Morpho_gov 16 Embeddings _ +Morpho_pos 18 Embeddings _ +Morpho_form 30 Embeddings _ +Morpho_form.f 30 Embeddings _ +Morpho_lemma 30 Embeddings _ +Morpho_letters 30 Embeddings _ +Morpho_labels 18 Embeddings _ +Morpho_morpho 22 Embeddings _ +# LEMMATIZER +Lemmatizer_Rules_form 30 Embeddings +Lemmatizer_Rules_letters 10 Embeddings +Lemmatizer_Rules_pos 30 Embeddings +Lemmatizer_Rules_morpho 30 Embeddings +# LEMMATIZER +Lemmatizer_Case_form 30 Embeddings +Lemmatizer_Case_letters 10 Embeddings +Lemmatizer_Case_pos 30 Embeddings +Lemmatizer_Case_morpho 30 Embeddings +# PARSER +Parser_actions 18 Embeddings _ +Parser_bool 16 Embeddings _ +Parser_int 16 Embeddings _ +Parser_eos 16 Embeddings _ +Parser_gov 16 Embeddings _ +Parser_pos 18 Embeddings _ +Parser_form 30 Embeddings _ +Parser_form.f 30 Embeddings _ +Parser_lemma 30 Embeddings _ +Parser_letters 30 Embeddings _ +Parser_labels 18 Embeddings _ +Parser_morpho 22 Embeddings _ diff --git a/UD_any/tokeparser_incremental/train.bd b/UD_any/tokeparser_incremental/train.bd new file mode 100644 index 0000000..963c311 --- /dev/null +++ b/UD_any/tokeparser_incremental/train.bd @@ -0,0 +1,11 @@ +#Index Name ref/hyp dict Policy Must print?# +################################################### +0 ID hyp none FromZero 1 +1 FORM hyp form FromZero 1 +3 POS hyp pos FromZero 1 +4 XPOS hyp pos FromZero 1 +5 MORPHO hyp morpho FromZero 1 +2 LEMMA hyp form FromZero 1 +6 GOV hyp int FromZero 1 +7 LABEL hyp labels FromZero 1 +0 EOS hyp eos FromZero 0 -- GitLab