From 30d8eecddc1aff9a0ded9c1941f79699c61dd134 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Mon, 7 Dec 2020 09:01:37 +0100
Subject: [PATCH] Updated doc

---
 README.md                       |  1 +
 documentation/classifier.md     | 31 +++++++++++++---------
 documentation/gettingStarted.md |  2 +-
 documentation/install.md        |  2 +-
 documentation/readingMachine.md | 47 ++++++++++++++++-----------------
 5 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index 2a5bad1..3782f55 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ It has been built to perform any combination of the folloing annotations :
 * Tokenization
 * POS tagging
 * Feats tagging
+* Lemmatization
 * Dependency parsing
 * Sentence segmentation
 
diff --git a/documentation/classifier.md b/documentation/classifier.md
index 4fed932..fe9847a 100644
--- a/documentation/classifier.md
+++ b/documentation/classifier.md
@@ -12,20 +12,25 @@ It's definition is made of three parts :
 
 	Example :
 	```
-	Classifier : tagparser
+	Classifier : tokeparser
 	{
-	  Transitions : {tagger,data/tagger.ts morpho,data/morpho_parts.ts parser,data/parser.ts segmenter,data/segmenter.ts}
-	  LossMultiplier : {segmenter,10.0}
-	  Network type : Modular
-	```
-* In the second part we must define the feature function and architecture of the neural network, see below for complete overview of Modular network type. This part must be ended with the line 'End'. Example :
-	```
-  StateName : Out{64}
-  Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{FORM} LSTM{1 1 0 1} In{64} Out{64}
-  Focused : Column{FEATS} NbElem{13} Buffer{-1 0} Stack{2 1 0} LSTM{1 1 0 1} In{64} Out{64}
-  InputDropout : 0.5
-  MLP : {2048 0.3 2048 0.3}
-  End
+	  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
+	  LossMultiplier : {}
+	  Network type : Modular
+	  Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{FORM,data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+	  Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+	  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+	  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+	  RawInput : Left{5} Right{10} LSTM{1 1 0.0 1} In{32} Out{32}
+	  History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
+	  SplitTrans : LSTM{1 1 0.0 1} In{64} Out{64}
+	  InputDropout : 0.3
+	  MLP : {1600 0.3 1600 0.3}
+	  End
+	  Optimizer : Adagrad {0.01 0.000001 0 0.0000000001}
+	  Type : classification
+	  Loss : crossentropy
+	}
 	```
 
 * In the third part, we must define the hyperparameters of the optimizer algorithm. Currently available optimizers are : 
diff --git a/documentation/gettingStarted.md b/documentation/gettingStarted.md
index 97c3325..1ea565a 100644
--- a/documentation/gettingStarted.md
+++ b/documentation/gettingStarted.md
@@ -25,7 +25,7 @@ Simply edit the file `macaon_data/UD_any/config` so that `UD_ROOT=` points to th
 * `data` : will be copied inside every model directory, is in charge of generating [Transition Sets](transitionSet.md).
 * `prepareExperiment.sh` : script that create a directory in `bin` for your model, allowing it to be trained.
 * `train.sh` : train a model that has been prepared by `prepareExperiment.sh`.
-* `evaluate.sh` : evaluate a model that has been trained bu `train.sh`.
+* `evaluate.sh` : evaluate a model that has been trained by `train.sh`.
 * `batches.py` : a file that you can use to define multiple experiments. To be used as an argument to `launchBatches.py`.
 * `launchBatches.py` : script that allows you to run multiple experiments at the same time. Can be used to launch *oar* or *slurm* jobs.
 * `templates/*` : contains a [Reading Machine](readingMachine.md) file that you can train using `train.sh`.
diff --git a/documentation/install.md b/documentation/install.md
index 38ca0bc..730c345 100644
--- a/documentation/install.md
+++ b/documentation/install.md
@@ -4,7 +4,7 @@
 * GNU/Linux OS
 * CMake >= 3.16.4
 * C++20 compiler such as g++ >= 9.2
-* LibTorch version 1.5 cxx11 ABI : [link](https://pytorch.org/get-started/locally/)
+* LibTorch version >= 1.5 cxx11 ABI : [link](https://pytorch.org/get-started/locally/)
 * Boost >= 1.53.0 with program_options : [link](https://www.boost.org/doc/libs/1_73_0/more/getting_started/unix-variants.html)
 
 ## Download :
diff --git a/documentation/readingMachine.md b/documentation/readingMachine.md
index 5286e7e..ca737b1 100644
--- a/documentation/readingMachine.md
+++ b/documentation/readingMachine.md
@@ -22,31 +22,28 @@ A reading machine is defined in a `.rm` file (or given as argument to `macaon tr
-Here is an example of a Reading Machine doing tokenization, POS tagging, Morphological tagging, dependency parsing and sentence segmentation in a sequential fashion :
+Here is an example of a Reading Machine doing tokenization, POS tagging, Morphological tagging, lemmatization, dependency parsing and sentence segmentation in a sequential fashion :
 
 ```
-Name : Tokenizer, Tagger, Morpho and Parser Machine
+Name : Tokenizer, Tagger, Morpho, Lemmatizer, Parser and Segmenter Machine
 Classifier : tokeparser
 {
-  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_parts.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
-  LossMultiplier : {segmenter,3.0}
-  Network type : Modular
-  StateName : Out{1024}
-  Context : Buffer{-3 -2 -1 1 2} Stack{} Columns{FORM} LSTM{1 1 0 1} In{64} Out{64}
-  Context : Buffer{-3 -2 -1 0 1 2} Stack{1 0} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64}
-  Focused : Column{ID} NbElem{1} Buffer{-1 0 1} Stack{2 1 0} LSTM{1 1 0 1} In{64} Out{64}
-  Focused : Column{FORM} NbElem{13} Buffer{-1 0 1 2} Stack{2 1 0} LSTM{1 1 0 1} In{64} Out{64}
-  Focused : Column{FEATS} NbElem{13} Buffer{-1 0 1 2} Stack{2 1 0} LSTM{1 1 0 1} In{64} Out{64}
-  Focused : Column{EOS} NbElem{1} Buffer{-1 0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
-  Focused : Column{DEPREL} NbElem{1} Buffer{} Stack{2 1 0} LSTM{1 1 0 1} In{64} Out{64}
-  DepthLayerTree : Columns{DEPREL} Buffer{} Stack{2 1 0} LayerSizes{3} LSTM{1 1 0.0 1} In{64} Out{64}
-  History : NbElem{10} LSTM{1 1 0 1} In{64} Out{64}
-	RawInput : Left{5} Right{5} LSTM{1 1 0.0 1} In{32} Out{32}
+  Transitions : {tokenizer,data/tokenizer.ts tagger,data/tagger.ts morpho,data/morpho_whole.ts lemmatizer_rules,data/lemmatizer_rules.ts lemmatizer_case,data/lemmatizer_case.ts parser,data/parser_eager_rel_strict.ts segmenter,data/segmenter.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  Contextual : Window{-10 10} Columns{FORM} LSTM{1 1 0 1} In{64} Out{128} w2v{FORM,data/FORM.w2v} Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1}
+  Context : Targets{b.-3 b.-2 b.-1 b.0 b.1 b.2 s.0 s.1 s.2 b.0.0 s.0.0 s.0.-1 s.1.0 s.1.-1 s.2.0 s.2.-1} Columns{EOS ID UPOS FEATS DEPREL} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+  Focused : Column{prefix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+  Focused : Column{suffix3:FORM} NbElem{3} Buffer{0} Stack{} LSTM{1 1 0 1} In{64} Out{64} w2v{}
+  RawInput : Left{5} Right{10} LSTM{1 1 0.0 1} In{32} Out{32}
+  History : NbElem{10} LSTM{1 1 0 1} In{32} Out{32}
 	SplitTrans : LSTM{1 1 0.0 1} In{64} Out{64}
-  InputDropout : 0.5
-  MLP : {2048 0.3 2048 0.3}
-  End
-  Optimizer : Adam {0.0003 0.9 0.999 0.00000001 0.00002 true}
+  InputDropout : 0.3
+  MLP : {1600 0.3 1600 0.3}
+  End
+  Optimizer : Adagrad {0.01 0.000001 0 0.0000000001}
+  Type : classification
+  Loss : crossentropy
 }
 Splitwords : data/splitwords.ts
-Predictions : ID FORM UPOS FEATS HEAD DEPREL EOS
+Predictions : ID FORM UPOS FEATS LEMMA HEAD DEPREL EOS
 Strategy
 {
 	Block : End{cannotMove}
@@ -56,8 +53,10 @@ Strategy
 	Block : End{cannotMove}
 	tagger tagger * 1
 	Block : End{cannotMove}
-	morpho morpho NOTHING 1
-	morpho morpho * 0
+	morpho morpho * 1
+	Block : End{cannotMove}
+	lemmatizer_rules lemmatizer_case * 0
+	lemmatizer_case lemmatizer_rules * 1
 	Block : End{cannotMove}
 	parser segmenter eager_SHIFT 0
 	parser segmenter eager_RIGHT_rel 0
@@ -68,9 +67,9 @@ Strategy
 
 This format is composed of several parts :
 * Name : The name of your machine.
-* Classifier : The name of your classifier, followed by its definition between braces. See [Classifier](classifier.md).
+* Classifier : The name of your classifier, followed by its definition between braces. There can be as many classifiers as you want. In the above example, we only define one. See [Classifier](classifier.md).
 * Splitwords : [Transition Set](transitionSet.md) file that contains transitions for multi-words tokenization.\
-It is only mandatory if the machine performs tokeization. This file is automatically generated by `train.sh`.
+It is only mandatory if the machine performs tokenization. This file is automatically generated by `train.sh`.
 * Predictions : Names of the columns that are predicted by your machine.
 * Strategy, followed by its definition between braces. See [Strategy](strategy.md).
 
-- 
GitLab