From b42fd532cdc57df0f0c4b9dba1f95c3b0a156927 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Tue, 11 Apr 2017 21:14:36 +0200
Subject: [PATCH] added makefiles to compile morphological analyzer for UD and
 fr

---
 UD/template/maca_trans_morpho/Makefile    | 20 +++++++++++
 fm/maca_trans_morpho.fm                   | 19 ++++++++++
 fm/maca_trans_morpho_fann.fm              | 27 ++++++++++++++
 fr/Makefile                               | 11 +++---
 fr/data/morpho-lexicon/fplm_add           | 28 +++++++++++++--
 fr/eval/Makefile                          |  2 +-
 fr/maca_trans_morpho/Makefile             | 20 +++++++++++
 fr/maca_trans_parser/Makefile             |  2 +-
 fr/maca_trans_parser/maca_trans_parser.fm |  6 ++++
 makefiles/eval_ud.makefile                | 18 ++++++++--
 makefiles/maca_trans_morpho.makefile      | 43 +++++++++++++++++++++++
 tools/eval_mcf.pl                         | 12 +++++--
 tools/ftb_lib.c                           |  2 +-
 13 files changed, 197 insertions(+), 13 deletions(-)
 create mode 100644 UD/template/maca_trans_morpho/Makefile
 create mode 100644 fm/maca_trans_morpho.fm
 create mode 100644 fm/maca_trans_morpho_fann.fm
 create mode 100644 fr/maca_trans_morpho/Makefile
 create mode 100644 makefiles/maca_trans_morpho.makefile

diff --git a/UD/template/maca_trans_morpho/Makefile b/UD/template/maca_trans_morpho/Makefile
new file mode 100644
index 0000000..71a25f6
--- /dev/null
+++ b/UD/template/maca_trans_morpho/Makefile
@@ -0,0 +1,20 @@
+MCF_TRAIN=../data/treebank/train.mcf
+MCF_DEV=../data/treebank/dev.mcf
+MCF_TEST=../data/treebank/test.mcf
+
+CFF_TRAIN=train.cff
+FANN_TRAIN=train.fann
+CFF_FANN_TRAIN=train.fann.cff
+CFF_CUTOFF_TRAIN=train.cutoff.cff
+PERCEPTRON_ITERATIONS=9
+CFF_CUTOFF=1
+FEATURES_MODEL_FILENAME=../../fm/maca_trans_morpho.fm
+FEATURES_MODEL_FANN_FILENAME=../../fm/maca_trans_morpho_fann.fm
+VOCABS_FILENAME=maca_trans_morpho.vocab
+VOCABS_FANN_FILENAME=maca_trans_morpho_fann.vocab
+MCD_FILENAME=../../mcd/wpmlgfs.mcd
+MODEL_FILENAME=maca_trans_morpho.model
+NUMBER_OF_SENTENCES=10000000
+
+include ../../makefiles/maca_trans_morpho.makefile
+
diff --git a/fm/maca_trans_morpho.fm b/fm/maca_trans_morpho.fm
new file mode 100644
index 0000000..9fd77ae
--- /dev/null
+++ b/fm/maca_trans_morpho.fm
@@ -0,0 +1,19 @@
+b0U1
+b0f
+b0len
+bm1f
+bm2f
+b0p
+bm1p
+bm2p
+bm3p
+bm2p bm1p
+bm2p bm3p
+b0s1
+b0s2
+b0s3
+b0s4
+b0s5
+b0s1 b0s2
+b0s1 b0s2 b0s3
+b0s1 b0s2 b0s3 b0s4
diff --git a/fm/maca_trans_morpho_fann.fm b/fm/maca_trans_morpho_fann.fm
new file mode 100644
index 0000000..5593ea8
--- /dev/null
+++ b/fm/maca_trans_morpho_fann.fm
@@ -0,0 +1,27 @@
+b0f
+b0len
+b0p
+b0s1
+b0s1
+b0s1
+b0s1
+b0s2
+b0s2
+b0s2
+b0s2
+b0s3
+b0s3
+b0s3
+b0s4
+b0s4
+b0s5
+b0U1
+bm1f
+bm1p
+bm1p
+bm2f
+bm2p
+bm2p
+bm2p
+bm3p
+bm3p
diff --git a/fr/Makefile b/fr/Makefile
index c5dc3b7..7dbf865 100644
--- a/fr/Makefile
+++ b/fr/Makefile
@@ -6,17 +6,19 @@ compile:
 	$(MAKE) -C data/morpho-lexicon compile
 	$(MAKE) -C data/treebank compile
 	$(MAKE) -C maca_lexer compile
-	$(MAKE) -C maca_trans_parser compile
 	$(MAKE) -C maca_trans_tagger compile
-	$(MAKE) -C maca_crf_tagger compile
+	$(MAKE) -C maca_trans_morpho compile
+	$(MAKE) -C maca_trans_parser compile
+#	$(MAKE) -C maca_crf_tagger compile
 
 install: 
 	-mkdir -p bin
-	$(MAKE) -C maca_trans_parser install
 	$(MAKE) -C maca_trans_tagger install
+	$(MAKE) -C maca_trans_morpho install
+	$(MAKE) -C maca_trans_parser install
 	$(MAKE) -C maca_lemmatizer install
 	$(MAKE) -C maca_lexer install
-	$(MAKE) -C maca_crf_tagger install
+#	$(MAKE) -C maca_crf_tagger install
 #	@tar -cvzf ./maca_datas.tgz bin
 
 evaluation:
@@ -27,6 +29,7 @@ clean:
 	$(MAKE) -C data/treebank clean
 	$(MAKE) -C maca_lexer clean
 	$(MAKE) -C maca_trans_parser clean
+	$(MAKE) -C maca_trans_morpho clean
 	$(MAKE) -C maca_trans_tagger clean
 	$(MAKE) -C maca_crf_tagger clean
 	$(MAKE) -C eval clean
diff --git a/fr/data/morpho-lexicon/fplm_add b/fr/data/morpho-lexicon/fplm_add
index d20dc46..b3e6260 100644
--- a/fr/data/morpho-lexicon/fplm_add
+++ b/fr/data/morpho-lexicon/fplm_add
@@ -4,5 +4,29 @@ de	det	un	#####
 aux	prep	à	#####
 au	prep	à	#####
 du	prep	de	#####
-M	titre	m	#####
-MM	titre	MM	#####
\ No newline at end of file
+
+M	titre	M	#####
+m	titre	M	#####
+Mr	titre	M	#####
+mr	titre	M	#####
+MM	titre	M	#####
+mm	titre	M	#####
+
+Mme	titre	M	#####
+mme	titre	M	#####
+Mmes	titre	M	#####
+mmes	titre	M	#####
+
+Mlle	titre	M	#####
+mlle	titre	M	#####
+Mlles	titre	M	#####
+mlles	titre	M	#####
+
+Dr	titre	docteur	#####
+Drs	titre	docteur	#####
+
+Pr	titre	professeur	#####
+Prs	titre	professeur	#####
+
+Mgr	titre	monseigneur	#####
+mgr	titre	monseigneur	#####
diff --git a/fr/eval/Makefile b/fr/eval/Makefile
index c4e9512..ddd7fe9 100644
--- a/fr/eval/Makefile
+++ b/fr/eval/Makefile
@@ -3,4 +3,4 @@ DEV=../data/treebank/dev.mcf
 LANGUAGE=fr
 MCD_FILE=wplgfs.mcd
 
-include ../../makefiles/eval.makefile
+include ../../makefiles/eval_ud.makefile
diff --git a/fr/maca_trans_morpho/Makefile b/fr/maca_trans_morpho/Makefile
new file mode 100644
index 0000000..71a25f6
--- /dev/null
+++ b/fr/maca_trans_morpho/Makefile
@@ -0,0 +1,20 @@
+MCF_TRAIN=../data/treebank/train.mcf
+MCF_DEV=../data/treebank/dev.mcf
+MCF_TEST=../data/treebank/test.mcf
+
+CFF_TRAIN=train.cff
+FANN_TRAIN=train.fann
+CFF_FANN_TRAIN=train.fann.cff
+CFF_CUTOFF_TRAIN=train.cutoff.cff
+PERCEPTRON_ITERATIONS=9
+CFF_CUTOFF=1
+FEATURES_MODEL_FILENAME=../../fm/maca_trans_morpho.fm
+FEATURES_MODEL_FANN_FILENAME=../../fm/maca_trans_morpho_fann.fm
+VOCABS_FILENAME=maca_trans_morpho.vocab
+VOCABS_FANN_FILENAME=maca_trans_morpho_fann.vocab
+MCD_FILENAME=../../mcd/wpmlgfs.mcd
+MODEL_FILENAME=maca_trans_morpho.model
+NUMBER_OF_SENTENCES=10000000
+
+include ../../makefiles/maca_trans_morpho.makefile
+
diff --git a/fr/maca_trans_parser/Makefile b/fr/maca_trans_parser/Makefile
index 4cf2c4a..e2e5f32 100644
--- a/fr/maca_trans_parser/Makefile
+++ b/fr/maca_trans_parser/Makefile
@@ -12,7 +12,7 @@ VOCABS_FILENAME=maca_trans_parser.vocab
 MODEL_FILENAME=maca_trans_parser.model 
 NUMBER_OF_SENTENCES=10000000
 #NUMBER_OF_SENTENCES=1000
-MCD_FILENAME=wplgfs.mcd
+MCD_FILENAME=../../mcd/wpmlgfs.mcd
 STREAM_MODE= -S
 
 include ../../makefiles/maca_trans_parser.makefile
diff --git a/fr/maca_trans_parser/maca_trans_parser.fm b/fr/maca_trans_parser/maca_trans_parser.fm
index aface90..04e6a4f 100644
--- a/fr/maca_trans_parser/maca_trans_parser.fm
+++ b/fr/maca_trans_parser/maca_trans_parser.fm
@@ -47,3 +47,9 @@ t2
 
 bm1p
 bm2p
+
+s0m b0m
+s0p s0m b0p b0m
+#s0p b0m b0p
+#s0p s0m b0p
+
diff --git a/makefiles/eval_ud.makefile b/makefiles/eval_ud.makefile
index 106817b..14f10c8 100644
--- a/makefiles/eval_ud.makefile
+++ b/makefiles/eval_ud.makefile
@@ -1,6 +1,7 @@
 TAGGER=maca_trans_tagger
 PARSER=maca_trans_parser
 CRF_TAGGER=crf_barebones_decoder
+MORPHO_ANALYZER=maca_trans_morpho
 LEMMATIZER=maca_lemmatizer
 EVAL_WPLGFS=../../tools/eval_wplgfs.pl
 EVAL_WPLSGF=../../tools/eval_wplsgf.pl
@@ -75,12 +76,21 @@ test_Wp: test_W
 test_Wpl: test_Wp
 	$(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
 
+test_Wpm: test_Wp
+	$(MORPHO_ANALYZER) -C $(WPMLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
+
+test_Wpml: test_Wpm
+	$(LEMMATIZER) -C $(WPMLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
+
 test_WPl: test_WP
 	$(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
 
 test_WPMLgfs: test_WPML
 	$(PARSER) -L $(LANGUAGE) -C $(WPMLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
 
+test_Wpmlgfs: test_Wpml
+	$(PARSER) -L $(LANGUAGE) -C $(WPMLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
+
 test_WPLSgf: test_WPLS
 	$(PARSER) -L $(LANGUAGE) -C $(WPLSGF_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
 
@@ -93,6 +103,9 @@ test_WPLgfs: test_WPL
 test_Wplgfs: test_Wpl
 	$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
 
+eval_Wpmlgfs: test_WPMLGFS test_Wpmlgfs
+	$(EVAL_WPLGFS) -g test_WPMLGFS -s test_Wpmlgfs
+
 eval_WPMLgfs: test_WPMLGFS test_WPMLgfs
 	$(EVAL_WPLGFS) -g test_WPMLGFS -s test_WPMLgfs
 
@@ -106,15 +119,16 @@ eval_Wplgfs: test_WPLGFS test_Wplgfs
 	$(EVAL_WPLGFS) -g test_WPLGFS -s test_Wplgfs
 
 eval_header:
-	echo "file	 	pos	lemma	uas	las	srec	sacc" > $(RESULT_FILE)
+	echo "file	 	pos	morpho	lemma	uas	las	srec	sacc" > $(RESULT_FILE)
 
-eval: eval_header test_WPLGFS test_WPLgfs test_WPlgfs test_Wplgfs
+eval: eval_header test_WPLGFS test_WPMLGFS test_WPLgfs test_WPlgfs test_Wplgfs test_Wpmlgfs
 	$(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_WPLGFS >> $(RESULT_FILE)
 	$(EVAL_MCF) -G WPMLGFS -S WPMLGFS -g test_WPMLGFS -s test_WPMLgfs >> $(RESULT_FILE)
 	$(EVAL_MCF) -G WPLGFS -S WPLSGF -g test_WPLGFS -s test_WPLSgf >> $(RESULT_FILE)
 	$(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_WPLgfs >> $(RESULT_FILE)
 	$(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_WPlgfs >> $(RESULT_FILE)
 	$(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_Wplgfs >> $(RESULT_FILE)
+	$(EVAL_MCF) -G WPMLGFS -S WPMLGFS -g test_WPMLGFS -s test_Wpmlgfs >> $(RESULT_FILE)
 
 
 test_WPLGFS.conll: test_WPLGFS
diff --git a/makefiles/maca_trans_morpho.makefile b/makefiles/maca_trans_morpho.makefile
new file mode 100644
index 0000000..5981e83
--- /dev/null
+++ b/makefiles/maca_trans_morpho.makefile
@@ -0,0 +1,43 @@
+##-----------------------------------------------------------------------
+## compile
+##-----------------------------------------------------------------------
+
+compile: $(MODEL_FILENAME) $(FANN_TRAIN)
+
+$(CFF_TRAIN): $(MCF_TRAIN)
+	maca_trans_morpho_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FILENAME) --vocabs $(VOCABS_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES)
+
+$(CFF_FANN_TRAIN): $(MCF_TRAIN)
+	maca_trans_morpho_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FANN_FILENAME) --vocabs $(VOCABS_FANN_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES)
+
+
+$(CFF_CUTOFF_TRAIN): $(CFF_TRAIN)
+	cff_cutoff --input $< --vocabs $(VOCABS_FILENAME) --cutoff $(CFF_CUTOFF) > $@
+
+$(FANN_TRAIN): $(CFF_FANN_TRAIN)
+	cff2fann --vocabs $(VOCABS_FANN_FILENAME) --cff $< --feat_model $(FEATURES_MODEL_FANN_FILENAME) -C $(MCD_FILENAME) > $@
+
+$(MODEL_FILENAME): $(CFF_CUTOFF_TRAIN)
+#$(MODEL_FILENAME): $(CFF_TRAIN)
+	perceptron_train --cff $< --model $(MODEL_FILENAME) -n $(PERCEPTRON_ITERATIONS)
+
+##-----------------------------------------------------------------------
+## install
+##-----------------------------------------------------------------------
+
+install:
+	- cp $(FEATURES_MODEL_FILENAME) ../bin
+	- cp $(VOCABS_FILENAME) ../bin
+	- cp $(MODEL_FILENAME) ../bin
+
+##-----------------------------------------------------------------------
+## clean
+##-----------------------------------------------------------------------
+# Remove every artefact produced by `compile`, including the FANN training files.
+clean:
+	- rm -f $(VOCABS_FILENAME) $(VOCABS_FANN_FILENAME)
+	- rm -f $(MODEL_FILENAME)
+	- rm -f $(CFF_TRAIN) $(CFF_CUTOFF_TRAIN)
+	- rm -f $(CFF_FANN_TRAIN) $(FANN_TRAIN)
+
+
diff --git a/tools/eval_mcf.pl b/tools/eval_mcf.pl
index f8fb57b..75a450f 100755
--- a/tools/eval_mcf.pl
+++ b/tools/eval_mcf.pl
@@ -191,6 +191,13 @@ while(<REF>){
 #	    print "$ref_pos $hyp_pos\n";
 	}
 	
+	if($ref_morph eq $hyp_morph){
+	    $correct_morph_total_nb++; 
+	}
+	else{
+#	    print "$ref_form \t $ref_morph \t $hyp_morph\n";
+	}
+
 	if($ref_lemma eq $hyp_lemma){
 	    $correct_lemma_total_nb++; 
 	}
@@ -227,6 +234,7 @@ close HYP;
 
 my $pos_acc = $correct_pos_total_nb / $word_nb * 100;
 my $lemma_acc = $correct_lemma_total_nb / $word_nb * 100;
+my $morph_acc = $correct_morph_total_nb / $word_nb * 100;
 my $las = $correct_gov_fct_total_nb / $word_nb * 100;
 my $uas = $correct_gov_total_nb / $word_nb  * 100 ;
 
@@ -236,8 +244,8 @@ my $seg_recall = $nb_hyp_ref_seg / $nb_ref_seg;
 my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1);
 
 
-printf(stderr "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
-printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+printf(stderr "pos acc = %.2f morph acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $morph_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $morph_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
 
 
 
diff --git a/tools/ftb_lib.c b/tools/ftb_lib.c
index 794814c..b29e462 100644
--- a/tools/ftb_lib.c
+++ b/tools/ftb_lib.c
@@ -173,9 +173,9 @@ void print_sentence_no_newline(sentence *s)
     /* fprintf(stdout, "\t%s", w->form); */
     fprintf(stdout, "%s", w->form);
     fprintf(stdout, "\t%s", w->postag);
+    fprintf(stdout, "\t%s", w->feats);
     fprintf(stdout, "\t%s", w->lemma);
     /* fprintf(stdout, "\t%s", w->cpostag); */
-    /* fprintf(stdout, "\t%s", w->feats); */
 
     fprintf(stdout, "\t%d", w->head);
 
-- 
GitLab