From b42fd532cdc57df0f0c4b9dba1f95c3b0a156927 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Tue, 11 Apr 2017 21:14:36 +0200 Subject: [PATCH] added makefiles to compile morphological analyzer for UD and fr --- UD/template/maca_trans_morpho/Makefile | 20 +++++++++++ fm/maca_trans_morpho.fm | 19 ++++++++++ fm/maca_trans_morpho_fann.fm | 27 ++++++++++++++ fr/Makefile | 11 +++--- fr/data/morpho-lexicon/fplm_add | 28 +++++++++++++-- fr/eval/Makefile | 2 +- fr/maca_trans_morpho/Makefile | 20 +++++++++++ fr/maca_trans_parser/Makefile | 2 +- fr/maca_trans_parser/maca_trans_parser.fm | 6 ++++ makefiles/eval_ud.makefile | 18 ++++++++-- makefiles/maca_trans_morpho.makefile | 43 +++++++++++++++++++++++ tools/eval_mcf.pl | 12 +++++-- tools/ftb_lib.c | 2 +- 13 files changed, 197 insertions(+), 13 deletions(-) create mode 100644 UD/template/maca_trans_morpho/Makefile create mode 100644 fm/maca_trans_morpho.fm create mode 100644 fm/maca_trans_morpho_fann.fm create mode 100644 fr/maca_trans_morpho/Makefile create mode 100644 makefiles/maca_trans_morpho.makefile diff --git a/UD/template/maca_trans_morpho/Makefile b/UD/template/maca_trans_morpho/Makefile new file mode 100644 index 0000000..71a25f6 --- /dev/null +++ b/UD/template/maca_trans_morpho/Makefile @@ -0,0 +1,20 @@ +MCF_TRAIN=../data/treebank/train.mcf +MCF_DEV=../data/treebank/dev.mcf +MCF_TEST=../data/treebank/test.mcf + +CFF_TRAIN=train.cff +FANN_TRAIN=train.fann +CFF_FANN_TRAIN=train.fann.cff +CFF_CUTOFF_TRAIN=train.cutoff.cff +PERCEPTRON_ITERATIONS=9 +CFF_CUTOFF=1 +FEATURES_MODEL_FILENAME=../../fm/maca_trans_morpho.fm +FEATURES_MODEL_FANN_FILENAME=../../fm/maca_trans_morpho_fann.fm +VOCABS_FILENAME=maca_trans_morpho.vocab +VOCABS_FANN_FILENAME=maca_trans_morpho_fann.vocab +MCD_FILENAME=../../mcd/wpmlgfs.mcd +MODEL_FILENAME=maca_trans_morpho.model +NUMBER_OF_SENTENCES=10000000 + +include ../../makefiles/maca_trans_morpho.makefile + diff --git a/fm/maca_trans_morpho.fm
b/fm/maca_trans_morpho.fm new file mode 100644 index 0000000..9fd77ae --- /dev/null +++ b/fm/maca_trans_morpho.fm @@ -0,0 +1,19 @@ +b0U1 +b0f +b0len +bm1f +bm2f +b0p +bm1p +bm2p +bm3p +bm2p bm1p +bm2p bm3p +b0s1 +b0s2 +b0s3 +b0s4 +b0s5 +b0s1 b0s2 +b0s1 b0s2 b0s3 +b0s1 b0s2 b0s3 b0s4 diff --git a/fm/maca_trans_morpho_fann.fm b/fm/maca_trans_morpho_fann.fm new file mode 100644 index 0000000..5593ea8 --- /dev/null +++ b/fm/maca_trans_morpho_fann.fm @@ -0,0 +1,27 @@ +b0f +b0len +b0p +b0s1 +b0s1 +b0s1 +b0s1 +b0s2 +b0s2 +b0s2 +b0s2 +b0s3 +b0s3 +b0s3 +b0s4 +b0s4 +b0s5 +b0U1 +bm1f +bm1p +bm1p +bm2f +bm2p +bm2p +bm2p +bm3p +bm3p diff --git a/fr/Makefile b/fr/Makefile index c5dc3b7..7dbf865 100644 --- a/fr/Makefile +++ b/fr/Makefile @@ -6,17 +6,19 @@ compile: $(MAKE) -C data/morpho-lexicon compile $(MAKE) -C data/treebank compile $(MAKE) -C maca_lexer compile - $(MAKE) -C maca_trans_parser compile $(MAKE) -C maca_trans_tagger compile - $(MAKE) -C maca_crf_tagger compile + $(MAKE) -C maca_trans_morpho compile + $(MAKE) -C maca_trans_parser compile +# $(MAKE) -C maca_crf_tagger compile install: -mkdir -p bin - $(MAKE) -C maca_trans_parser install $(MAKE) -C maca_trans_tagger install + $(MAKE) -C maca_trans_morpho install + $(MAKE) -C maca_trans_parser install $(MAKE) -C maca_lemmatizer install $(MAKE) -C maca_lexer install - $(MAKE) -C maca_crf_tagger install +# $(MAKE) -C maca_crf_tagger install # @tar -cvzf ./maca_datas.tgz bin evaluation: @@ -27,6 +29,7 @@ clean: $(MAKE) -C data/treebank clean $(MAKE) -C maca_lexer clean $(MAKE) -C maca_trans_parser clean + $(MAKE) -C maca_trans_morpho clean $(MAKE) -C maca_trans_tagger clean $(MAKE) -C maca_crf_tagger clean $(MAKE) -C eval clean diff --git a/fr/data/morpho-lexicon/fplm_add b/fr/data/morpho-lexicon/fplm_add index d20dc46..b3e6260 100644 --- a/fr/data/morpho-lexicon/fplm_add +++ b/fr/data/morpho-lexicon/fplm_add @@ -4,5 +4,29 @@ de det un ##### aux prep à ##### au prep à ##### du prep de ##### -M titre m ##### -MM titre MM 
##### \ No newline at end of file + +M titre M ##### +m titre M ##### +Mr titre M ##### +mr titre M ##### +MM titre M ##### +mm titre M ##### + +Mme titre M ##### +mme titre M ##### +Mmes titre M ##### +mmes titre M ##### + +Mlle titre M ##### +mlle titre M ##### +Mlles titre M ##### +mlles titre M ##### + +Dr titre docteur ##### +Drs titre docteur ##### + +Pr titre professeur ##### +Prs titre professeur ##### + +Mgr titre monseigneur ##### +mgr titre monseigneur ##### diff --git a/fr/eval/Makefile b/fr/eval/Makefile index c4e9512..ddd7fe9 100644 --- a/fr/eval/Makefile +++ b/fr/eval/Makefile @@ -3,4 +3,4 @@ DEV=../data/treebank/dev.mcf LANGUAGE=fr MCD_FILE=wplgfs.mcd -include ../../makefiles/eval.makefile +include ../../makefiles/eval_ud.makefile diff --git a/fr/maca_trans_morpho/Makefile b/fr/maca_trans_morpho/Makefile new file mode 100644 index 0000000..71a25f6 --- /dev/null +++ b/fr/maca_trans_morpho/Makefile @@ -0,0 +1,20 @@ +MCF_TRAIN=../data/treebank/train.mcf +MCF_DEV=../data/treebank/dev.mcf +MCF_TEST=../data/treebank/test.mcf + +CFF_TRAIN=train.cff +FANN_TRAIN=train.fann +CFF_FANN_TRAIN=train.fann.cff +CFF_CUTOFF_TRAIN=train.cutoff.cff +PERCEPTRON_ITERATIONS=9 +CFF_CUTOFF=1 +FEATURES_MODEL_FILENAME=../../fm/maca_trans_morpho.fm +FEATURES_MODEL_FANN_FILENAME=../../fm/maca_trans_morpho_fann.fm +VOCABS_FILENAME=maca_trans_morpho.vocab +VOCABS_FANN_FILENAME=maca_trans_morpho_fann.vocab +MCD_FILENAME=../../mcd/wpmlgfs.mcd +MODEL_FILENAME=maca_trans_morpho.model +NUMBER_OF_SENTENCES=10000000 + +include ../../makefiles/maca_trans_morpho.makefile + diff --git a/fr/maca_trans_parser/Makefile b/fr/maca_trans_parser/Makefile index 4cf2c4a..e2e5f32 100644 --- a/fr/maca_trans_parser/Makefile +++ b/fr/maca_trans_parser/Makefile @@ -12,7 +12,7 @@ VOCABS_FILENAME=maca_trans_parser.vocab MODEL_FILENAME=maca_trans_parser.model NUMBER_OF_SENTENCES=10000000 #NUMBER_OF_SENTENCES=1000 -MCD_FILENAME=wplgfs.mcd +MCD_FILENAME=../../mcd/wpmlgfs.mcd STREAM_MODE= -S include
../../makefiles/maca_trans_parser.makefile diff --git a/fr/maca_trans_parser/maca_trans_parser.fm b/fr/maca_trans_parser/maca_trans_parser.fm index aface90..04e6a4f 100644 --- a/fr/maca_trans_parser/maca_trans_parser.fm +++ b/fr/maca_trans_parser/maca_trans_parser.fm @@ -47,3 +47,9 @@ t2 bm1p bm2p + +s0m b0m +s0p s0m b0p b0m +#s0p b0m b0p +#s0p s0m b0p + diff --git a/makefiles/eval_ud.makefile b/makefiles/eval_ud.makefile index 106817b..14f10c8 100644 --- a/makefiles/eval_ud.makefile +++ b/makefiles/eval_ud.makefile @@ -1,6 +1,7 @@ TAGGER=maca_trans_tagger PARSER=maca_trans_parser CRF_TAGGER=crf_barebones_decoder +MORPHO_ANALYZER=maca_trans_morpho LEMMATIZER=maca_lemmatizer EVAL_WPLGFS=../../tools/eval_wplgfs.pl EVAL_WPLSGF=../../tools/eval_wplsgf.pl @@ -75,12 +76,21 @@ test_Wp: test_W test_Wpl: test_Wp $(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@ +test_Wpm: test_Wp + $(MORPHO_ANALYZER) -C $(WPMLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@ + +test_Wpml: test_Wpm + $(LEMMATIZER) -C $(WPMLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@ + test_WPl: test_WP $(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@ test_WPMLgfs: test_WPML $(PARSER) -L $(LANGUAGE) -C $(WPMLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ +test_Wpmlgfs: test_Wpml + $(PARSER) -L $(LANGUAGE) -C $(WPMLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ + test_WPLSgf: test_WPLS $(PARSER) -L $(LANGUAGE) -C $(WPLSGF_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ @@ -93,6 +103,9 @@ test_WPLgfs: test_WPL test_Wplgfs: test_Wpl $(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ +eval_Wpmlgfs: test_WPMLGFS test_Wpmlgfs + $(EVAL_WPLGFS) -g test_WPMLGFS -s test_Wpmlgfs + eval_WPMLgfs: test_WPMLGFS test_WPMLgfs $(EVAL_WPLGFS) -g test_WPMLGFS -s test_WPMLgfs @@ -106,15 +119,16 @@ eval_Wplgfs: test_WPLGFS test_Wplgfs $(EVAL_WPLGFS) -g test_WPLGFS -s test_Wplgfs eval_header: - echo "file pos lemma uas las srec sacc" > $(RESULT_FILE) + echo "file pos morpho lemma uas las srec sacc" > 
$(RESULT_FILE) -eval: eval_header test_WPLGFS test_WPLgfs test_WPlgfs test_Wplgfs +eval: eval_header test_WPLGFS test_WPLgfs test_WPlgfs test_Wplgfs test_Wpmlgfs $(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_WPLGFS >> $(RESULT_FILE) $(EVAL_MCF) -G WPMLGFS -S WPMLGFS -g test_WPMLGFS -s test_WPMLgfs >> $(RESULT_FILE) $(EVAL_MCF) -G WPLGFS -S WPLSGF -g test_WPLGFS -s test_WPLSgf >> $(RESULT_FILE) $(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_WPLgfs >> $(RESULT_FILE) $(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_WPlgfs >> $(RESULT_FILE) $(EVAL_MCF) -G WPLGFS -S WPLGFS -g test_WPLGFS -s test_Wplgfs >> $(RESULT_FILE) + $(EVAL_MCF) -G WPMLGFS -S WPMLGFS -g test_WPMLGFS -s test_Wpmlgfs >> $(RESULT_FILE) test_WPLGFS.conll: test_WPLGFS diff --git a/makefiles/maca_trans_morpho.makefile b/makefiles/maca_trans_morpho.makefile new file mode 100644 index 0000000..5981e83 --- /dev/null +++ b/makefiles/maca_trans_morpho.makefile @@ -0,0 +1,43 @@ +##----------------------------------------------------------------------- +## compile +##----------------------------------------------------------------------- + +compile: $(MODEL_FILENAME) $(FANN_TRAIN) + +$(CFF_TRAIN): $(MCF_TRAIN) + maca_trans_morpho_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FILENAME) --vocabs $(VOCABS_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES) + +$(CFF_FANN_TRAIN): $(MCF_TRAIN) + maca_trans_morpho_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FANN_FILENAME) --vocabs $(VOCABS_FANN_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES) + + +$(CFF_CUTOFF_TRAIN): $(CFF_TRAIN) + cff_cutoff --input $< --vocabs $(VOCABS_FILENAME) --cutoff $(CFF_CUTOFF) > $@ + +$(FANN_TRAIN): $(CFF_FANN_TRAIN) + cff2fann --vocabs $(VOCABS_FANN_FILENAME) --cff $< --feat_model $(FEATURES_MODEL_FANN_FILENAME) -C $(MCD_FILENAME) > $@ + +$(MODEL_FILENAME): $(CFF_CUTOFF_TRAIN) +#$(MODEL_FILENAME): $(CFF_TRAIN) + perceptron_train --cff $< --model 
$(MODEL_FILENAME) -n $(PERCEPTRON_ITERATIONS) + +##----------------------------------------------------------------------- +## install +##----------------------------------------------------------------------- + +install: + - cp $(FEATURES_MODEL_FILENAME) ../bin + - cp $(VOCABS_FILENAME) ../bin + - cp $(MODEL_FILENAME) ../bin + +##----------------------------------------------------------------------- +## clean +##----------------------------------------------------------------------- + +clean: + - rm -f $(VOCABS_FILENAME) + - rm -f $(MODEL_FILENAME) + - rm -f $(CFF_TRAIN) + - rm -f $(CFF_CUTOFF_TRAIN) + + diff --git a/tools/eval_mcf.pl b/tools/eval_mcf.pl index f8fb57b..75a450f 100755 --- a/tools/eval_mcf.pl +++ b/tools/eval_mcf.pl @@ -191,6 +191,13 @@ while(<REF>){ # print "$ref_pos $hyp_pos\n"; } + if($ref_morph eq $hyp_morph){ + $correct_morph_total_nb++; + } + else{ +# print "$ref_form \t $ref_morph \t $hyp_morph\n"; + } + if($ref_lemma eq $hyp_lemma){ $correct_lemma_total_nb++; } @@ -227,6 +234,7 @@ close HYP; my $pos_acc = $correct_pos_total_nb / $word_nb * 100; my $lemma_acc = $correct_lemma_total_nb / $word_nb * 100; +my $morph_acc = $correct_morph_total_nb / $word_nb * 100; my $las = $correct_gov_fct_total_nb / $word_nb * 100; my $uas = $correct_gov_total_nb / $word_nb * 100 ; @@ -236,8 +244,8 @@ my $seg_recall = $nb_hyp_ref_seg / $nb_ref_seg; my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1); -printf(stderr "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); -printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); +printf(stderr "pos acc = %.2f morph acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $morph_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); 
+printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $morph_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); diff --git a/tools/ftb_lib.c b/tools/ftb_lib.c index 794814c..b29e462 100644 --- a/tools/ftb_lib.c +++ b/tools/ftb_lib.c @@ -173,9 +173,9 @@ void print_sentence_no_newline(sentence *s) /* fprintf(stdout, "\t%s", w->form); */ fprintf(stdout, "%s", w->form); fprintf(stdout, "\t%s", w->postag); + fprintf(stdout, "\t%s", w->feats); fprintf(stdout, "\t%s", w->lemma); /* fprintf(stdout, "\t%s", w->cpostag); */ - /* fprintf(stdout, "\t%s", w->feats); */ fprintf(stdout, "\t%d", w->head); -- GitLab