From 74ee7a4c080fac713fdc18d0512c2a969b29f810 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Mon, 29 May 2017 11:55:43 +0200 Subject: [PATCH] fine tuning in datcha --- datcha/data/morpho-lexicon/fplm_change_pos.pl | 2 +- datcha/eval/Makefile | 11 +++++++++-- datcha/maca_trans_tagger/maca_trans_tagger.fm | 16 ++++++++-------- datcha/tools/datcha2mcf.pl | 3 ++- fr/data/treebank/Makefile | 1 + makefiles/treebank.makefile | 11 +++++++---- tools/eval_mcf.pl | 6 ++++-- 7 files changed, 32 insertions(+), 18 deletions(-) diff --git a/datcha/data/morpho-lexicon/fplm_change_pos.pl b/datcha/data/morpho-lexicon/fplm_change_pos.pl index d968b09..04aa6f5 100755 --- a/datcha/data/morpho-lexicon/fplm_change_pos.pl +++ b/datcha/data/morpho-lexicon/fplm_change_pos.pl @@ -14,7 +14,7 @@ $orfeo2datcha{"NOM"} = "NOM"; $orfeo2datcha{"PCT"} = "PUN"; $orfeo2datcha{"PRE"} = "PRP"; $orfeo2datcha{"PRO"} = "PRO"; -$orfeo2datcha{"PRQ"} = "PRO:RE%"; +$orfeo2datcha{"PRQ"} = "PRO:RE"; $orfeo2datcha{"VNF"} = "VER:infi"; $orfeo2datcha{"VPP"} = "VER:pper"; $orfeo2datcha{"VPR"} = "VER:ppre"; diff --git a/datcha/eval/Makefile b/datcha/eval/Makefile index 42ec8b0..74f2955 100644 --- a/datcha/eval/Makefile +++ b/datcha/eval/Makefile @@ -1,7 +1,7 @@ MCF_TRAIN=../data/treebank/train.mcf MCF_DEV=../data/treebank/test.mcf MCF_TEST=../data/treebank/test.mcf -EVAL_MCF=/home/alexis/gitlab/maca_data2/tools/eval_mcf.pl +EVAL_MCF=../../tools/eval_mcf.pl CFF_TRAIN=train.cff CFF_CUTOFF_TRAIN=train.cutoff.cff @@ -63,10 +63,17 @@ total: test_Wp test_WP test_wp test_cp test_S test_L clean: - rm -f test_W + - rm -f test_c + - rm -f test_w - rm -f test_P - rm -f test_WP - rm -f test_Wp - rm -f test_cp + - rm -f test_wp - rm -f total - + - rm test_L + - rm test_p[c] + - rm test_p[w] + - rm test_p[W] + - rm test_S diff --git a/datcha/maca_trans_tagger/maca_trans_tagger.fm b/datcha/maca_trans_tagger/maca_trans_tagger.fm index 79cf47f..67a7dd3 100644 --- a/datcha/maca_trans_tagger/maca_trans_tagger.fm +++ b/datcha/maca_trans_tagger/maca_trans_tagger.fm @@ -2,20 +2,20 @@ b0U1 b0sgn b1sgn b2sgn -b0f -#b1f -#b2f -b0len bm1f bm2f +b0f +b1f +b2f +b0len bm1p bm2p bm3p +bm3p bm2p bm2p bm1p -bm2p bm3p bm1p b0sgn - -b0s1 -b0s1 b0s2 +#b0s1 +#b0s1 b0s2 b0s1 b0s2 b0s3 b0s1 b0s2 b0s3 b0s4 + diff --git a/datcha/tools/datcha2mcf.pl b/datcha/tools/datcha2mcf.pl index 07c0ded..ca48c25 100755 --- a/datcha/tools/datcha2mcf.pl +++ b/datcha/tools/datcha2mcf.pl @@ -25,6 +25,7 @@ $tilt2datcha_pos{"SYM"} = "NOM"; $tilt2datcha_pos{"URL"} = "NOM"; $tilt2datcha_pos{"HEURE"} = "NOM"; $tilt2datcha_pos{"XXX"} = "NOM"; +$tilt2datcha_pos{"VER:impe"} = "VER"; $tilt2datcha_pos{"ADJ"} = "ADJ"; @@ -40,7 +41,6 @@ $tilt2datcha_pos{"PRO"} = "PRO"; $tilt2datcha_pos{"PRP"} = "PRP"; $tilt2datcha_pos{"PUN"} = "PUN"; $tilt2datcha_pos{"VER"} = "VER"; -$tilt2datcha_pos{"VER:impe"} = "VER:impe"; $tilt2datcha_pos{"VER:infi"} = "VER:infi"; $tilt2datcha_pos{"VER:pper"} = "VER:pper"; $tilt2datcha_pos{"VER:ppre"} = "VER:ppre"; @@ -90,6 +90,7 @@ while(<>){ chop; #Bonjour Bonjour OK INT bonjour tchat1 TC [00:11:09] Bonjour IV ($A, $B, $C, $D, $E, $F, $G, $H, $I, $J) = split /\t/; + if($A eq "XX") {next;} $form =~ s/ /_/g; $lemma =~ s/ /_/g; $cpos = $tilt2datcha_cpos{$pos}; diff --git a/fr/data/treebank/Makefile b/fr/data/treebank/Makefile index ccae4f5..7b5e0a9 100644 --- a/fr/data/treebank/Makefile +++ b/fr/data/treebank/Makefile @@ -3,6 +3,7 @@ FTB_DIR=../../../data/ftb TRAIN=$(FTB_DIR)/ftb.train.conll07 TEST=$(FTB_DIR)/ftb.test.conll07 DEV=$(FTB_DIR)/ftb.dev.conll07 +THRESHOLD=50 compile: train.mcf test.mcf dev.mcf train.conll07 test.conll07 dev.conll07 diff --git a/makefiles/treebank.makefile b/makefiles/treebank.makefile index 06e9086..3e33ecb 100644 --- a/makefiles/treebank.makefile +++ b/makefiles/treebank.makefile @@ -3,13 +3,16 @@ TOOLS=../../../tools compile: train.mcf test.mcf dev.mcf train.mcf: $(TRAIN) - $(TOOLS)/conllu2mcf -f $< -1W -2C -3L -4H -5D > $@ + $(TOOLS)/conll_keep_most_frequent_morpho_tags.pl $< $(THRESHOLD) > tmp + $(TOOLS)/conllu2mcf -f tmp -1W -2C -3L -4H -5D > $@ test.mcf: $(TEST) - $(TOOLS)/conllu2mcf -f $< -1W -2C -3L -4H -5D > $@ + $(TOOLS)/conll_keep_most_frequent_morpho_tags.pl $< $(THRESHOLD) > tmp + $(TOOLS)/conllu2mcf -f tmp -1W -2C -3L -4H -5D > $@ dev.mcf: $(TEST) - $(TOOLS)/conllu2mcf -f $< -1W -2C -3L -4H -5D > $@ + $(TOOLS)/conll_keep_most_frequent_morpho_tags.pl $< $(THRESHOLD) > tmp + $(TOOLS)/conllu2mcf -f tmp -1W -2C -3L -4H -5D > $@ clean: - - rm test.mcf train.mcf dev.mcf + - rm test.mcf train.mcf dev.mcf tmp diff --git a/tools/eval_mcf.pl b/tools/eval_mcf.pl index 5fce09e..8a28aab 100755 --- a/tools/eval_mcf.pl +++ b/tools/eval_mcf.pl @@ -200,11 +200,13 @@ while(<REF>){ # print "$ref_form \t $ref_lemma \t $hyp_lemma\n"; } - if($ref_lemma eq $hyp_lemma){ + if(lc $ref_lemma eq lc $hyp_lemma){ $correct_lemma_total_nb++; } else{ -# print "$ref_form \t $ref_lemma \t $hyp_lemma\n"; +# if($ref_pos eq "v"){ +# print "$ref_form \t $ref_pos \t $ref_lemma \t $hyp_lemma\n"; +# } } $ref_dist = $ref_gov - $ref_index; $hyp_dist = $hyp_gov - $hyp_index; -- GitLab