diff --git a/datcha/data/morpho-lexicon/fplm_change_pos.pl b/datcha/data/morpho-lexicon/fplm_change_pos.pl index d968b0901acb51b07a7f4e63b734a748e239d72a..04aa6f5deb29ff8b4792baed3eacc11cb6253fd6 100755 --- a/datcha/data/morpho-lexicon/fplm_change_pos.pl +++ b/datcha/data/morpho-lexicon/fplm_change_pos.pl @@ -14,7 +14,7 @@ $orfeo2datcha{"NOM"} = "NOM"; $orfeo2datcha{"PCT"} = "PUN"; $orfeo2datcha{"PRE"} = "PRP"; $orfeo2datcha{"PRO"} = "PRO"; -$orfeo2datcha{"PRQ"} = "PRO:RE%"; +$orfeo2datcha{"PRQ"} = "PRO:RE"; $orfeo2datcha{"VNF"} = "VER:infi"; $orfeo2datcha{"VPP"} = "VER:pper"; $orfeo2datcha{"VPR"} = "VER:ppre"; diff --git a/datcha/eval/Makefile b/datcha/eval/Makefile index 42ec8b0ba186bdbbdcc57ba2587394b873650622..74f2955a3d10ed08a1db5143234f1e7b02bbc7c4 100644 --- a/datcha/eval/Makefile +++ b/datcha/eval/Makefile @@ -1,7 +1,7 @@ MCF_TRAIN=../data/treebank/train.mcf MCF_DEV=../data/treebank/test.mcf MCF_TEST=../data/treebank/test.mcf -EVAL_MCF=/home/alexis/gitlab/maca_data2/tools/eval_mcf.pl +EVAL_MCF=../../tools/eval_mcf.pl CFF_TRAIN=train.cff CFF_CUTOFF_TRAIN=train.cutoff.cff @@ -63,10 +63,17 @@ total: test_Wp test_WP test_wp test_cp test_S test_L clean: - rm -f test_W + - rm -f test_c + - rm -f test_w - rm -f test_P - rm -f test_WP - rm -f test_Wp - rm -f test_cp + - rm -f test_wp - rm -f total - + - rm test_L + - rm test_p[c] + - rm test_p[w] + - rm test_p[W] + - rm test_S diff --git a/datcha/maca_trans_tagger/maca_trans_tagger.fm b/datcha/maca_trans_tagger/maca_trans_tagger.fm index 79cf47f123324352de0ed19a0dab4811d2d3c244..67a7dd310b443f765f3749c7d263f46b0a5f82eb 100644 --- a/datcha/maca_trans_tagger/maca_trans_tagger.fm +++ b/datcha/maca_trans_tagger/maca_trans_tagger.fm @@ -2,20 +2,20 @@ b0U1 b0sgn b1sgn b2sgn -b0f -#b1f -#b2f -b0len bm1f bm2f +b0f +b1f +b2f +b0len bm1p bm2p bm3p +bm3p bm2p bm2p bm1p -bm2p bm3p bm1p b0sgn - -b0s1 -b0s1 b0s2 +#b0s1 +#b0s1 b0s2 b0s1 b0s2 b0s3 b0s1 b0s2 b0s3 b0s4 + diff --git a/datcha/tools/datcha2mcf.pl b/datcha/tools/datcha2mcf.pl index 07c0dedf1c36cf30cc01ca7f873f4756434467f7..ca48c258125faf8d578e984d67a7559c1b5cf9d6 100755 --- a/datcha/tools/datcha2mcf.pl +++ b/datcha/tools/datcha2mcf.pl @@ -25,6 +25,7 @@ $tilt2datcha_pos{"SYM"} = "NOM"; $tilt2datcha_pos{"URL"} = "NOM"; $tilt2datcha_pos{"HEURE"} = "NOM"; $tilt2datcha_pos{"XXX"} = "NOM"; +$tilt2datcha_pos{"VER:impe"} = "VER"; $tilt2datcha_pos{"ADJ"} = "ADJ"; @@ -40,7 +41,6 @@ $tilt2datcha_pos{"PRO"} = "PRO"; $tilt2datcha_pos{"PRP"} = "PRP"; $tilt2datcha_pos{"PUN"} = "PUN"; $tilt2datcha_pos{"VER"} = "VER"; -$tilt2datcha_pos{"VER:impe"} = "VER:impe"; $tilt2datcha_pos{"VER:infi"} = "VER:infi"; $tilt2datcha_pos{"VER:pper"} = "VER:pper"; $tilt2datcha_pos{"VER:ppre"} = "VER:ppre"; @@ -90,6 +90,7 @@ while(<>){ chop; #Bonjour Bonjour OK INT bonjour tchat1 TC [00:11:09] Bonjour IV ($A, $B, $C, $D, $E, $F, $G, $H, $I, $J) = split /\t/; + if($A eq "XX") {next;} $form =~ s/ /_/g; $lemma =~ s/ /_/g; $cpos = $tilt2datcha_cpos{$pos}; diff --git a/fr/data/treebank/Makefile b/fr/data/treebank/Makefile index ccae4f53307509f40bcf00ca7f88300d5b1840e0..7b5e0a9755531c5be6d3bbb32b162a5f154dce7a 100644 --- a/fr/data/treebank/Makefile +++ b/fr/data/treebank/Makefile @@ -3,6 +3,7 @@ FTB_DIR=../../../data/ftb TRAIN=$(FTB_DIR)/ftb.train.conll07 TEST=$(FTB_DIR)/ftb.test.conll07 DEV=$(FTB_DIR)/ftb.dev.conll07 +THRESHOLD=50 compile: train.mcf test.mcf dev.mcf train.conll07 test.conll07 dev.conll07 diff --git a/makefiles/treebank.makefile b/makefiles/treebank.makefile index 06e90866e0408cedcf2041a3a89a971af0c33132..3e33ecbdb90b3da21b5df36e223666fbc4a368f8 100644 --- a/makefiles/treebank.makefile +++ b/makefiles/treebank.makefile @@ -3,13 +3,16 @@ TOOLS=../../../tools compile: train.mcf test.mcf dev.mcf train.mcf: $(TRAIN) - $(TOOLS)/conllu2mcf -f $< -1W -2C -3L -4H -5D > $@ + $(TOOLS)/conll_keep_most_frequent_morpho_tags.pl $< $(THRESHOLD) > tmp + $(TOOLS)/conllu2mcf -f tmp -1W -2C -3L -4H -5D > $@ test.mcf: $(TEST) - $(TOOLS)/conllu2mcf -f $< -1W -2C -3L -4H -5D > $@ + $(TOOLS)/conll_keep_most_frequent_morpho_tags.pl $< $(THRESHOLD) > tmp + $(TOOLS)/conllu2mcf -f tmp -1W -2C -3L -4H -5D > $@ dev.mcf: $(TEST) - $(TOOLS)/conllu2mcf -f $< -1W -2C -3L -4H -5D > $@ + $(TOOLS)/conll_keep_most_frequent_morpho_tags.pl $< $(THRESHOLD) > tmp + $(TOOLS)/conllu2mcf -f tmp -1W -2C -3L -4H -5D > $@ clean: - - rm test.mcf train.mcf dev.mcf + - rm test.mcf train.mcf dev.mcf tmp diff --git a/tools/eval_mcf.pl b/tools/eval_mcf.pl index 5fce09e1a824cf3e6dc1b9b0a2426502391c486d..8a28aabcde6f07d673e7320b816488898aff6e34 100755 --- a/tools/eval_mcf.pl +++ b/tools/eval_mcf.pl @@ -200,11 +200,13 @@ while(<REF>){ # print "$ref_form \t $ref_lemma \t $hyp_lemma\n"; } - if($ref_lemma eq $hyp_lemma){ + if(lc $ref_lemma eq lc $hyp_lemma){ $correct_lemma_total_nb++; } else{ -# print "$ref_form \t $ref_lemma \t $hyp_lemma\n"; +# if($ref_pos eq "v"){ +# print "$ref_form \t $ref_pos \t $ref_lemma \t $hyp_lemma\n"; +# } } $ref_dist = $ref_gov - $ref_index; $hyp_dist = $hyp_gov - $hyp_index;