Skip to content
Snippets Groups Projects
Commit c9f32600 authored by Franck Dary
Browse files

Working eval

parent 15b73c6c
No related branches found
No related tags found
No related merge requests found
*/bin
#! /bin/bash
# Evaluation entry point for the "fr" language directory.
# Defines the settings and then sources the shared evaluation script, which
# therefore runs in this shell and sees this script's positional parameters.
# NOTE(review): LANG is also the locale environment variable; assigning it
# here changes the locale of child commands if LANG is exported - confirm
# that this is harmless for the tools involved.
LANG="fr"
MCF="../data/treebank/test.mcf"
MCD="../data/treebank/test.mcd"
. ../../scripts/eval.sh
#! /bin/bash
# Decode one input file with macaon_decode.
#
# Usage : decode.sh input
#   input - file handed to macaon_decode through -I
#
# NOTE: the macaon_decode command must remain the LAST line of this file and
# must end with a bare "--expName": the training script appears to append the
# experiment name to the last line (sed -i "$ s/$/ <expName>/").
if [ "$#" -ne 1 ]; then
  echo "Usage : $0 input" >&2
  exit 1
fi
macaon_decode --tm machine.tm --bd test.bd --mcd ../../data/treebank/test.mcd -I "$1" --expName
#! /bin/bash
# Decode one input file with macaon_decode.
#
# Usage : decode.sh input
#   input - file handed to macaon_decode through -I
#
# NOTE: the macaon_decode command must remain the LAST line of this file and
# must end with a bare "--expName": the training script appears to append the
# experiment name to the last line (sed -i "$ s/$/ <expName>/").
if [ "$#" -ne 1 ]; then
  echo "Usage : $0 input" >&2
  exit 1
fi
macaon_decode --tm machine.tm --bd test.bd --mcd ../../data/treebank/test.mcd -I "$1" --expName
#! /bin/bash
# Decode one input file with macaon_decode.
#
# Usage : decode.sh input
#   input - file handed to macaon_decode through -I
#
# NOTE: the macaon_decode command must remain the LAST line of this file and
# must end with a bare "--expName": the training script appears to append the
# experiment name to the last line (sed -i "$ s/$/ <expName>/").
if [ "$#" -ne 1 ]; then
  echo "Usage : $0 input" >&2
  exit 1
fi
macaon_decode --tm machine.tm --bd test.bd --mcd ../../data/treebank/test.mcd -I "$1" --expName
#! /bin/bash
# Decode one input file with macaon_decode.
#
# Usage : decode.sh input
#   input - file handed to macaon_decode through -I
#
# NOTE: the macaon_decode command must remain the LAST line of this file and
# must end with a bare "--expName": the training script appears to append the
# experiment name to the last line (sed -i "$ s/$/ <expName>/").
if [ "$#" -ne 1 ]; then
  echo "Usage : $0 input" >&2
  exit 1
fi
macaon_decode --tm machine.tm --bd test.bd --mcd ../../data/treebank/test.mcd -I "$1" --expName
#! /bin/bash
# Decode one input file with macaon_decode.
#
# Usage : decode.sh input
#   input - file handed to macaon_decode through -I
#
# NOTE: the macaon_decode command must remain the LAST line of this file and
# must end with a bare "--expName": the training script appears to append the
# experiment name to the last line (sed -i "$ s/$/ <expName>/").
if [ "$#" -ne 1 ]; then
  echo "Usage : $0 input" >&2
  exit 1
fi
macaon_decode --tm machine.tm --bd test.bd --mcd ../../data/treebank/test.mcd -I "$1" --expName
......@@ -2,44 +2,4 @@
# Per-language training script for the "fr" directory.
# NOTE(review): this span looks like a rendered diff hunk ("@@ -2,44 +2,4 @@")
# shown without +/- markers, so removed and added lines are interleaved. The
# final line, which delegates to ../scripts/train.sh, appears to be the
# replacement for the inline steps above it - confirm against the repository
# before treating this text as one runnable script.
LANG="fr"
# Two positional arguments are required: the template name and the experiment name.
if [ "$#" -lt 2 ]; then
echo "Usage : $0 templateName expName"
exit
fi
TEMPLATENAME=$1
EXPNAME=$2
# Drop the two named arguments, then collect every remaining argument so it
# can be forwarded to macaon_train. They are joined into one space-separated
# string, so an argument containing whitespace is split again when $ARGS is
# expanded unquoted below.
shift
shift
ARGS=""
for arg in "$@"
do
ARGS="$ARGS $arg"
done
# All paths are derived from $MACAON_DIR, which must come from the environment.
LANGPATH=$MACAON_DIR/$LANG
TEMPLATEPATH=$LANGPATH/$TEMPLATENAME
EXPPATH=$LANGPATH/bin/$EXPNAME
# Creates "bin" relative to the current directory, whereas EXPPATH lives under $LANGPATH/bin.
mkdir -p bin
if [ ! -d "$TEMPLATEPATH" ]; then
echo "ERROR : directory $TEMPLATEPATH doesn't exist"
exit
fi
# Recreate the experiment folder from scratch (errors from rm are discarded,
# e.g. when the folder does not exist yet) and copy every template file into it.
rm -r $EXPPATH 2> /dev/null
mkdir $EXPPATH
cp $TEMPLATEPATH/* $EXPPATH/.
# Append " <experiment name>" to the last line of the copied decode script.
sed -i "$ s/$/ $EXPNAME/" $EXPPATH/decode.sh
# Expose the decode script as bin/maca_tm_<experiment name> through a symbolic link
# (-f replaces an existing link).
ln -f -s $EXPPATH/decode.sh $LANGPATH/bin/maca_tm_$EXPNAME
# Start the training; the model and treebank paths are relative to the current directory.
macaon_train --tm machine.tm --bd train.bd --mcd ../../data/treebank/train.mcd -T ../../data/treebank/train.mcf --dev ../../data/treebank/dev.mcf --expName $EXPNAME --lang $LANG $ARGS
# Delegate to the shared training script, passing the language followed by the
# positional parameters ($@ is unquoted here, so each one is re-split on whitespace).
../scripts/train.sh $LANG $@
#! /bin/bash
# Evaluate one or more trained experiments and tabulate their scores.
#
# Usage : eval.sh expName1 expName2...
#
# Variables read from the caller (e.g. a wrapper that sets them and then
# sources this file):
#   LANG - the result table is written to "$LANG.res"
#   MCF  - first argument of every maca_tm_<expName> command and reference
#          file (-g) of the evaluator
#   MCD  - second argument of every maca_tm_<expName> command
#
# For each experiment name, maca_tm_<expName> is run; when it succeeds its
# output is scored by eval_mcf.pl and one row is appended to the table.
# Diagnostics of every run are collected in stderr.log.
if [ "$#" -lt 1 ]; then
  echo "Usage : eval.sh expName1 expName2..." >&2
  exit 1
fi

EVAL_MCF=../../tools/eval_mcf.pl
RESULT_FILE="$LANG.res"
OUTPUT=output.txt
ERR=stderr.log

echo "tool pos morpho lemma uas las srec sacc nbWords" > "$RESULT_FILE"

# Empty the log once, then only append: truncating it inside the loop would
# keep the diagnostics of the last experiment only.
: > "$ERR"

for arg in "$@"; do
  if "maca_tm_$arg" "$MCF" "$MCD" > "$OUTPUT" 2>> "$ERR"; then
    "$EVAL_MCF" -G WPMLGFS -g "$MCF" -S WPMLGFS -s "$OUTPUT" >> "$RESULT_FILE" 2>> "$ERR"
    # The evaluator labels its row with the name of the scored file; replace
    # that label with the experiment name. The dot is escaped so that it only
    # matches a literal "." and never part of an earlier experiment name.
    sed -i -e "s/${OUTPUT//./\\.}/$arg/g" "$RESULT_FILE"
  else
    echo "eval.sh : maca_tm_$arg failed, skipping it (see $ERR)" >&2
  fi
  rm -f "$OUTPUT"
done

# Align the columns of the table; OUTPUT is reused as a scratch file.
column -t < "$RESULT_FILE" > "$OUTPUT" && mv "$OUTPUT" "$RESULT_FILE"
#! /bin/bash
# Train a macaon experiment from a template directory.
#
# Usage : train.sh lang templateName expName [macaon_train arguments...]
#   lang         - language directory under $MACAON_DIR, also given to --lang
#   templateName - directory $MACAON_DIR/<lang>/<templateName>; its files are
#                  copied into the experiment directory
#   expName      - experiment name; its directory is
#                  $MACAON_DIR/<lang>/bin/<expName>
#   every further argument is forwarded unchanged to macaon_train
#
# Environment : MACAON_DIR must be set.
#
# Side effects : recreates the experiment directory, writes decode.sh into it,
# links it as $MACAON_DIR/<lang>/bin/maca_tm_<expName>, then runs macaon_train
# (whose model and treebank paths are relative to the current directory).
if [ "$#" -lt 3 ]; then
  echo "Usage : train.sh lang templateName expName [macaon_train args...]" >&2
  exit 1
fi

# NOTE(review): LANG is also the locale environment variable; assigning it
# changes the locale of child commands if LANG is exported - confirm that this
# is harmless for the tools involved.
LANG=$1
TEMPLATENAME=$2
EXPNAME=$3

# Drop the three named arguments; what remains is forwarded to macaon_train.
# (Shifting only twice would leave expName in the forwarded list.) An array
# keeps arguments containing whitespace intact.
shift 3
ARGS=("$@")

# Without MACAON_DIR every path below would be rooted at "/".
: "${MACAON_DIR:?MACAON_DIR must be set}"

LANGPATH=$MACAON_DIR/$LANG
TEMPLATEPATH=$LANGPATH/$TEMPLATENAME
EXPPATH=$LANGPATH/bin/$EXPNAME

mkdir -p "$LANGPATH/bin"

if [ ! -d "$TEMPLATEPATH" ]; then
  echo "ERROR : directory $TEMPLATEPATH doesn't exist" >&2
  echo "Usage : train.sh lang templateName expName [macaon_train args...]" >&2
  exit 1
fi

# Recreate the experiment folder from scratch and copy every template file into it.
rm -rf -- "$EXPPATH"
mkdir "$EXPPATH" || exit 1
cp "$TEMPLATEPATH"/* "$EXPPATH/."

# Generate the decode script. $LANG and $EXPNAME are expanded now; the
# backslash-escaped parameters are left for the generated script to expand.
cat > "$EXPPATH/decode.sh" <<EOF
#! /bin/bash
if [ "\$#" -ne 2 ]; then
echo "Usage : \$0 input mcd"
exit 1
fi
macaon_decode --lang $LANG --tm machine.tm --bd test.bd -I "\$1" --mcd "\$2" --expName $EXPNAME
EOF
chmod +x "$EXPPATH/decode.sh"

# Expose the decode script as bin/maca_tm_<expName> (-f replaces an existing link).
ln -f -s "$EXPPATH/decode.sh" "$LANGPATH/bin/maca_tm_$EXPNAME"

# Start the training.
macaon_train --tm machine.tm --bd train.bd --mcd ../../data/treebank/train.mcd -T ../../data/treebank/train.mcf --dev ../../data/treebank/dev.mcf --expName "$EXPNAME" --lang "$LANG" "${ARGS[@]}"
#!/usr/bin/perl
# Command-line parsing.
#   -g <file>     reference file
#   -s <file>     system output (hypothesis) file
#   -G <letters>  column layout of the reference file
#   -S <letters>  column layout of the system output
#   -tac -tcm -tec -paf -lcm -acm  switch on the individual reports
#   -all          switches on every report
#   -h            prints the usage and exits
# Arguments that match no option are ignored.
# The loop tests definedness rather than truth, so that an argument such as
# "0" or "" does not end the parsing early.
while (defined(my $arg = shift @ARGV)) {
    if    ($arg eq "-g")   { $ref     = shift @ARGV; }
    elsif ($arg eq "-s")   { $hyp     = shift @ARGV; }
    elsif ($arg eq "-G")   { $ref_mcd = shift @ARGV; }
    elsif ($arg eq "-S")   { $hyp_mcd = shift @ARGV; }
    elsif ($arg eq "-tac") { $TAGGING_ACCURACY_PER_CATEGORY = 1; }
    elsif ($arg eq "-tcm") { $TAGGING_CONFUSION_MATRIX = 1; }
    elsif ($arg eq "-tec") { $TAGGING_ERRORS_PER_CATEGORY = 1; }
    elsif ($arg eq "-paf") { $PARSING_ACCURACY_PER_FUNCTION = 1; }
    elsif ($arg eq "-lcm") { $LABELING_CONFUSION_MATRIX = 1; }
    elsif ($arg eq "-acm") { $ATTACHEMENT_CONFUSION_MATRIX = 1; }
    elsif ($arg eq "-all") {
        $TAGGING_ACCURACY_PER_CATEGORY = 1;
        $TAGGING_CONFUSION_MATRIX      = 1;
        $TAGGING_ERRORS_PER_CATEGORY   = 1;
        $PARSING_ACCURACY_PER_FUNCTION = 1;
        $LABELING_CONFUSION_MATRIX     = 1;
        $ATTACHEMENT_CONFUSION_MATRIX  = 1;
    }
    elsif ($arg eq "-h") {
        # $0 is used so that the usage line always names this script.
        print "usage $0 OPTIONS -g <reference file> -s <system output>\n";
        print "OPTIONS :\n";
        print "\t-G <letters> column layout of the reference file (W P L G F S M)\n";
        print "\t-S <letters> column layout of the system output (W P L G F S M)\n";
        print "\t-tac tagging accuracy per category\n";
        print "\t-tcm tagging confusion matrix\n";
        print "\t-tec tagging errors per category\n";
        print "\t-paf parsing accuracy per function\n";
        print "\t-lcm labeling confusion matrix\n";
        print "\t-acm attachment confusion matrix\n";
        print "\t-all all options\n";
        exit;
    }
}
# Column positions (0-based) of each field in the reference file.
# The defaults apply when no -G layout string was given; otherwise the
# position of each known letter in the string becomes the column of the
# matching field (W form, P pos, L lemma, G governor, F function,
# S segmentation, M morphology). Unknown letters are skipped, and when a
# letter occurs several times its last position is kept.
($ref_form_col, $ref_pos_col, $ref_lemma_col, $ref_gov_col,
 $ref_fct_col, $ref_seg_col, $ref_morph_col) = (0, 1, 2, 3, 4, 5, 10);
if ($ref_mcd) {
    my $position = 0;
    foreach my $letter (split //, $ref_mcd) {
        if    ($letter eq 'W') { $ref_form_col  = $position; }
        elsif ($letter eq 'P') { $ref_pos_col   = $position; }
        elsif ($letter eq 'L') { $ref_lemma_col = $position; }
        elsif ($letter eq 'G') { $ref_gov_col   = $position; }
        elsif ($letter eq 'F') { $ref_fct_col   = $position; }
        elsif ($letter eq 'S') { $ref_seg_col   = $position; }
        elsif ($letter eq 'M') { $ref_morph_col = $position; }
        $position++;
    }
}

# Same mapping for the hypothesis file, driven by the -S layout string.
($hyp_form_col, $hyp_pos_col, $hyp_lemma_col, $hyp_gov_col,
 $hyp_fct_col, $hyp_seg_col, $hyp_morph_col) = (0, 1, 2, 3, 4, 5, 10);
if ($hyp_mcd) {
    my $position = 0;
    foreach my $letter (split //, $hyp_mcd) {
        if    ($letter eq 'W') { $hyp_form_col  = $position; }
        elsif ($letter eq 'P') { $hyp_pos_col   = $position; }
        elsif ($letter eq 'L') { $hyp_lemma_col = $position; }
        elsif ($letter eq 'G') { $hyp_gov_col   = $position; }
        elsif ($letter eq 'F') { $hyp_fct_col   = $position; }
        elsif ($letter eq 'S') { $hyp_seg_col   = $position; }
        elsif ($letter eq 'M') { $hyp_morph_col = $position; }
        $position++;
    }
}
# Open the reference (-g) and hypothesis (-s) files for reading.
# The three-argument form of open is used so that the file name can never be
# interpreted as a mode or a pipe. The names are checked first because a
# three-argument open on an undefined name opens an anonymous temporary file
# instead of failing.
defined $ref or die "no reference file given (-g)\n";
defined $hyp or die "no system output file given (-s)\n";
open(REF, '<', $ref) or die "cannot open file $ref: $!";
open(HYP, '<', $hyp) or die "cannot open file $hyp: $!";

# File-scoped counters used by the scoring code that follows.
my $line_nb;
my $word_nb;
my $correct_pos_nb;
my $correct_gov_nb;
my $correct_gov_fct_nb;
# Tell whether a part-of-speech tag belongs to the "ptb" punctuation set
# (presumably Penn Treebank tags).
# Argument: the tag to test.
# Returns 1 when the tag is exactly one of the listed tags, 0 otherwise.
sub is_punctuation_ptb {
    my ($tag) = @_;
    my %punctuation = map { $_ => 1 } ("``", ",", ":", ".", "''", "-LRB-", "-RRB-");
    return (exists $punctuation{$tag}) ? 1 : 0;
}
# Tell whether a part-of-speech tag belongs to the "ftb" punctuation set
# (presumably French Treebank tags).
# Argument: the tag to test.
# Returns 1 when the tag is exactly PCT, PONCT, ponctw or poncts (the match is
# case-sensitive), 0 otherwise.
sub is_punctuation_ftb {
    my ($tag) = @_;
    return ($tag =~ /\A(?:PCT|PONCT|ponctw|poncts)\z/) ? 1 : 0;
}
# Tell whether a part-of-speech tag is the "ud" punctuation tag.
# Argument: the tag to test.
# Returns 1 when the tag is exactly "PUNCT", 0 otherwise.
sub is_punctuation_ud {
    my ($tag) = @_;
    return ($tag eq "PUNCT") ? 1 : 0;
}
# Main scoring loop: reads the reference and hypothesis files in lockstep (one
# tab-separated line of each per iteration) and accumulates the counters used
# by the summary line and by the optional reports.
# NOTE(review): a field variable is only assigned when its column exists on
# the current line, so a shorter line (or an exhausted hypothesis file) keeps
# the values of the previous line - confirm that inputs always carry every
# column and have the same number of lines.
while (<REF>) {
    # chomp only strips a trailing newline; chop would also eat the last data
    # character of a final line that has no newline.
    chomp;
    $line_nb++;

    # Pick the reference fields out of their configured columns. Because of
    # the "next", when two fields share a column only the first one listed
    # here is assigned.
    @ref_array = split /\t/;
    $column_nb = -1;
    foreach $elt (@ref_array) {
        $column_nb++;
        if ($column_nb == $ref_form_col)  { $ref_form  = $elt; next; }
        if ($column_nb == $ref_pos_col)   { $ref_pos   = $elt; next; }
        if ($column_nb == $ref_lemma_col) { $ref_lemma = $elt; next; }
        if ($column_nb == $ref_gov_col)   { $ref_gov   = $elt; next; }
        if ($column_nb == $ref_fct_col)   { $ref_fct   = $elt; next; }
        if ($column_nb == $ref_seg_col)   { $ref_seg   = $elt; next; }
        if ($column_nb == $ref_morph_col) { $ref_morph = $elt; next; }
    }

    # Same extraction for the matching hypothesis line.
    $_ = <HYP>;
    chomp $_;
    @hyp_array = split /\t/;
    $column_nb = -1;
    foreach $elt (@hyp_array) {
        $column_nb++;
        if ($column_nb == $hyp_form_col)  { $hyp_form  = $elt; next; }
        if ($column_nb == $hyp_pos_col)   { $hyp_pos   = $elt; next; }
        if ($column_nb == $hyp_lemma_col) { $hyp_lemma = $elt; next; }
        if ($column_nb == $hyp_gov_col)   { $hyp_gov   = $elt; next; }
        if ($column_nb == $hyp_fct_col)   { $hyp_fct   = $elt; next; }
        if ($column_nb == $hyp_seg_col)   { $hyp_seg   = $elt; next; }
        if ($column_nb == $hyp_morph_col) { $hyp_morph = $elt; next; }
    }

    # Segmentation marks: count those of the reference, those of the
    # hypothesis, and the lines where both are set.
    if ($ref_seg) { $nb_ref_seg++; }
    if ($hyp_seg) { $nb_hyp_seg++; }
    if (($ref_seg) && ($hyp_seg)) { $nb_hyp_ref_seg++; }

    # Words whose reference tag is in the ftb or ptb punctuation sets are not
    # scored (is_punctuation_ud is not consulted here).
    if ((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))) {
        $word_nb++;
        $pos_nb{$ref_pos}++;
        $fct_nb{$ref_fct}++;

        # Part-of-speech tagging.
        if ($ref_pos eq $hyp_pos) {
            $correct_pos_total_nb++;
            $correct_pos_nb{$ref_pos}++;
        }
        else {
            $false_pos_form{$ref_pos}{$ref_form}++;
            $pos_confusion_matrix{$ref_pos}{$hyp_pos}++;
        }

        # Morphology (exact match) and lemma (case-insensitive match).
        if ($ref_morph eq $hyp_morph) {
            $correct_morph_total_nb++;
        }
        if (lc $ref_lemma eq lc $hyp_lemma) {
            $correct_lemma_total_nb++;
        }

        # Attachment. $ref_index and $hyp_index are never given a value in
        # the visible code, so both distances reduce to the numeric value of
        # the governor columns.
        $ref_dist = $ref_gov - $ref_index;
        $hyp_dist = $hyp_gov - $hyp_index;
        if ($ref_dist eq $hyp_dist) {
            $correct_gov_nb++;
            $correct_gov_total_nb++;
            if ($ref_fct eq $hyp_fct) {
                $correct_gov_fct_total_nb++;
                $correct_gov_fct_nb{$ref_fct}++;
            }
            else {
                $labeling_confusion_matrix{$ref_fct}{$hyp_fct}++;
            }
        }
        else {
            $attachement_confusion_matrix{$ref_fct}{$hyp_fct}++;
        }
    }
    $ref_index = "";
}
close REF;
close HYP;

# Global scores, as percentages of the scored words. When no word was scored
# (empty input, or punctuation only) they are reported as 0 instead of
# aborting on a division by zero.
my $pos_acc   = $word_nb ? $correct_pos_total_nb     / $word_nb * 100 : 0;
my $lemma_acc = $word_nb ? $correct_lemma_total_nb   / $word_nb * 100 : 0;
my $morph_acc = $word_nb ? $correct_morph_total_nb   / $word_nb * 100 : 0;
my $las       = $word_nb ? $correct_gov_fct_total_nb / $word_nb * 100 : 0;
my $uas       = $word_nb ? $correct_gov_total_nb     / $word_nb * 100 : 0;

# Segmentation scores are ratios, not percentages.
# NOTE(review): the "+ 1" keeps the denominators non-zero but also lowers
# every score slightly - confirm that this is intended.
my $seg_recall    = $nb_hyp_ref_seg / ($nb_ref_seg + 1);
my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1);

# Human-readable summary on stderr, and one tab-separated row on stdout whose
# first column is the name of the hypothesis file.
printf(STDERR "pos acc = %.2f morph acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $morph_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
printf(STDOUT "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $morph_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
# Optional report (-tac / -all): for each category, its frequency among the
# scored words, its tagging accuracy and its share of all tagging errors, as
# percentages. Only categories with at least one correct tag are listed,
# because the loop runs over the keys of %correct_pos_nb.
if ($TAGGING_ACCURACY_PER_CATEGORY) {
    print "\n\n--------------------------------------------------------------------------------------\n";
    print "TAGGING ACCURACY PER CATEGORY\n";
    print "CAT\tFREQ\tACC\tIMPACT\n";
    for my $category (keys %correct_pos_nb) {
        my $accuracy  = $correct_pos_nb{$category} / $pos_nb{$category};
        my $frequency = $pos_nb{$category} / $word_nb;
        # With no tagging error at all, the share of the errors is 0.
        my $error_share = ($word_nb == $correct_pos_total_nb)
            ? 0
            : ($pos_nb{$category} - $correct_pos_nb{$category}) / ($word_nb - $correct_pos_total_nb);
        printf("%s\t%6.2f\t%6.2f\t%6.2f\n", $category, $frequency * 100, $accuracy * 100, $error_share * 100);
    }
}
# Optional report (-tcm / -all): one line per reference category that was
# mistagged at least once, giving its number of errors followed by every
# wrongly predicted category with its count.
if ($TAGGING_CONFUSION_MATRIX) {
    print "\n\n--------------------------------------------------------------------------------------\n";
    print "TAGGING CONFUSION MATRIX\n";
    for my $gold (keys %pos_confusion_matrix) {
        my $errors = $pos_nb{$gold} - $correct_pos_nb{$gold};
        my $predictions = $pos_confusion_matrix{$gold};
        print "$gold ($errors) :",
              join("", map { "\t$_ ($predictions->{$_})" } keys %{$predictions}),
              "\n";
    }
}
# Optional report (-tec / -all): for each reference category, the word forms
# that were mistagged and how many times each of them was.
if ($TAGGING_ERRORS_PER_CATEGORY) {
    print "\n\n--------------------------------------------------------------------------------------\n";
    print "TAGGING ERRORS PER CATEGORY\n";
    for my $category (keys %false_pos_form) {
        my $forms = $false_pos_form{$category};
        print "\n$category\n";
        print "\t$_ $forms->{$_}\n" for keys %{$forms};
    }
}
# Optional report (-paf / -all): for each function label, its frequency among
# the scored words, its labeled attachment accuracy and its share of all
# labeled attachment errors, as percentages. Only labels with at least one
# correct labeled attachment are listed (keys of %correct_gov_fct_nb).
if ($PARSING_ACCURACY_PER_FUNCTION) {
    print "\n\n--------------------------------------------------------------------------------------\n";
    printf "LABELED ATTACHMENT SCORE PER LABEL\n";
    printf "LABEL FREQ ACC IMPACT\n";
    # Total number of labeled attachment errors, computed once. It must not
    # change while the labels are listed, so that every label is measured
    # against the same total, and the grand total of correct attachments is
    # left untouched for any code that reads it afterwards.
    my $error_total = $word_nb - $correct_gov_fct_total_nb;
    foreach $fct (keys %correct_gov_fct_nb) {
        $acc  = $correct_gov_fct_nb{$fct} / $fct_nb{$fct};
        $freq = $fct_nb{$fct} / $word_nb;
        # With no error at all the share is 0 (same convention as the tagging
        # accuracy report) rather than a division by zero.
        $impact = $error_total
            ? ($fct_nb{$fct} - $correct_gov_fct_nb{$fct}) / $error_total
            : 0;
        printf("%-10s%6.2f\t%6.2f\t%6.2f\n", $fct, $freq*100, $acc*100, $impact*100);
    }
}
# Optional report (-acm / -all): one line per reference function label that
# has at least one attachment error, followed by the hypothesis labels seen
# on those errors with their counts. The number in parentheses is the label's
# total minus its correct labeled attachments.
if ($ATTACHEMENT_CONFUSION_MATRIX) {
    print "\n\n--------------------------------------------------------------------------------------\n";
    print "ATTACHEMENT CONFUSION MATRIX\n";
    for my $gold (keys %attachement_confusion_matrix) {
        my $errors = $fct_nb{$gold} - $correct_gov_fct_nb{$gold};
        my $predictions = $attachement_confusion_matrix{$gold};
        print "$gold ($errors) :",
              join("", map { "\t$_ ($predictions->{$_})" } keys %{$predictions}),
              "\n";
    }
}
# Optional report (-lcm / -all): one line per reference function label that
# was attached correctly but labeled wrongly at least once, followed by the
# wrong labels with their counts. The number in parentheses is the label's
# total minus its correct labeled attachments.
if ($LABELING_CONFUSION_MATRIX) {
    print "\n\n--------------------------------------------------------------------------------------\n";
    print "LABELING CONFUSION MATRIX\n";
    for my $gold (keys %labeling_confusion_matrix) {
        my $errors = $fct_nb{$gold} - $correct_gov_fct_nb{$gold};
        my $predictions = $labeling_confusion_matrix{$gold};
        print "$gold ($errors) :",
              join("", map { "\t$_ ($predictions->{$_})" } keys %{$predictions}),
              "\n";
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment