diff --git a/UD/template/maca_trans_tagger/Makefile b/UD/template/maca_trans_tagger/Makefile index 884cd6959dc58ff34d0a3c4e19df87f8c6dc3c06..b4a2855d474f0ad5d6e60a2f1ed23e4a89265774 100644 --- a/UD/template/maca_trans_tagger/Makefile +++ b/UD/template/maca_trans_tagger/Makefile @@ -3,11 +3,15 @@ MCF_DEV=../data/treebank/dev.mcf MCF_TEST=../data/treebank/test.mcf CFF_TRAIN=train.cff +CFF_FANN_TRAIN=train.fann.cff +FANN_TRAIN=train.fann CFF_CUTOFF_TRAIN=train.cutoff.cff PERCEPTRON_ITERATIONS=9 CFF_CUTOFF=1 FEATURES_MODEL_FILENAME=../../fm/maca_trans_tagger.fm +FEATURES_MODEL_FANN_FILENAME=../../fm/maca_trans_tagger_fann.fm VOCABS_FILENAME=maca_trans_tagger.vocab +VOCABS_FANN_FILENAME=maca_trans_parser_fann.vocab MCD_FILENAME=../../mcd/maca_trans_tagger.mcd MODEL_FILENAME=maca_trans_tagger.model NUMBER_OF_SENTENCES=10000000 @@ -18,3 +22,5 @@ FORM_POS_FILENAME=../data/morpho-lexicon/fP #include ./maca_trans_tagger.makefile include ../../makefiles/maca_trans_tagger.makefile + + diff --git a/UD/ud_template.tgz b/UD/ud_template.tgz index 002ed8421208367c50605e9025a7dae8e2cb125d..f4d2ffaea32cb3e490c6e2457c01a76f60ce6928 100644 Binary files a/UD/ud_template.tgz and b/UD/ud_template.tgz differ diff --git a/fm/maca_trans_parser.fm b/fm/maca_trans_parser.fm new file mode 100644 index 0000000000000000000000000000000000000000..6e18e5ed6fec6180187615dd6eafd3ff5df848d7 --- /dev/null +++ b/fm/maca_trans_parser.fm @@ -0,0 +1,56 @@ +#b0m +#s0m +#b0m s0m +#s0l s0m b0l b0m + + +b0g +s0g s0p +s0g b0p +s0g +s0sf +#s1g +#s1sf +s0l +s0p +s1p +s2p +b0l +b0p +b1l +b1p +b2p +b3p +ldep_s0r +rdep_s0r +ldep_s1r +rdep_s1r +ldep_b0r +rdep_b0r +s0l b0l + +s0p b0p +b0p b0l +b0p ldep_b0r +s1p b1p +b1p b2p +s0p b0p b0l +s0p ldep_s0r rdep_s0r +s0p s0l b0p +s0p b0p dist_s0_b0 +s1p s0p b0p +b0p b1p b2p +b1p b2p b3p +s0p b0p b1p +b1p b1l b2p b3p +b1p b1l b2p b2l b3p +t1 +#t2 +#t3 +#t4 +t1 t2 +#t2 t3 +t1 t2 t3 + +bm1p +bm2p diff --git a/fm/maca_trans_parser_fann.fm b/fm/maca_trans_parser_fann.fm new file 
mode 100644 index 0000000000000000000000000000000000000000..3fdb2ce20b7984daf982a5293ba17bb162c5485c --- /dev/null +++ b/fm/maca_trans_parser_fann.fm @@ -0,0 +1,25 @@ +b0l +b0p +b1l +b1p +b2l +b2p +b3p +bm1p +bm2p +dist_s0_b0 +ldep_b0r +ldep_s0r +ldep_s1r +rdep_b0r +rdep_s0r +rdep_s1r +s0g +s0l +s0p +s0sf +s1p +s2p +t1 +t2 +t3 diff --git a/fm/maca_trans_tagger.fm b/fm/maca_trans_tagger.fm new file mode 100644 index 0000000000000000000000000000000000000000..50af67ebe5ff61fa2c71a524cdcd40401183c8df --- /dev/null +++ b/fm/maca_trans_tagger.fm @@ -0,0 +1,37 @@ +b0U1 +b0len + +b0sgn +b1sgn + +b1f +b0f +bm1f +bm2f + +bm1p +bm2p +bm3p +bm2p bm1p +bm2p bm3p +bm1p b0sgn + +b0s1 +b0s2 +b0s3 +b0s4 +b0s5 +b0s1 b0s2 +b0s1 b0s2 b0s3 +b0s1 b0s2 b0s3 b0s4 + +b0p1 +b0p2 +b0p3 +b0p4 +b0p5 +b0p1 b0p2 +b0p1 b0p2 b0p3 +b0p1 b0p2 b0p3 b0p4 + + diff --git a/fm/maca_trans_tagger_fann.fm b/fm/maca_trans_tagger_fann.fm new file mode 100644 index 0000000000000000000000000000000000000000..3f5e2c2add768705ac78571e918e69d6c8ee3a24 --- /dev/null +++ b/fm/maca_trans_tagger_fann.fm @@ -0,0 +1,21 @@ +b0f +b0len +b0p1 +b0p2 +b0p3 +b0p4 +b0p5 +b0s1 +b0s2 +b0s3 +b0s4 +b0s5 +b0sgn +b0U1 +b1f +b1sgn +bm1f +bm1p +bm2f +bm2p +bm3p diff --git a/makefiles/maca_trans_tagger.makefile b/makefiles/maca_trans_tagger.makefile index f32b03a31bdfa0918700a58123f0eb1c8b0f73ca..ce3d13ed870e396e6f90caf8048c50ebe626a741 100644 --- a/makefiles/maca_trans_tagger.makefile +++ b/makefiles/maca_trans_tagger.makefile @@ -2,14 +2,21 @@ ## compile ##----------------------------------------------------------------------- -compile: $(MODEL_FILENAME) +compile: $(MODEL_FILENAME) $(FANN_TRAIN) $(CFF_TRAIN): $(MCF_TRAIN) maca_trans_tagger_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FILENAME) --vocabs $(VOCABS_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) -P $(FORM_POS_FILENAME) +$(CFF_FANN_TRAIN): $(MCF_TRAIN) + maca_trans_tagger_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN 
--feat_model $(FEATURES_MODEL_FANN_FILENAME) --vocabs $(VOCABS_FANN_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) -P $(FORM_POS_FILENAME) + + $(CFF_CUTOFF_TRAIN): $(CFF_TRAIN) cff_cutoff --input $< --vocabs $(VOCABS_FILENAME) --cutoff $(CFF_CUTOFF) > $@ +$(FANN_TRAIN): $(CFF_FANN_TRAIN) + cff2fann --vocabs $(VOCABS_FANN_FILENAME) --cff $< --feat_model $(FEATURES_MODEL_FANN_FILENAME) -C $(MCD_FILENAME) > $@ + $(MODEL_FILENAME): $(CFF_CUTOFF_TRAIN) #$(MODEL_FILENAME): $(CFF_TRAIN) perceptron_train --cff $< --model $(MODEL_FILENAME) -n $(PERCEPTRON_ITERATIONS) diff --git a/tools/conllu2mcf.c b/tools/conllu2mcf.c new file mode 100644 index 0000000000000000000000000000000000000000..a2fc73b015c309270de065d82dcce484c5c0973b --- /dev/null +++ b/tools/conllu2mcf.c @@ -0,0 +1,158 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<strings.h> +#include<math.h> +#include<getopt.h> +#include"conll_lib.h" +#include"hash_str.h" + +#define NB_COL 7 + +typedef struct options +{ + FILE * fd_parses; // parser output + int verbose_level; + int snum; + char *filename; + char columns[NB_COL]; +} options; + +/*---------------------------------------------------------------------------------*/ + +options op; + +void print_options(options *op) +{ + fprintf(stderr, "file name = %s\n", op->filename); + fprintf(stderr, "verbose level = %d\n", op->verbose_level); + fprintf(stderr, "maximum number of sentences to process = %d\n", op->snum); +} + +void reset_options(options * op) +{ + int i; + op->filename = NULL; + op->fd_parses = stdin; + op->verbose_level = 0; + op->snum = 100000000; + for(i=0; i < NB_COL; i++) + op->columns[i] = '0'; +} + +/*---------------------------------------------------------------------------------*/ +void print_help_message(char *program_name) +{ + fprintf(stderr, "%s usage: %s [options]\n", program_name, program_name); + fprintf(stderr, "OPTIONS :\n"); + fprintf(stderr, " -f <file> : hypothesis conll file\n"); + fprintf(stderr, " -n 
<int> : process n sentences (default is 100 000 000)\n"); + fprintf(stderr, " -v 1|2|3 : verbosity level\n"); + fprintf(stderr, " -h : print this message\n"); + + fprintf(stderr, " -1 : content of column 1 in the mcf file produced\n"); + fprintf(stderr, " -2 : content of column 2 in the mcf file produced\n"); + fprintf(stderr, " -3 : content of column 3 in the mcf file produced\n"); + fprintf(stderr, " -4 : content of column 4 in the mcf file produced\n"); + fprintf(stderr, " -5 : content of column 5 in the mcf file produced\n"); + fprintf(stderr, " -6 : content of column 6 in the mcf file produced\n"); + fprintf(stderr, " -7 : content of column 7 in the mcf file produced\n"); + fprintf(stderr, " : values of options -1 to -7 must be one of\n"); + fprintf(stderr, " : I for id\n"); + fprintf(stderr, " : W for form\n"); + fprintf(stderr, " : L for lemma\n"); + fprintf(stderr, " : C for coarse part of speech\n"); + fprintf(stderr, " : P for part of speech\n"); + fprintf(stderr, " : F for features\n"); + fprintf(stderr, " : H for head\n"); + fprintf(stderr, " : D for deprel\n"); + +} + + + + +/*---------------------------------------------------------------------------------*/ + +void parse_options(int argc, char *argv[], options * op) +{ + int c; + + reset_options(op); + /* + if(argc ==1){ + print_help_message(argv[0]); + exit(1); + }*/ + + while ((c = getopt (argc, argv, "hIWLCPFHDf:n:v:1:2:3:4:5:6:7:8:9:")) != -1) + switch (c) + { + case 'h': + print_help_message(argv[0]); + exit(0); + case 'f': + op->filename = strdup(optarg); + if((op->fd_parses = fopen(op->filename, "r")) == NULL){ + fprintf(stderr, "cannot open hypothesis file %s : aborting\n", op->filename); + exit(1); + } + break; + case '1': + op->columns[0] = optarg[0]; + break; + case '2': + op->columns[1] = optarg[0]; + break; + case '3': + op->columns[2] = optarg[0]; + break; + case '4': + op->columns[3] = optarg[0]; + break; + case '5': + op->columns[4] = optarg[0]; + break; + case '6': + op->columns[5] 
= optarg[0]; + break; + case '7': + op->columns[6] = optarg[0]; + break; + case 'n': + op->snum = atoi(optarg); + break; + case 'v': + op->verbose_level = atoi(optarg); + break; + } + + /* if (op->fd_parses == NULL){ + fprintf(stderr, "error : cannot open parse file: aborting\n"); + exit(1); + }*/ +} + +/*---------------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------------*/ +int main(int argc, char *argv[]) +{ + sentence *s = allocate_sentence(); + int snum = 0; + int res; + parse_options(argc, argv, &op); + + print_options(&op); + + + for(res = load_sentence(op.fd_parses, s); res && (snum < op.snum); res = load_sentence(op.fd_parses, s)){ + s->num = snum; + snum++; + compute_relative_index_of_heads(s); + print_sentence_mcf3(s, op.columns, NB_COL); + } + if(op.filename) + fclose(op.fd_parses); + free_sentence(s); + return 0; +} diff --git a/tools/eval_mcf.pl b/tools/eval_mcf.pl new file mode 100755 index 0000000000000000000000000000000000000000..f8fb57be800915309d779c63b392f97aea15d6d9 --- /dev/null +++ b/tools/eval_mcf.pl @@ -0,0 +1,324 @@ +#!/usr/bin/perl + +$arg = shift; +while($arg){ + if($arg eq "-g"){$ref = shift;} + elsif($arg eq "-s"){$hyp = shift;} + elsif($arg eq "-G"){$ref_mcd = shift;} + elsif($arg eq "-S"){$hyp_mcd = shift;} + elsif($arg eq "-tac"){$TAGGING_ACCURACY_PER_CATEGORY = 1;} + elsif($arg eq "-tcm"){$TAGGING_CONFUSION_MATRIX = 1;} + elsif($arg eq "-tec"){$TAGGING_ERRORS_PER_CATEGORY = 1;} + elsif($arg eq "-paf"){$PARSING_ACCURACY_PER_FUNCTION = 1;} + elsif($arg eq "-lcm"){$LABELING_CONFUSION_MATRIX = 1;} + elsif($arg eq "-acm"){$ATTACHEMENT_CONFUSION_MATRIX = 1;} + elsif($arg eq "-all"){ + $TAGGING_ACCURACY_PER_CATEGORY = 1; + $TAGGING_CONFUSION_MATRIX = 1; + $TAGGING_ERRORS_PER_CATEGORY = 1; + $PARSING_ACCURACY_PER_FUNCTION = 1; + $LABELING_CONFUSION_MATRIX = 1; + $ATTACHEMENT_CONFUSION_MATRIX = 1; + } + elsif($arg eq "-h"){ + print "usage 
eval07.pl OPTIONS -g <reference file> -s <system output>\n"; + print "OPTIONS :\n"; + print "\t-tac tagging accuracy per category\n"; + print "\t-tcm tagging confusion matrix\n"; + print "\t-tec tagging errors per category\n"; + print "\t-paf parsing accuracy per function\n"; + print "\t-lcm labeling confusion matrix\n"; + print "\t-acm attachment confusion matrix\n"; + print "\t-all all options\n"; + exit; +} +$arg = shift; +} + +# determine the column in the reference file + +$ref_form_col = 0; +$ref_pos_col = 1; +$ref_lemma_col = 2; +$ref_gov_col = 3; +$ref_fct_col = 4; +$ref_seg_col = 5; +$ref_morph_col = 10; + +if($ref_mcd) +{ + for($i=0; $i<length $ref_mcd; $i++){ + $car = substr($ref_mcd, $i, 1); +# print "car $i = $car\n"; + if($car eq 'W'){$ref_form_col = $i; next;} + if($car eq 'P'){$ref_pos_col = $i; next;} + if($car eq 'L'){$ref_lemma_col = $i; next;} + if($car eq 'G'){$ref_gov_col = $i; next;} + if($car eq 'F'){$ref_fct_col = $i; next;} + if($car eq 'S'){$ref_seg_col = $i; next;} + if($car eq 'M'){$ref_morph_col = $i; next;} + } +} + +# determine the column in the hypothesis file + +$hyp_form_col = 0; +$hyp_pos_col = 1; +$hyp_lemma_col = 2; +$hyp_gov_col = 3; +$hyp_fct_col = 4; +$hyp_seg_col = 5; +$hyp_morph_col = 10; + +if($hyp_mcd) +{ + for($i=0; $i<length $hyp_mcd; $i++){ + $car = substr($hyp_mcd, $i, 1); +# print "car $i = $car\n"; + if($car eq 'W'){$hyp_form_col = $i; next;} + if($car eq 'P'){$hyp_pos_col = $i; next;} + if($car eq 'L'){$hyp_lemma_col = $i; next;} + if($car eq 'G'){$hyp_gov_col = $i; next;} + if($car eq 'F'){$hyp_fct_col = $i; next;} + if($car eq 'S'){$hyp_seg_col = $i; next;} + if($car eq 'M'){$hyp_morph_col = $i; next;} + } +} + + +open REF, $ref or die "cannot open file $ref"; +open HYP, $hyp or die "cannot open file $hyp"; + + + + +my $line_nb; +my $word_nb; +my $correct_pos_nb; +my $correct_gov_nb; +my $correct_gov_fct_nb; + +sub is_punctuation_ptb{ + my $pos = shift(@_); + + if($pos eq "``"){return 1;} + if($pos eq 
","){return 1;} + if($pos eq ":"){return 1;} + if($pos eq "."){return 1;} + if($pos eq "''"){return 1;} + if($pos eq "-LRB-"){return 1;} + if($pos eq "-RRB-"){return 1;} + return 0; +} +sub is_punctuation_ftb{ + my $pos = shift(@_); + + if($pos eq "PCT"){return 1;} + if($pos eq "PONCT"){return 1;} + if($pos eq "ponctw"){return 1;} + if($pos eq "poncts"){return 1;} + return 0; +} +sub is_punctuation_ud{ + my $pos = shift(@_); + + if($pos eq "PUNCT"){return 1;} + return 0; +} + + +while(<REF>){ + chop; + $line_nb++; +# ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/; + @ref_array = split /\t/; + $column_nb = -1; + foreach $elt (@ref_array){ + $column_nb++; + if($column_nb == $ref_form_col){$ref_form = $elt; next;} + if($column_nb == $ref_pos_col){$ref_pos = $elt; next;} + if($column_nb == $ref_lemma_col){$ref_lemma = $elt; next;} + if($column_nb == $ref_gov_col){$ref_gov = $elt; next;} + if($column_nb == $ref_fct_col){$ref_fct = $elt; next;} + if($column_nb == $ref_seg_col){$ref_seg = $elt; next;} + if($column_nb == $ref_morph_col){$ref_morph = $elt; next;} + } + $_ = <HYP>; + chop $_; +# print; +# ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/; + + @hyp_array = split /\t/; + $column_nb = -1; + foreach $elt (@hyp_array){ + $column_nb++; + if($column_nb == $hyp_form_col){$hyp_form = $elt; next;} + if($column_nb == $hyp_pos_col){$hyp_pos = $elt; next;} + if($column_nb == $hyp_lemma_col){$hyp_lemma = $elt; next;} + if($column_nb == $hyp_gov_col){$hyp_gov = $elt; next;} + if($column_nb == $hyp_fct_col){$hyp_fct = $elt; next;} + if($column_nb == $hyp_seg_col){$hyp_seg = $elt; next;} + if($column_nb == $hyp_morph_col){$hyp_morph = $elt; next;} + } + + + +# print "ref = $hyp_seg\n"; + + if($ref_seg){ $nb_ref_seg++;} + if($hyp_seg){ $nb_hyp_seg++;} + + if(($ref_seg) && ($hyp_seg)){ $nb_hyp_ref_seg++;} + +# if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){ + 
if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){ +# if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos)) && (!is_punctuation_ud($ref_pos))){ + if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";} + $word_nb++; + $pos_nb{$ref_pos}++; + $fct_nb{$ref_fct}++; + + if($ref_pos eq $hyp_pos){ + $correct_pos_total_nb++; + $correct_pos_nb{$ref_pos}++; + } + else{ + $false_pos_form{$ref_pos}{$ref_form}++; + $pos_confusion_matrix{$ref_pos}{$hyp_pos}++; + + +# print "$ref_form $ref_pos $hyp_pos\n"; +# print "$ref_pos $hyp_pos\n"; + } + + if($ref_lemma eq $hyp_lemma){ + $correct_lemma_total_nb++; + } + else{ +# print "$ref_form \t $ref_lemma \t $hyp_lemma\n"; + } + $ref_dist = $ref_gov - $ref_index; + $hyp_dist = $hyp_gov - $hyp_index; +# if($ref_gov eq $hyp_gov){ + if($ref_dist eq $hyp_dist){ + $correct_gov_nb++; + $correct_gov_total_nb++; + if($ref_fct eq $hyp_fct){ + $correct_gov_fct_total_nb++; + $correct_gov_fct_nb{$ref_fct}++; + } + else{ + $labeling_confusion_matrix{$ref_fct}{$hyp_fct}++; + } + } + else{ + $attachement_confusion_matrix{$ref_fct}{$hyp_fct}++; + } + + } + + $ref_index = ""; +} + + +close REF; +close HYP; + + +my $pos_acc = $word_nb ? $correct_pos_total_nb / $word_nb * 100 : 0; +my $lemma_acc = $word_nb ? $correct_lemma_total_nb / $word_nb * 100 : 0; +my $las = $word_nb ? $correct_gov_fct_total_nb / $word_nb * 100 : 0; +my $uas = $word_nb ? $correct_gov_total_nb / $word_nb * 100 : 0; + + + +my $seg_recall = $nb_ref_seg ? $nb_hyp_ref_seg / $nb_ref_seg : 0; +my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1); + + +printf(stderr "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); +printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); + + + +if($TAGGING_ACCURACY_PER_CATEGORY){ + print 
"\n\n--------------------------------------------------------------------------------------\n"; + printf "TAGGING ACCURACY PER CATEGORY\n"; + printf "CAT\tFREQ\tACC\tIMPACT\n"; + foreach $pos (keys %correct_pos_nb){ + $acc = $correct_pos_nb{$pos} / $pos_nb{$pos}; + $freq = $pos_nb{$pos} / $word_nb; + if($word_nb == $correct_pos_total_nb){ + $impact = 0; + } + else{ + $impact = ($pos_nb{$pos} - $correct_pos_nb{$pos}) / ($word_nb - $correct_pos_total_nb); + } + printf("%s\t%6.2f\t%6.2f\t%6.2f\n", $pos, $freq*100, $acc*100, $impact*100); + } +} + +if($TAGGING_CONFUSION_MATRIX){ + print "\n\n--------------------------------------------------------------------------------------\n"; + print "TAGGING CONFUSION MATRIX\n"; + foreach $ref_pos (keys %pos_confusion_matrix){ + $pos_error_nb = $pos_nb{$ref_pos} - $correct_pos_nb{$ref_pos}; + print "$ref_pos ($pos_error_nb) :"; + foreach $hyp_pos (keys %{$pos_confusion_matrix{$ref_pos}}){ + print "\t$hyp_pos ($pos_confusion_matrix{$ref_pos}{$hyp_pos})"; + } + print "\n"; + } +} + + +if($TAGGING_ERRORS_PER_CATEGORY){ + print "\n\n--------------------------------------------------------------------------------------\n"; + print "TAGGING ERRORS PER CATEGORY\n"; + foreach $pos (keys %false_pos_form){ + print "\n$pos\n"; + foreach $form (keys %{$false_pos_form{$pos}}){ + print "\t$form $false_pos_form{$pos}{$form}\n"; + } + } +} + +if($PARSING_ACCURACY_PER_FUNCTION){ + print "\n\n--------------------------------------------------------------------------------------\n"; + printf "LABELED ATTACHMENT SCORE PER LABEL\n"; + printf "LABEL FREQ ACC IMPACT\n"; + foreach $fct (keys %correct_gov_fct_nb){ + $acc = $correct_gov_fct_nb{$fct} / $fct_nb{$fct}; + $freq = $fct_nb{$fct}/$word_nb; + $impact = ($word_nb == $correct_gov_fct_total_nb) ? 0 : ($fct_nb{$fct} - $correct_gov_fct_nb{$fct}) / ($word_nb - $correct_gov_fct_total_nb); + printf("%-10s%6.2f\t%6.2f\t%6.2f\n", $fct, $freq*100, $acc*100, $impact*100); + } +} + +if($ATTACHEMENT_CONFUSION_MATRIX){ + print 
"\n\n--------------------------------------------------------------------------------------\n"; + printf "ATTACHEMENT CONFUSION MATRIX\n"; + foreach $ref_fct (keys %attachement_confusion_matrix){ + $attachement_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct}; + print "$ref_fct ($attachement_error_nb) :"; + foreach $hyp_fct (keys %{$attachement_confusion_matrix{$ref_fct}}){ + print "\t$hyp_fct ($attachement_confusion_matrix{$ref_fct}{$hyp_fct})"; + } + print "\n"; + } + +} + +if($LABELING_CONFUSION_MATRIX){ + print "\n\n--------------------------------------------------------------------------------------\n"; + printf "LABELING CONFUSION MATRIX\n"; + foreach $ref_fct (keys %labeling_confusion_matrix){ + $fct_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct}; + print "$ref_fct ($fct_error_nb) :"; + foreach $hyp_fct (keys %{$labeling_confusion_matrix{$ref_fct}}){ + print "\t$hyp_fct ($labeling_confusion_matrix{$ref_fct}{$hyp_fct})"; + } + print "\n"; + } + +} diff --git a/tools/eval_wpmlgfs.pl b/tools/eval_wpmlgfs.pl new file mode 100755 index 0000000000000000000000000000000000000000..a6d10975698e9b8a58291e867e283d755a4e16e0 --- /dev/null +++ b/tools/eval_wpmlgfs.pl @@ -0,0 +1,234 @@ +#!/usr/bin/perl + +$arg = shift; +while($arg){ + if($arg eq "-g"){$ref = shift;} + elsif($arg eq "-s"){$hyp = shift;} + elsif($arg eq "-tac"){$TAGGING_ACCURACY_PER_CATEGORY = 1;} + elsif($arg eq "-tcm"){$TAGGING_CONFUSION_MATRIX = 1;} + elsif($arg eq "-tec"){$TAGGING_ERRORS_PER_CATEGORY = 1;} + elsif($arg eq "-paf"){$PARSING_ACCURACY_PER_FUNCTION = 1;} + elsif($arg eq "-lcm"){$LABELING_CONFUSION_MATRIX = 1;} + elsif($arg eq "-acm"){$ATTACHEMENT_CONFUSION_MATRIX = 1;} + elsif($arg eq "-all"){ + $TAGGING_ACCURACY_PER_CATEGORY = 1; + $TAGGING_CONFUSION_MATRIX = 1; + $TAGGING_ERRORS_PER_CATEGORY = 1; + $PARSING_ACCURACY_PER_FUNCTION = 1; + $LABELING_CONFUSION_MATRIX = 1; + $ATTACHEMENT_CONFUSION_MATRIX = 1; + } + elsif($arg eq "-h"){ + print "usage eval07.pl OPTIONS 
-g <gold file> -s <system output>\n"; + print "OPTIONS :\n"; + print "\t-tac tagging accuracy per category\n"; + print "\t-tcm tagging confusion matrix\n"; + print "\t-tec tagging errors per category\n"; + print "\t-paf parsing accuracy per function\n"; + print "\t-lcm labeling confusion matrix\n"; + print "\t-acm attachment confusion matrix\n"; + print "\t-all all options\n"; + exit; +} +$arg = shift; +} + +open REF, '<', $ref or die "cannot open file $ref"; +open HYP, '<', $hyp or die "cannot open file $hyp"; + + +my $line_nb; +my $word_nb; +my $correct_pos_nb; +my $correct_gov_nb; +my $correct_gov_fct_nb; + +sub is_punctuation_ptb{ + my $pos = shift(@_); + + if($pos eq "``"){return 1;} + if($pos eq ","){return 1;} + if($pos eq ":"){return 1;} + if($pos eq "."){return 1;} + if($pos eq "''"){return 1;} + if($pos eq "-LRB-"){return 1;} + if($pos eq "-RRB-"){return 1;} + return 0; +} +sub is_punctuation_ftb{ + my $pos = shift(@_); + + if($pos eq "PCT"){return 1;} + if($pos eq "PONCT"){return 1;} + if($pos eq "ponctw"){return 1;} + if($pos eq "poncts"){return 1;} + return 0; +} + + +while(<REF>){ + chomp; + $line_nb++; + ($ref_form, $ref_pos, $ref_morpho, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/; + $_ = <HYP>; + chomp $_; +# print; + ($hyp_form, $hyp_pos, $hyp_morpho, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/; +# print "ref = $hyp_seg\n"; + + if($ref_seg){ $nb_ref_seg++;} + if($hyp_seg){ $nb_hyp_seg++;} + + if(($ref_seg) && ($hyp_seg)){ $nb_hyp_ref_seg++;} + +# if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){ + if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){ + if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";} + $word_nb++; + $pos_nb{$ref_pos}++; + $fct_nb{$ref_fct}++; + + if($ref_pos eq $hyp_pos){ + $correct_pos_total_nb++; + $correct_pos_nb{$ref_pos}++; + } + else{ + $false_pos_form{$ref_pos}{$ref_form}++; + $pos_confusion_matrix{$ref_pos}{$hyp_pos}++; + + +# print "$ref_form 
$ref_pos $hyp_pos\n"; +# print "$ref_pos $hyp_pos\n"; + } + + if($ref_lemma eq $hyp_lemma){ + $correct_lemma_total_nb++; + } + else{ +# print "$ref_form \t $ref_lemma \t $hyp_lemma\n"; + } + $ref_dist = $ref_gov - $ref_index; + $hyp_dist = $hyp_gov - $hyp_index; +# if($ref_gov eq $hyp_gov){ + if($ref_dist eq $hyp_dist){ + $correct_gov_nb++; + $correct_gov_total_nb++; + if($ref_fct eq $hyp_fct){ + $correct_gov_fct_total_nb++; + $correct_gov_fct_nb{$ref_fct}++; + } + else{ + $labeling_confusion_matrix{$ref_fct}{$hyp_fct}++; + } + } + else{ + $attachement_confusion_matrix{$ref_fct}{$hyp_fct}++; + } + + } + + $ref_index = ""; +} + + +close REF; +close HYP; + + +my $pos_acc = $word_nb ? $correct_pos_total_nb / $word_nb * 100 : 0; +my $lemma_acc = $word_nb ? $correct_lemma_total_nb / $word_nb * 100 : 0; +my $las = $word_nb ? $correct_gov_fct_total_nb / $word_nb * 100 : 0; +my $uas = $word_nb ? $correct_gov_total_nb / $word_nb * 100 : 0; + + + +my $seg_recall = $nb_ref_seg ? $nb_hyp_ref_seg / $nb_ref_seg : 0; +my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1); + + +printf(stderr "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); +printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb); + + + +if($TAGGING_ACCURACY_PER_CATEGORY){ + print "\n\n--------------------------------------------------------------------------------------\n"; + printf "TAGGING ACCURACY PER CATEGORY\n"; + printf "CAT\tFREQ\tACC\tIMPACT\n"; + foreach $pos (keys %correct_pos_nb){ + $acc = $correct_pos_nb{$pos} / $pos_nb{$pos}; + $freq = $pos_nb{$pos} / $word_nb; + if($word_nb == $correct_pos_total_nb){ + $impact = 0; + } + else{ + $impact = ($pos_nb{$pos} - $correct_pos_nb{$pos}) / ($word_nb - $correct_pos_total_nb); + } + printf("%s\t%6.2f\t%6.2f\t%6.2f\n", $pos, $freq*100, $acc*100, $impact*100); + } +} + +if($TAGGING_CONFUSION_MATRIX){ + print 
"\n\n--------------------------------------------------------------------------------------\n"; + print "TAGGING CONFUSION MATRIX\n"; + foreach $ref_pos (keys %pos_confusion_matrix){ + $pos_error_nb = $pos_nb{$ref_pos} - $correct_pos_nb{$ref_pos}; + print "$ref_pos ($pos_error_nb) :"; + foreach $hyp_pos (keys %{$pos_confusion_matrix{$ref_pos}}){ + print "\t$hyp_pos ($pos_confusion_matrix{$ref_pos}{$hyp_pos})"; + } + print "\n"; + } +} + + +if($TAGGING_ERRORS_PER_CATEGORY){ + print "\n\n--------------------------------------------------------------------------------------\n"; + print "TAGGING ERRORS PER CATEGORY\n"; + foreach $pos (keys %false_pos_form){ + print "\n$pos\n"; + foreach $form (keys %{$false_pos_form{$pos}}){ + print "\t$form $false_pos_form{$pos}{$form}\n"; + } + } +} + +if($PARSING_ACCURACY_PER_FUNCTION){ + print "\n\n--------------------------------------------------------------------------------------\n"; + printf "LABELED ATTACHMENT SCORE PER LABEL\n"; + printf "LABEL FREQ ACC IMPACT\n"; + foreach $fct (keys %correct_gov_fct_nb){ + $acc = $correct_gov_fct_nb{$fct} / $fct_nb{$fct}; + $freq = $fct_nb{$fct}/$word_nb; + $impact = ($word_nb == $correct_gov_fct_total_nb) ? 0 : ($fct_nb{$fct} - $correct_gov_fct_nb{$fct}) / ($word_nb - $correct_gov_fct_total_nb); + printf("%-10s%6.2f\t%6.2f\t%6.2f\n", $fct, $freq*100, $acc*100, $impact*100); + } +} + +if($ATTACHEMENT_CONFUSION_MATRIX){ + print "\n\n--------------------------------------------------------------------------------------\n"; + printf "ATTACHEMENT CONFUSION MATRIX\n"; + foreach $ref_fct (keys %attachement_confusion_matrix){ + $attachement_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct}; + print "$ref_fct ($attachement_error_nb) :"; + foreach $hyp_fct (keys %{$attachement_confusion_matrix{$ref_fct}}){ + print "\t$hyp_fct ($attachement_confusion_matrix{$ref_fct}{$hyp_fct})"; + } + print "\n"; + } + +} + +if($LABELING_CONFUSION_MATRIX){ + print 
"\n\n--------------------------------------------------------------------------------------\n"; + printf "LABELING CONFUSION MATRIX\n"; + foreach $ref_fct (keys %labeling_confusion_matrix){ + $fct_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct}; + print "$ref_fct ($fct_error_nb) :"; + foreach $hyp_fct (keys %{$labeling_confusion_matrix{$ref_fct}}){ + print "\t$hyp_fct ($labeling_confusion_matrix{$ref_fct}{$hyp_fct})"; + } + print "\n"; + } + +} diff --git a/tools/fplm2fP_ud.pl b/tools/fplm2fP_ud.pl new file mode 100755 index 0000000000000000000000000000000000000000..0758ed5ef5224f8f105aa1edb7dde3c1c34372e3 --- /dev/null +++ b/tools/fplm2fP_ud.pl @@ -0,0 +1,58 @@ +#!/usr/bin/perl + + +$postag{"ADJ"} = 1; +$postag{"ADP"} = 1; +$postag{"ADV"} = 1; +$postag{"AUX"} = 1; +$postag{"CCONJ"} = 1; +$postag{"DET"} = 1; +$postag{"INTJ"} = 1; +$postag{"NOUN"} = 1; +$postag{"NUM"} = 1; +$postag{"PART"} = 1; +$postag{"PRON"} = 1; +$postag{"PROPN"} = 1; +$postag{"PUNCT"} = 1; +$postag{"SCONJ"} = 1; +$postag{"SYM"} = 1; +$postag{"VERB"} = 1; +$postag{"X"} = 1; + +while(<>){ + ($form, $pos, $lemma, $morpho) = split /\t/; + if($postag{$pos}){ + $h_form2pos{$form}{$pos} = 1; + $h_pos{$pos} += 1; + } +} + +$nbelem = keys %h_form2pos; +print "$nbelem\n"; + +$nbelem = keys %h_pos; +print "$nbelem\n"; +$first = 1; +foreach $pos (keys %h_pos){ + if($first){ + $first = 0; + } + else{ + print "\t"; + } + print $pos; +} +print "\n"; + +foreach $form (keys %h_form2pos){ + print "$form\t"; + foreach $pos (keys %h_pos){ + if($h_form2pos{$form}{$pos}){ + print "1"; + } + else{ + print "0"; + } + } + print "\n"; +}