From f21d8b995a314bb2d2abff263bc0d3b7eb71af87 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Tue, 11 Apr 2017 08:55:26 +0200 Subject: [PATCH] setting up architecture for universal dependencies --- UD/template/maca_trans_tagger/Makefile | 6 + UD/ud_template.tgz | Bin 1379 -> 1379 bytes fm/maca_trans_parser.fm | 56 +++++ fm/maca_trans_parser_fann.fm | 25 ++ fm/maca_trans_tagger.fm | 37 +++ fm/maca_trans_tagger_fann.fm | 21 ++ makefiles/maca_trans_tagger.makefile | 9 +- tools/conllu2mcf.c | 158 ++++++++++++ tools/eval_mcf.pl | 324 +++++++++++++++++++++++++ tools/eval_wpmlgfs.pl | 234 ++++++++++++++++++ tools/fplm2fP_ud.pl | 58 +++++ 11 files changed, 927 insertions(+), 1 deletion(-) create mode 100644 fm/maca_trans_parser.fm create mode 100644 fm/maca_trans_parser_fann.fm create mode 100644 fm/maca_trans_tagger.fm create mode 100644 fm/maca_trans_tagger_fann.fm create mode 100644 tools/conllu2mcf.c create mode 100755 tools/eval_mcf.pl create mode 100755 tools/eval_wpmlgfs.pl create mode 100755 tools/fplm2fP_ud.pl diff --git a/UD/template/maca_trans_tagger/Makefile b/UD/template/maca_trans_tagger/Makefile index 884cd69..b4a2855 100644 --- a/UD/template/maca_trans_tagger/Makefile +++ b/UD/template/maca_trans_tagger/Makefile @@ -3,11 +3,15 @@ MCF_DEV=../data/treebank/dev.mcf MCF_TEST=../data/treebank/test.mcf CFF_TRAIN=train.cff +CFF_FANN_TRAIN=train.fann.cff +FANN_TRAIN=train.fann CFF_CUTOFF_TRAIN=train.cutoff.cff PERCEPTRON_ITERATIONS=9 CFF_CUTOFF=1 FEATURES_MODEL_FILENAME=../../fm/maca_trans_tagger.fm +FEATURES_MODEL_FANN_FILENAME=../../fm/maca_trans_tagger_fann.fm VOCABS_FILENAME=maca_trans_tagger.vocab +VOCABS_FANN_FILENAME=maca_trans_parser_fann.vocab MCD_FILENAME=../../mcd/maca_trans_tagger.mcd MODEL_FILENAME=maca_trans_tagger.model NUMBER_OF_SENTENCES=10000000 @@ -18,3 +22,5 @@ FORM_POS_FILENAME=../data/morpho-lexicon/fP #include ./maca_trans_tagger.makefile include ../../makefiles/maca_trans_tagger.makefile + + diff --git 
a/UD/ud_template.tgz b/UD/ud_template.tgz index 002ed8421208367c50605e9025a7dae8e2cb125d..f4d2ffaea32cb3e490c6e2457c01a76f60ce6928 100644 GIT binary patch literal 1379 zcmb2|=3t1bc@x3F{5C2#f0c*Gx7Sa@C*0uw%6!WDrkdL6(41MKrd@aYTeQARVv;&~ zZ`uC3^tN!v#&=9_?^^rM%9v@KKksm5+Br-AZRRgbc%&RAvHUpc!8G|_&xJ&bs700Y z5+gS{8}lf0$p&RqB?>HGoG;_CDcR!L?Akqg;n)6kCuRR|yUZ34_<zv}j~<Or0sm#r zC;Z*cxTL<k{Ko$qfd%ceWB>kp;QZd+pZV`+ox1<G532r+u0DRX_wBZImJ6CLR>!{l z`@G>Ft7nR}O6+x}u7<DkFYnLVGh^D@8Kq|fm(`#6>G5C4`?5^)m-81R{^u^Ro@N>; zZ?Dt-_n*QKXSKpxLH`b#%;$Ok)Pr$xfsB|&?gqKnlbc`W)(OPsYTBP^Gv9uZVQ*vU zf&lZw-+rw)-Kmh1|9H+eHOc;WZO2N_M9i(b^=+})qs{eG@3VW{oVZNdileUMKkJGA zGCLR&{^c`2|F!=1XLgOx|6^YKSDltx;nwur-ce-Z|9ocN-~ZPwTeF4RDf+Xin51_9 zQ6X0qA>B_sb_Vq!Z9lUt;-CEM|Fd7i^Tj{?g#Y)sCj6US_}kFyuf1;TzxXAteE0ra zd}#k$FRidExUy{jwD(gJAKm^Rx8m~elV0KO4=x@K;oN!fZVtQs_jR9hWPd-f7TB#P z%B{%M5us|A@#EP=_OoJ)bL-cM{;In%qwr7X_4crCeV)<-re<yLW~|Hp@ojfyMhlmn z+<(@0^N;@9&zth~K8xs-AN6g|HvbZja69mCKktF$@;43Vm5o>~z76`y(BqO<_D*is zt6ywu%WJo>>l`e66%%jqMD}B^l6ke!jwfFNYGi|MO^*GY|7J#8_;WskjsMOI{`$YY zZH4?p=a2Sc$Non!#&8<^-`+6gpY(3lX?OoiOgsO<zW<fI!GH7qlS!+NZQq`K^1*BI zjYnz<zPz2z611S+V9(-x=6uK9G8U}*pYHm>WX1o44gd4mEB{{iXHNZY?>5EgmpDt~ z@qhbyS1j(7ODXzfXf@?TW1)pch{Ou##Ukv#Ixecr(mW#&DOP1RF@5u9w$mRbUAq`H zJKcKTpSh`uo!zyat(#;txpOV&zvQ2_Y0B32pE;%?L63PYlNO!0Rkd<WNvU1fvJR&7 zO;W6ey5$q9jekv>{hIap`U|1%)l7fopDX|Tt@U!!{3oT`4d<PR_>j}2{kSdGNctrI z+jrZRISXyx`YSf_)#uKAL20v$q%~J73h(Q75jo?Vt0Da@Hq+ZR?4rJb%F57_4<FR9 zTEzW6I^oj9d>i|EtN+{wf6kv)@Nd27ldt#f{ui&Aa?yT4J=^Dc=^v;5)X%(o+Mjj$ z&;R`<|BdAz{eN)wNBAH4lC_8aKUDeO_+LM`ypGG`(RF_Ai2<#Lo!+S!%+wF9HH@FT z_Hj~3qNBjwA9o5edoOw}JoHfMHf#IZ)r%iq;%^c9f8MA@=D%pB;I;o7OxFF+XMbJz z^pX9H)k?vv4I2tnLQO8oI+{0heP57rX<fznzS<rZAAU}Oz@G^l8ty%K7S=9%L0sd; z>8nC)rA@7$RxUcY;lqYi7LEs8Vy7(WnbfGFV`3?}k9YIJ#M`o-HM@6)KHWb>PBNFN z_M1nROI$%&;j_<C+Y45HSnS?*b;7A+qnIV#e;gc?IuFF<`vtqr40+%)IVaaIr+C9A zr@}?Te@v$Q-n5lpWvhB()#eJ#z*Ac*G{cup+SIBvb9+ef!#tsd_6xlin5};wcEmS; z|M<TDfBMQ?RniKl`$&ohxo&e)YVQBf9DJzmz=VWp-`?e{6+g?ST3j5|Y5L^W<!uE! 
qKi(1QN>LJJmwf7^eS(*HyQJN}od0c&=-{LEV|j;9mm3)Z7#IND?7`Cj literal 1379 zcmb2|=3r=>{yc(#`E68e{wgn#uV2rsSJ=q^mGSB9J8Ejv7vC`1z9R5VtxCwkc99Dj z-^~7gEl*2fa6PEzaQ^rjlY>^(ZIa*9bkn}oeLEDlI;}upmDXbI^`~mSd0lwq>i1No zQ=Db#(T_IkPw2X>t4`wh9h;Ibxh(bfx!v1$=Y(JT*PWE=UUrwUVCjF+*6>Q#i4*?2 zb}s*y&vfJe!K&RqH>O?Wn*RFL&j-Qh_aAZm@6A;EZ}vgU|GTB_H@vyJYE`uY`{U=o zRy~hz`8Pe(HN!9_R9J)QYyG+ZZGT_wnD;)UwBP8z^4a76!dd%-E8o;lT>LMe>FvU_ ztZN?<ue~p;Znu{a`YkG*EBpK=UwpX^57&M11(&P37n<I(diaSwW47_z#5?<Usxp^U zy_<IQZjSz&M|#x^dkS8@59gcQBX{KBR_jiy?ESAMnSHpr|I~cuH#v`9&WdBPU;5wZ zQoUdv!-{|AjmiJR&;DVb@^Qb!=lxn7dycGr@z+00<kWxP1!@2CXaBlmlf0$qp3vll z|3Y#CRg_XDxqou}`Z&uXf7}1bZ~hzp*Iw22{lC%E|85O`<N4pJYdo#@>-`!3P(<+U zf0rNbKlgJsd|Pa`FFxaYmHpFi^~;Yv{+p2}$^7H$;l*;<&KqLB{=3ccuY33FL+3BN z*%X@8!rHLH>(e4Z%iSDiyBQ|duM;h_&$3bcH|1;Kl}o(Nq8x9Q=Kk~(xH0kj!<_7i zP3PzTclsr+@!z~L`2Tf9R@Fb}i!ZJGpV~NG^S?Rcms@iMK78!tVR&m6&HKRPOp$Dk zKxnvm#@5fDog_H-#XtS?Nb&sv^J$TJGY+dQZ$5v0;u72HssE4dOKuNc*TD5Uzi-C> z``MG~di>}9=X~+Uf5O9Q4X^5%PyE=wEjHoizkcy0!hfe1{AIuT|Ciw`JE@cJ-Ub={ z_~fR;D_`~Nn>UlS!L3^l&)vFlJ66}IK==RQ<MC4p|AjxYocO6;_{8u1RgxOF|9|YN z`mkTgQ|a9Q69QBI&u%c3h-~-q``5##UUb1y#z@O%#o@&w@?RPccHGfCC$LlRU*3}5 z<L<ocWS6aNZnd<0yQBZwx^CTb-&lid%`Pl|mSeO{&Zl<SO#aCq_voh|d)=L|B>P#? zvh%vhMpxuFvh<lY%Ove`)c>ARa#!Ix_uc<p>%`)O-~2a`%((wNbNV0Od)K^w&-CQ! 
z&nrm0v*Ywhr%%r8uctmXPS|6x@3nEZ(WYm=Zd^IeTzl{N<8;9Zhwq$e^6RgC{><k( z&mZnDl4Wt9iVAF)tKxo7R(!npL|x_2pug_F?L80tI)5_czqI4M|7Ty@t55!A9~cnA z_j<p_=lb3UufOh(cz8bhXMCjYbNi)F+coqrKDyDt@w(FY%x&e;J!=jfdBG)m&ht%% zk@1?BDRn0On{R*BWO(Z`;r!W&9v?(k)!u47^)0`Bmbblu*N^xIOI`kp`Z@l#Z(RKK z|Jek?J-dGJKV3D$hfzUG=A=mP7WR&{1s~-$`P}~c;pq3WBL_G)GAcZ6Zs#niGT$!t zs=;32u;03A9|F`PGUjVa^>FjAH)Ur~edRG}k_xkANXDk^%KWL$kGEN~)$HDBx~2XL zpR-uwdf7HvgLDhalQnU##Qa5Lt$bp5SK2Q<EExZUy-DmP=bNX`-pH*v^XkCQ%xz}r z+wSaI?&i5j_@7D6hh3kx@$60vsM=hirSde&W@>j>=1r%GXRDX(EHGVp@qUNUji<NU zy;VehCSLE2v-_~hK}xLrrS4=WEum~4z8_EQE4;7pvk9HLbWir|*PI&<Qu5Ae^|r-S un~9w>mveWT$XobPadC_U>(0arMvvF;pLJgY1N=+*!~f=w`h11}1_l7%e$n0l diff --git a/fm/maca_trans_parser.fm b/fm/maca_trans_parser.fm new file mode 100644 index 0000000..6e18e5e --- /dev/null +++ b/fm/maca_trans_parser.fm @@ -0,0 +1,56 @@ +#b0m +#s0m +#b0m s0m +#s0l s0m b0l b0m + + +b0g +s0g s0p +s0g b0p +s0g +s0sf +#s1g +#s1sf +s0l +s0p +s1p +s2p +b0l +b0p +b1l +b1p +b2p +b3p +ldep_s0r +rdep_s0r +ldep_s1r +rdep_s1r +ldep_b0r +rdep_b0r +s0l b0l + +s0p b0p +b0p b0l +b0p ldep_b0r +s1p b1p +b1p b2p +s0p b0p b0l +s0p ldep_s0r rdep_s0r +s0p s0l b0p +s0p b0p dist_s0_b0 +s1p s0p b0p +b0p b1p b2p +b1p b2p b3p +s0p b0p b1p +b1p b1l b2p b3p +b1p b1l b2p b2l b3p +t1 +#t2 +#t3 +#t4 +t1 t2 +#t2 t3 +t1 t2 t3 + +bm1p +bm2p diff --git a/fm/maca_trans_parser_fann.fm b/fm/maca_trans_parser_fann.fm new file mode 100644 index 0000000..3fdb2ce --- /dev/null +++ b/fm/maca_trans_parser_fann.fm @@ -0,0 +1,25 @@ +b0l +b0p +b1l +b1p +b2l +b2p +b3p +bm1p +bm2p +dist_s0_b0 +ldep_b0r +ldep_s0r +ldep_s1r +rdep_b0r +rdep_s0r +rdep_s1r +s0g +s0l +s0p +s0sf +s1p +s2p +t1 +t2 +t3 diff --git a/fm/maca_trans_tagger.fm b/fm/maca_trans_tagger.fm new file mode 100644 index 0000000..50af67e --- /dev/null +++ b/fm/maca_trans_tagger.fm @@ -0,0 +1,37 @@ +b0U1 +b0len + +b0sgn +b1sgn + +b1f +b0f +bm1f +bm2f + +bm1p +bm2p +bm3p +bm2p bm1p +bm2p bm3p +bm1p b0sgn + +b0s1 +b0s2 +b0s3 +b0s4 +b0s5 +b0s1 b0s2 +b0s1 b0s2 b0s3 +b0s1 
b0s2 b0s3 b0s4 + +b0p1 +b0p2 +b0p3 +b0p4 +b0p5 +b0p1 b0p2 +b0p1 b0p2 b0p3 +b0p1 b0p2 b0p3 b0p4 + + diff --git a/fm/maca_trans_tagger_fann.fm b/fm/maca_trans_tagger_fann.fm new file mode 100644 index 0000000..3f5e2c2 --- /dev/null +++ b/fm/maca_trans_tagger_fann.fm @@ -0,0 +1,21 @@ +b0f +b0len +b0p1 +b0p2 +b0p3 +b0p4 +b0p5 +b0s1 +b0s2 +b0s3 +b0s4 +b0s5 +b0sgn +b0U1 +b1f +b1sgn +bm1f +bm1p +bm2f +bm2p +bm3p diff --git a/makefiles/maca_trans_tagger.makefile b/makefiles/maca_trans_tagger.makefile index f32b03a..ce3d13e 100644 --- a/makefiles/maca_trans_tagger.makefile +++ b/makefiles/maca_trans_tagger.makefile @@ -2,14 +2,21 @@ ## compile ##----------------------------------------------------------------------- -compile: $(MODEL_FILENAME) +compile: $(MODEL_FILENAME) $(FANN_TRAIN) $(CFF_TRAIN): $(MCF_TRAIN) maca_trans_tagger_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FILENAME) --vocabs $(VOCABS_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) -P $(FORM_POS_FILENAME) +$(CFF_FANN_TRAIN): $(MCF_TRAIN) + maca_trans_tagger_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FANN_FILENAME) --vocabs $(VOCABS_FANN_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) -P $(FORM_POS_FILENAME) + + $(CFF_CUTOFF_TRAIN): $(CFF_TRAIN) cff_cutoff --input $< --vocabs $(VOCABS_FILENAME) --cutoff $(CFF_CUTOFF) > $@ +$(FANN_TRAIN): $(CFF_FANN_TRAIN) + cff2fann --vocabs $(VOCABS_FANN_FILENAME) --cff $< --feat_model $(FEATURES_MODEL_FANN_FILENAME) -C $(MCD_FILENAME) > $@ + $(MODEL_FILENAME): $(CFF_CUTOFF_TRAIN) #$(MODEL_FILENAME): $(CFF_TRAIN) perceptron_train --cff $< --model $(MODEL_FILENAME) -n $(PERCEPTRON_ITERATIONS) diff --git a/tools/conllu2mcf.c b/tools/conllu2mcf.c new file mode 100644 index 0000000..a2fc73b --- /dev/null +++ b/tools/conllu2mcf.c @@ -0,0 +1,158 @@ +#include<stdio.h> +#include<stdlib.h> +#include<string.h> +#include<strings.h> +#include<math.h> +#include<getopt.h> 
+#include"conll_lib.h"
+#include"hash_str.h"
+/* Number of output columns that can be configured with the options -1 .. -7. */
+#define NB_COL 7
+/* Command line settings; columns[i] is the letter chosen for output column i+1 ('0' when unset). */
+typedef struct options
+{
+  FILE * fd_parses; // parser output
+  int verbose_level;
+  int snum;
+  char *filename;
+  char columns[NB_COL];
+} options;
+
+/*---------------------------------------------------------------------------------*/
+
+options op;
+/* Print the file name, the verbosity level and the sentence limit on stderr. */
+void print_options(options *op)
+{
+  fprintf(stderr, "file name = %s\n", op->filename ? op->filename : "(null)"); /* filename stays NULL without -f; never hand NULL to %s */
+  fprintf(stderr, "verbose level = %d\n", op->verbose_level);
+  fprintf(stderr, "maximum number of sentences to process = %d\n", op->snum);
+}
+/* Restore the defaults: read from stdin, verbosity 0, 100000000 sentences, every column unset ('0'). */
+void reset_options(options * op)
+{
+  int i;
+  op->filename = NULL;
+  op->fd_parses = stdin;
+  op->verbose_level = 0;
+  op->snum = 100000000;
+  for(i=0; i < NB_COL; i++)
+    op->columns[i] = '0';
+}
+/* Print the usage text on stderr. */
+/*---------------------------------------------------------------------------------*/
+void print_help_message(char *program_name)
+{
+  fprintf(stderr, "%s usage: %s [options]\n", program_name, program_name);
+  fprintf(stderr, "OPTIONS :\n");
+  fprintf(stderr, " -f <file> : hypothesis conll file\n");
+  fprintf(stderr, " -n <int> : process n sentences (default is 100 000 000)\n");
+  fprintf(stderr, " -v 1|2|3 : verbosity level\n");
+  fprintf(stderr, " -h : print this message\n");
+
+  fprintf(stderr, " -1 : content of column 1 in the mcf file produced\n");
+  fprintf(stderr, " -2 : content of column 2 in the mcf file produced\n");
+  fprintf(stderr, " -3 : content of column 3 in the mcf file produced\n");
+  fprintf(stderr, " -4 : content of column 4 in the mcf file produced\n");
+  fprintf(stderr, " -5 : content of column 5 in the mcf file produced\n");
+  fprintf(stderr, " -6 : content of column 6 in the mcf file produced\n");
+  fprintf(stderr, " -7 : content of column 7 in the mcf file produced\n");
+  fprintf(stderr, " : values of options -1 to -7 must be one of\n");
+  fprintf(stderr, " : I for id\n");
+  fprintf(stderr, " : W for form\n");
+  fprintf(stderr, " : L for lemma\n");
+  fprintf(stderr, " : C for coarse part of speech\n");
+  fprintf(stderr, " : P for part of speech\n");
+  fprintf(stderr, " : F for features\n");
+  fprintf(stderr, " : H for head\n");
+  fprintf(stderr, " : D for deprel\n");
+
+}
+
+
+
+
+/*---------------------------------------------------------------------------------*/
+/* Fill *op from the command line; exits on -h and when the file given with -f cannot be opened. */
+void parse_options(int argc, char *argv[], options * op)
+{
+  int c; /* getopt() returns an int: stored in a plain char, the comparison with -1 never succeeds where char is unsigned */
+
+  reset_options(op);
+  /*
+  if(argc ==1){
+    print_help_message(argv[0]);
+    exit(1);
+    }*/
+
+  while ((c = getopt (argc, argv, "hIWLCPFHDf:n:v:1:2:3:4:5:6:7:8:9:")) != -1)
+    switch (c)
+      {
+      case 'h':
+        print_help_message(argv[0]);
+        exit(0);
+      case 'f':
+        op->filename = strdup(optarg);
+        if((op->fd_parses = fopen(op->filename, "r")) == NULL){
+          fprintf(stderr, "cannot open hypothesis file %s : aborting\n", op->filename);
+          exit(1);
+        }
+        break;
+      case '1':
+        op->columns[0] = optarg[0];
+        break;
+      case '2':
+        op->columns[1] = optarg[0];
+        break;
+      case '3':
+        op->columns[2] = optarg[0];
+        break;
+      case '4':
+        op->columns[3] = optarg[0];
+        break;
+      case '5':
+        op->columns[4] = optarg[0];
+        break;
+      case '6':
+        op->columns[5] = optarg[0];
+        break;
+      case '7':
+        op->columns[6] = optarg[0];
+        break;
+      case 'n':
+        op->snum = atoi(optarg);
+        break;
+      case 'v':
+        op->verbose_level = atoi(optarg);
+        break;
+      }
+
+  /*  if (op->fd_parses == NULL){
+    fprintf(stderr, "error : cannot open parse file: aborting\n");
+    exit(1);
+    }*/
+}
+/* Read up to op.snum sentences from op.fd_parses, compute the relative head indices and print each sentence with print_sentence_mcf3(). */
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+int main(int argc, char *argv[])
+{
+  sentence *s = allocate_sentence();
+  int snum = 0;
+  int res;
+  parse_options(argc, argv, &op);
+
+  print_options(&op);
+
+
+  for(res = load_sentence(op.fd_parses, s); res && (snum < op.snum); res = load_sentence(op.fd_parses, s)){
+    s->num = snum;
+    snum++;
+    compute_relative_index_of_heads(s);
+    print_sentence_mcf3(s, op.columns, NB_COL);
+  }
+  if(op.filename)
+    fclose(op.fd_parses);
+
free_sentence(s);
+  return 0;
+}
diff --git a/tools/eval_mcf.pl b/tools/eval_mcf.pl
new file mode 100755
index 0000000..f8fb57b
--- /dev/null
+++ b/tools/eval_mcf.pl
@@ -0,0 +1,324 @@
+#!/usr/bin/perl
+# Compare a hypothesis file (-s) with a reference file (-g), both tab separated with one word per line, and print accuracy scores.
+$arg = shift;
+while(defined $arg){
+    if($arg eq "-g"){$ref = shift;}
+    elsif($arg eq "-s"){$hyp = shift;}
+    elsif($arg eq "-G"){$ref_mcd = shift;}
+    elsif($arg eq "-S"){$hyp_mcd = shift;}
+    elsif($arg eq "-tac"){$TAGGING_ACCURACY_PER_CATEGORY = 1;}
+    elsif($arg eq "-tcm"){$TAGGING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-tec"){$TAGGING_ERRORS_PER_CATEGORY = 1;}
+    elsif($arg eq "-paf"){$PARSING_ACCURACY_PER_FUNCTION = 1;}
+    elsif($arg eq "-lcm"){$LABELING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-acm"){$ATTACHEMENT_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-all"){
+        $TAGGING_ACCURACY_PER_CATEGORY = 1;
+        $TAGGING_CONFUSION_MATRIX = 1;
+        $TAGGING_ERRORS_PER_CATEGORY = 1;
+        $PARSING_ACCURACY_PER_FUNCTION = 1;
+        $LABELING_CONFUSION_MATRIX = 1;
+        $ATTACHEMENT_CONFUSION_MATRIX = 1;
+    }
+    elsif($arg eq "-h"){
+        print "usage eval_mcf.pl OPTIONS -g <reference file> -s <system output>\n";
+        print "OPTIONS :\n";
+        print "\t-tac tagging accuracy per category\n";
+        print "\t-tcm tagging confusion matrix\n";
+        print "\t-tec tagging errors per category\n";
+        print "\t-paf parsing accuracy per function\n";
+        print "\t-lcm labeling confusion matrix\n";
+        print "\t-acm attachment confusion matrix\n";
+        print "\t-all all options\n";
+        exit;
+    }
+    $arg = shift;
+}
+
+# determine the column in the reference file (defaults below; with -G the position of each letter W P L G F S M in the string is the column index)
+
+$ref_form_col = 0;
+$ref_pos_col = 1;
+$ref_lemma_col = 2;
+$ref_gov_col = 3;
+$ref_fct_col = 4;
+$ref_seg_col = 5;
+$ref_morph_col = 10;
+
+if($ref_mcd)
+{
+    for($i=0; $i<length $ref_mcd; $i++){
+        $car = substr($ref_mcd, $i, 1);
+#        print "car $i = $car\n";
+        if($car eq 'W'){$ref_form_col = $i; next;}
+        if($car eq 'P'){$ref_pos_col = $i; next;}
+        if($car eq 'L'){$ref_lemma_col = $i; next;}
+        if($car eq 'G'){$ref_gov_col = $i; next;}
+        if($car eq 'F'){$ref_fct_col = $i; next;}
+        if($car eq 'S'){$ref_seg_col = $i; next;}
+        if($car eq 'M'){$ref_morph_col = $i; next;}
+    }
+}
+
+# determine the column in the hypothesis file (same scheme, driven by -S)
+
+$hyp_form_col = 0;
+$hyp_pos_col = 1;
+$hyp_lemma_col = 2;
+$hyp_gov_col = 3;
+$hyp_fct_col = 4;
+$hyp_seg_col = 5;
+$hyp_morph_col = 10;
+
+if($hyp_mcd)
+{
+    for($i=0; $i<length $hyp_mcd; $i++){
+        $car = substr($hyp_mcd, $i, 1);
+#        print "car $i = $car\n";
+        if($car eq 'W'){$hyp_form_col = $i; next;}
+        if($car eq 'P'){$hyp_pos_col = $i; next;}
+        if($car eq 'L'){$hyp_lemma_col = $i; next;}
+        if($car eq 'G'){$hyp_gov_col = $i; next;}
+        if($car eq 'F'){$hyp_fct_col = $i; next;}
+        if($car eq 'S'){$hyp_seg_col = $i; next;}
+        if($car eq 'M'){$hyp_morph_col = $i; next;}
+    }
+}
+
+# three-argument open: a file name is never interpreted as a mode or a pipe
+open(REF, '<', $ref) or die "cannot open file $ref";
+open(HYP, '<', $hyp) or die "cannot open file $hyp";
+
+
+
+
+my $line_nb;
+my $word_nb;
+my $correct_pos_nb;
+my $correct_gov_nb;
+my $correct_gov_fct_nb;
+# each is_punctuation_* sub returns 1 when its argument is one of the tags it lists, 0 otherwise
+sub is_punctuation_ptb{
+    my $pos = shift(@_);
+
+    if($pos eq "``"){return 1;}
+    if($pos eq ","){return 1;}
+    if($pos eq ":"){return 1;}
+    if($pos eq "."){return 1;}
+    if($pos eq "''"){return 1;}
+    if($pos eq "-LRB-"){return 1;}
+    if($pos eq "-RRB-"){return 1;}
+    return 0;
+}
+sub is_punctuation_ftb{
+    my $pos = shift(@_);
+    # tags treated as punctuation: PCT PONCT ponctw poncts
+    if($pos eq "PCT"){return 1;}
+    if($pos eq "PONCT"){return 1;}
+    if($pos eq "ponctw"){return 1;}
+    if($pos eq "poncts"){return 1;}
+    return 0;
+}
+sub is_punctuation_ud{
+    my $pos = shift(@_);
+    # only referenced from a commented-out condition in the main loop below
+    if($pos eq "PUNCT"){return 1;}
+    return 0;
+}
+
+# read both files in lock step: line N of REF is compared with line N of HYP
+while(<REF>){
+    chomp;
+    $line_nb++;
+#    ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/;
+    @ref_array = split /\t/;
+    $column_nb = -1;
+    foreach $elt (@ref_array){
+        $column_nb++;
+        if($column_nb == $ref_form_col){$ref_form = $elt; next;}
+        if($column_nb == $ref_pos_col){$ref_pos = $elt; next;}
+        if($column_nb == $ref_lemma_col){$ref_lemma = $elt; next;}
+        if($column_nb == $ref_gov_col){$ref_gov = $elt; next;}
+        if($column_nb == $ref_fct_col){$ref_fct = $elt; next;}
+        if($column_nb == $ref_seg_col){$ref_seg = $elt; next;}
+        if($column_nb == $ref_morph_col){$ref_morph = $elt; next;}
+    }
+    defined($_ = <HYP>) or die "hypothesis file $hyp ends before reference file $ref (line $line_nb)\n";
+    chomp $_;
+#    print;
+#    ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/;
+
+    @hyp_array = split /\t/;
+    $column_nb = -1;
+    foreach $elt (@hyp_array){
+        $column_nb++;
+        if($column_nb == $hyp_form_col){$hyp_form = $elt; next;}
+        if($column_nb == $hyp_pos_col){$hyp_pos = $elt; next;}
+        if($column_nb == $hyp_lemma_col){$hyp_lemma = $elt; next;}
+        if($column_nb == $hyp_gov_col){$hyp_gov = $elt; next;}
+        if($column_nb == $hyp_fct_col){$hyp_fct = $elt; next;}
+        if($column_nb == $hyp_seg_col){$hyp_seg = $elt; next;}
+        if($column_nb == $hyp_morph_col){$hyp_morph = $elt; next;}
+    }
+
+
+
+#    print "ref = $hyp_seg\n";
+
+    if($ref_seg){ $nb_ref_seg++;}
+    if($hyp_seg){ $nb_hyp_seg++;}
+
+    if(($ref_seg) && ($hyp_seg)){ $nb_hyp_ref_seg++;}
+
+#    if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+    if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+#    if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos)) && (!is_punctuation_ud($ref_pos))){
+        if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";}
+        $word_nb++;
+        $pos_nb{$ref_pos}++;
+        $fct_nb{$ref_fct}++;
+
+        if($ref_pos eq $hyp_pos){
+            $correct_pos_total_nb++;
+            $correct_pos_nb{$ref_pos}++;
+        }
+        else{
+            $false_pos_form{$ref_pos}{$ref_form}++;
+            $pos_confusion_matrix{$ref_pos}{$hyp_pos}++;
+
+
+#            print "$ref_form $ref_pos $hyp_pos\n";
+#            print "$ref_pos $hyp_pos\n";
+        }
+
+        if($ref_lemma eq $hyp_lemma){
+            $correct_lemma_total_nb++;
+        }
+        else{
+#            print "$ref_form \t $ref_lemma \t $hyp_lemma\n";
+        }
+        $ref_dist = $ref_gov - $ref_index; # $ref_index only ever receives "" in this script, so this is the numeric value of $ref_gov
+        $hyp_dist = $hyp_gov - $hyp_index; # $hyp_index is never assigned in this script
+#        if($ref_gov eq $hyp_gov){
+        if($ref_dist eq $hyp_dist){
+            $correct_gov_nb++;
+            $correct_gov_total_nb++;
+            if($ref_fct eq $hyp_fct){
+                $correct_gov_fct_total_nb++;
+                $correct_gov_fct_nb{$ref_fct}++;
+            }
+            else{
+                $labeling_confusion_matrix{$ref_fct}{$hyp_fct}++;
+            }
+        }
+        else{
+            $attachement_confusion_matrix{$ref_fct}{$hyp_fct}++;
+        }
+
+    }
+
+    $ref_index = "";
+}
+
+
+close REF;
+close HYP;
+
+# global scores in percent; every ratio is reported as 0 when its denominator is 0 (for instance with empty input)
+my $pos_acc = $word_nb ? $correct_pos_total_nb / $word_nb * 100 : 0;
+my $lemma_acc = $word_nb ? $correct_lemma_total_nb / $word_nb * 100 : 0;
+my $las = $word_nb ? $correct_gov_fct_total_nb / $word_nb * 100 : 0;
+my $uas = $word_nb ? $correct_gov_total_nb / $word_nb * 100 : 0;
+
+
+# segmentation: a line counts as a boundary when its seg column is true; precision is divided by the real hypothesis count (no "+ 1")
+my $seg_recall = $nb_ref_seg ? $nb_hyp_ref_seg / $nb_ref_seg : 0;
+my $seg_precision = $nb_hyp_seg ? $nb_hyp_ref_seg / $nb_hyp_seg : 0;
+
+# readable summary on STDERR, tab separated record (prefixed with the hypothesis file name) on STDOUT
+printf(STDERR "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+printf(STDOUT "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+
+
+
+if($TAGGING_ACCURACY_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "TAGGING ACCURACY PER CATEGORY\n";
+    printf "CAT\tFREQ\tACC\tIMPACT\n";
+    foreach $pos (sort keys %pos_nb){ # every reference category, including those that were never tagged correctly
+        $acc = $correct_pos_nb{$pos} / $pos_nb{$pos};
+        $freq = $pos_nb{$pos} / $word_nb;
+        if($word_nb == $correct_pos_total_nb){
+            $impact = 0;
+        }
+        else{
+            $impact = ($pos_nb{$pos} - $correct_pos_nb{$pos}) / ($word_nb - $correct_pos_total_nb);
+        }
+        printf("%s\t%6.2f\t%6.2f\t%6.2f\n", $pos, $freq*100, $acc*100, $impact*100);
+    }
+}
+
+if($TAGGING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING CONFUSION MATRIX\n";
+    foreach $ref_pos (keys %pos_confusion_matrix){
+        $pos_error_nb = $pos_nb{$ref_pos} - $correct_pos_nb{$ref_pos};
+        print "$ref_pos ($pos_error_nb) :";
+        foreach $hyp_pos (keys %{$pos_confusion_matrix{$ref_pos}}){
+            print "\t$hyp_pos ($pos_confusion_matrix{$ref_pos}{$hyp_pos})";
+        }
+        print "\n";
+    }
+}
+
+
+if($TAGGING_ERRORS_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING ERRORS PER CATEGORY\n";
+    foreach $pos (keys %false_pos_form){
+        print "\n$pos\n";
+        foreach $form (keys %{$false_pos_form{$pos}}){
+            print "\t$form $false_pos_form{$pos}{$form}\n";
+        }
+    }
+}
+
+if($PARSING_ACCURACY_PER_FUNCTION){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "LABELED ATTACHMENT SCORE PER LABEL\n";
+    printf "LABEL FREQ ACC IMPACT\n";
+    foreach $fct (sort keys %fct_nb){ # every reference label, including those that were never attached and labeled correctly
+        $acc = $correct_gov_fct_nb{$fct} / $fct_nb{$fct};
+        $freq = $fct_nb{$fct}/$word_nb;
+        $impact = ($word_nb == $correct_gov_fct_total_nb) ? 0 : ($fct_nb{$fct} - $correct_gov_fct_nb{$fct}) / ($word_nb - $correct_gov_fct_total_nb); # the total is only read here, never incremented
+        printf("%-10s%6.2f\t%6.2f\t%6.2f\n", $fct, $freq*100, $acc*100, $impact*100);
+    }
+}
+
+if($ATTACHEMENT_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "ATTACHEMENT CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %attachement_confusion_matrix){
+        $attachement_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+        print "$ref_fct ($attachement_error_nb) :";
+        foreach $hyp_fct (keys %{$attachement_confusion_matrix{$ref_fct}}){
+            print "\t$hyp_fct ($attachement_confusion_matrix{$ref_fct}{$hyp_fct})";
+        }
+        print "\n";
+    }
+
+}
+
+if($LABELING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "LABELING CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %labeling_confusion_matrix){
+        $fct_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+        print "$ref_fct ($fct_error_nb) :";
+        foreach $hyp_fct (keys %{$labeling_confusion_matrix{$ref_fct}}){
+            print "\t$hyp_fct ($labeling_confusion_matrix{$ref_fct}{$hyp_fct})";
+        }
+        print "\n";
+    }
+
+}
diff --git a/tools/eval_wpmlgfs.pl b/tools/eval_wpmlgfs.pl
new file mode 100755
index 0000000..a6d1097
--- 
/dev/null
+++ b/tools/eval_wpmlgfs.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/perl
+# Compare a hypothesis file (-s) with a reference file (-g); each line holds the tab separated fields form, pos, morpho, lemma, gov, fct, seg.
+$arg = shift;
+while(defined $arg){
+    if($arg eq "-g"){$ref = shift;}
+    elsif($arg eq "-s"){$hyp = shift;}
+    elsif($arg eq "-tac"){$TAGGING_ACCURACY_PER_CATEGORY = 1;}
+    elsif($arg eq "-tcm"){$TAGGING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-tec"){$TAGGING_ERRORS_PER_CATEGORY = 1;}
+    elsif($arg eq "-paf"){$PARSING_ACCURACY_PER_FUNCTION = 1;}
+    elsif($arg eq "-lcm"){$LABELING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-acm"){$ATTACHEMENT_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-all"){
+        $TAGGING_ACCURACY_PER_CATEGORY = 1;
+        $TAGGING_CONFUSION_MATRIX = 1;
+        $TAGGING_ERRORS_PER_CATEGORY = 1;
+        $PARSING_ACCURACY_PER_FUNCTION = 1;
+        $LABELING_CONFUSION_MATRIX = 1;
+        $ATTACHEMENT_CONFUSION_MATRIX = 1;
+    }
+    elsif($arg eq "-h"){
+        print "usage eval_wpmlgfs.pl OPTIONS -g <gold file> -s <system output>\n";
+        print "OPTIONS :\n";
+        print "\t-tac tagging accuracy per category\n";
+        print "\t-tcm tagging confusion matrix\n";
+        print "\t-tec tagging errors per category\n";
+        print "\t-paf parsing accuracy per function\n";
+        print "\t-lcm labeling confusion matrix\n";
+        print "\t-acm attachment confusion matrix\n";
+        print "\t-all all options\n";
+        exit;
+    }
+    $arg = shift;
+}
+# three-argument open: a file name is never interpreted as a mode or a pipe
+open(REF, '<', $ref) or die "cannot open file $ref";
+open(HYP, '<', $hyp) or die "cannot open file $hyp";
+
+
+my $line_nb;
+my $word_nb;
+my $correct_pos_nb;
+my $correct_gov_nb;
+my $correct_gov_fct_nb;
+# each is_punctuation_* sub returns 1 when its argument is one of the tags it lists, 0 otherwise
+sub is_punctuation_ptb{
+    my $pos = shift(@_);
+
+    if($pos eq "``"){return 1;}
+    if($pos eq ","){return 1;}
+    if($pos eq ":"){return 1;}
+    if($pos eq "."){return 1;}
+    if($pos eq "''"){return 1;}
+    if($pos eq "-LRB-"){return 1;}
+    if($pos eq "-RRB-"){return 1;}
+    return 0;
+}
+sub is_punctuation_ftb{
+    my $pos = shift(@_);
+    # tags treated as punctuation: PCT PONCT ponctw poncts
+    if($pos eq "PCT"){return 1;}
+    if($pos eq "PONCT"){return 1;}
+    if($pos eq "ponctw"){return 1;}
+    if($pos eq "poncts"){return 1;}
+    return 0;
+}
+
+# read both files in lock step: line N of REF is compared with line N of HYP
+while(<REF>){
+    chomp;
+    $line_nb++;
+    ($ref_form, $ref_pos, $ref_morpho, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/;
+    defined($_ = <HYP>) or die "hypothesis file $hyp ends before reference file $ref (line $line_nb)\n";
+    chomp $_;
+#    print;
+    ($hyp_form, $hyp_pos, $hyp_morpho, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/; # third field goes to $hyp_morpho, the reference value is left alone
+#    print "ref = $hyp_seg\n";
+
+    if($ref_seg){ $nb_ref_seg++;}
+    if($hyp_seg){ $nb_hyp_seg++;}
+
+    if(($ref_seg) && ($hyp_seg)){ $nb_hyp_ref_seg++;}
+
+#    if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+    if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+        if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";}
+        $word_nb++;
+        $pos_nb{$ref_pos}++;
+        $fct_nb{$ref_fct}++;
+
+        if($ref_pos eq $hyp_pos){
+            $correct_pos_total_nb++;
+            $correct_pos_nb{$ref_pos}++;
+        }
+        else{
+            $false_pos_form{$ref_pos}{$ref_form}++;
+            $pos_confusion_matrix{$ref_pos}{$hyp_pos}++;
+
+
+#            print "$ref_form $ref_pos $hyp_pos\n";
+#            print "$ref_pos $hyp_pos\n";
+        }
+
+        if($ref_lemma eq $hyp_lemma){
+            $correct_lemma_total_nb++;
+        }
+        else{
+#            print "$ref_form \t $ref_lemma \t $hyp_lemma\n";
+        }
+        $ref_dist = $ref_gov - $ref_index; # $ref_index only ever receives "" in this script, so this is the numeric value of $ref_gov
+        $hyp_dist = $hyp_gov - $hyp_index; # $hyp_index is never assigned in this script
+#        if($ref_gov eq $hyp_gov){
+        if($ref_dist eq $hyp_dist){
+            $correct_gov_nb++;
+            $correct_gov_total_nb++;
+            if($ref_fct eq $hyp_fct){
+                $correct_gov_fct_total_nb++;
+                $correct_gov_fct_nb{$ref_fct}++;
+            }
+            else{
+                $labeling_confusion_matrix{$ref_fct}{$hyp_fct}++;
+            }
+        }
+        else{
+            $attachement_confusion_matrix{$ref_fct}{$hyp_fct}++;
+        }
+
+    }
+
+    $ref_index = "";
+}
+
+
+close REF;
+close HYP;
+
+# global scores in percent; every ratio is reported as 0 when its denominator is 0 (for instance with empty input)
+my $pos_acc = $word_nb ? $correct_pos_total_nb / $word_nb * 100 : 0;
+my $lemma_acc = $word_nb ? $correct_lemma_total_nb / $word_nb * 100 : 0;
+my $las = $word_nb ? $correct_gov_fct_total_nb / $word_nb * 100 : 0;
+my $uas = $word_nb ? $correct_gov_total_nb / $word_nb * 100 : 0;
+
+
+# segmentation: a line counts as a boundary when its seg field is true; precision is divided by the real hypothesis count (no "+ 1")
+my $seg_recall = $nb_ref_seg ? $nb_hyp_ref_seg / $nb_ref_seg : 0;
+my $seg_precision = $nb_hyp_seg ? $nb_hyp_ref_seg / $nb_hyp_seg : 0;
+
+# readable summary on STDERR, tab separated record (prefixed with the hypothesis file name) on STDOUT
+printf(STDERR "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+printf(STDOUT "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+
+
+
+if($TAGGING_ACCURACY_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "TAGGING ACCURACY PER CATEGORY\n";
+    printf "CAT\tFREQ\tACC\tIMPACT\n";
+    foreach $pos (sort keys %pos_nb){ # every reference category, including those that were never tagged correctly
+        $acc = $correct_pos_nb{$pos} / $pos_nb{$pos};
+        $freq = $pos_nb{$pos} / $word_nb;
+        if($word_nb == $correct_pos_total_nb){
+            $impact = 0;
+        }
+        else{
+            $impact = ($pos_nb{$pos} - $correct_pos_nb{$pos}) / ($word_nb - $correct_pos_total_nb);
+        }
+        printf("%s\t%6.2f\t%6.2f\t%6.2f\n", $pos, $freq*100, $acc*100, $impact*100);
+    }
+}
+
+if($TAGGING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING CONFUSION MATRIX\n";
+    foreach $ref_pos (keys %pos_confusion_matrix){
+        $pos_error_nb = $pos_nb{$ref_pos} - $correct_pos_nb{$ref_pos};
+        print "$ref_pos ($pos_error_nb) :";
+        foreach $hyp_pos (keys %{$pos_confusion_matrix{$ref_pos}}){
+            print "\t$hyp_pos ($pos_confusion_matrix{$ref_pos}{$hyp_pos})";
+        }
+        print "\n";
+    }
+}
+
+
+if($TAGGING_ERRORS_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING ERRORS PER CATEGORY\n";
+    foreach $pos (keys %false_pos_form){
+        print "\n$pos\n";
+        foreach $form (keys %{$false_pos_form{$pos}}){
+            print "\t$form $false_pos_form{$pos}{$form}\n";
+        }
+    }
+}
+
+if($PARSING_ACCURACY_PER_FUNCTION){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "LABELED ATTACHMENT SCORE PER LABEL\n";
+    printf "LABEL FREQ ACC IMPACT\n";
+    foreach $fct (sort keys %fct_nb){ # every reference label, including those that were never attached and labeled correctly
+        $acc = $correct_gov_fct_nb{$fct} / $fct_nb{$fct};
+        $freq = $fct_nb{$fct}/$word_nb;
+        $impact = ($fct_nb{$fct}
- $correct_gov_fct_nb{$fct}) / (($word_nb - $correct_gov_fct_total_nb) || 1); # no ++ here: the total must not change while the table is printed; "|| 1" avoids a division by zero (the numerator is then 0)
+        printf("%-10s%6.2f\t%6.2f\t%6.2f\n", $fct, $freq*100, $acc*100, $impact*100);
+    }
+}
+# -acm: for each reference label, the hypothesis labels recorded on words whose governor did not match
+if($ATTACHEMENT_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "ATTACHEMENT CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %attachement_confusion_matrix){
+        $attachement_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+        print "$ref_fct ($attachement_error_nb) :";
+        foreach $hyp_fct (keys %{$attachement_confusion_matrix{$ref_fct}}){
+            print "\t$hyp_fct ($attachement_confusion_matrix{$ref_fct}{$hyp_fct})";
+        }
+        print "\n";
+    }
+
+}
+# -lcm: for each reference label, the hypothesis labels recorded on words whose governor matched but whose label did not
+if($LABELING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "LABELING CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %labeling_confusion_matrix){
+        $fct_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+        print "$ref_fct ($fct_error_nb) :";
+        foreach $hyp_fct (keys %{$labeling_confusion_matrix{$ref_fct}}){
+            print "\t$hyp_fct ($labeling_confusion_matrix{$ref_fct}{$hyp_fct})";
+        }
+        print "\n";
+    }
+
+}
diff --git a/tools/fplm2fP_ud.pl b/tools/fplm2fP_ud.pl
new file mode 100755
index 0000000..0758ed5
--- /dev/null
+++ b/tools/fplm2fP_ud.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+# Build a form -> part of speech table from lines "form<TAB>pos<TAB>lemma<TAB>morpho" read from the files given as arguments or from standard input.
+# Only lines whose pos is one of the tags registered in %postag below are used.
+$postag{"ADJ"} = 1;
+$postag{"ADP"} = 1;
+$postag{"ADV"} = 1;
+$postag{"AUX"} = 1;
+$postag{"CCONJ"} = 1;
+$postag{"DET"} = 1;
+$postag{"INTJ"} = 1;
+$postag{"NOUN"} = 1;
+$postag{"NUM"} = 1;
+$postag{"PART"} = 1;
+$postag{"PRON"} = 1;
+$postag{"PROPN"} = 1;
+$postag{"PUNCT"} = 1;
+$postag{"SCONJ"} = 1;
+$postag{"SYM"} = 1;
+$postag{"VERB"} = 1;
+$postag{"X"} = 1;
+
+while(<>){
+    chomp; ($form, $pos, $lemma, $morpho) = split /\t/; # chomp first: otherwise the last field keeps its "\n" and a two column line never matches a tag
+    if($postag{$pos}){
+        $h_form2pos{$form}{$pos} = 1;
+        $h_pos{$pos} += 1;
+    }
+}
+# output: number of forms, number of tags seen, the tab separated tag list, then one line per form: the form, a tab, and one 0/1 flag per tag in the order of the tag list
+$nbelem = keys %h_form2pos;
+print "$nbelem\n";
+
+$nbelem = keys %h_pos;
+print "$nbelem\n";
+$first = 1;
+foreach $pos (keys %h_pos){
+    if($first){
+        $first = 0;
+    }
+    else{
+        print "\t";
+    }
+    print $pos;
+}
+print "\n";
+
+foreach $form (keys %h_form2pos){
+    print "$form\t";
+    foreach $pos (keys %h_pos){
+        if($h_form2pos{$form}{$pos}){
+            print "1";
+        }
+        else{
+            print "0";
+        }
+    }
+    print "\n";
+}
-- 
GitLab