From f21d8b995a314bb2d2abff263bc0d3b7eb71af87 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Tue, 11 Apr 2017 08:55:26 +0200
Subject: [PATCH] setting up architecture for universal dependencies

---
 UD/template/maca_trans_tagger/Makefile |   6 +
 UD/ud_template.tgz                     | Bin 1379 -> 1379 bytes
 fm/maca_trans_parser.fm                |  56 +++++
 fm/maca_trans_parser_fann.fm           |  25 ++
 fm/maca_trans_tagger.fm                |  37 +++
 fm/maca_trans_tagger_fann.fm           |  21 ++
 makefiles/maca_trans_tagger.makefile   |   9 +-
 tools/conllu2mcf.c                     | 158 ++++++++++++
 tools/eval_mcf.pl                      | 324 +++++++++++++++++++++++++
 tools/eval_wpmlgfs.pl                  | 234 ++++++++++++++++++
 tools/fplm2fP_ud.pl                    |  58 +++++
 11 files changed, 927 insertions(+), 1 deletion(-)
 create mode 100644 fm/maca_trans_parser.fm
 create mode 100644 fm/maca_trans_parser_fann.fm
 create mode 100644 fm/maca_trans_tagger.fm
 create mode 100644 fm/maca_trans_tagger_fann.fm
 create mode 100644 tools/conllu2mcf.c
 create mode 100755 tools/eval_mcf.pl
 create mode 100755 tools/eval_wpmlgfs.pl
 create mode 100755 tools/fplm2fP_ud.pl

diff --git a/UD/template/maca_trans_tagger/Makefile b/UD/template/maca_trans_tagger/Makefile
index 884cd69..b4a2855 100644
--- a/UD/template/maca_trans_tagger/Makefile
+++ b/UD/template/maca_trans_tagger/Makefile
@@ -3,11 +3,15 @@ MCF_DEV=../data/treebank/dev.mcf
 MCF_TEST=../data/treebank/test.mcf
 
 CFF_TRAIN=train.cff
+CFF_FANN_TRAIN=train.fann.cff
+FANN_TRAIN=train.fann
 CFF_CUTOFF_TRAIN=train.cutoff.cff
 PERCEPTRON_ITERATIONS=9
 CFF_CUTOFF=1
 FEATURES_MODEL_FILENAME=../../fm/maca_trans_tagger.fm
+FEATURES_MODEL_FANN_FILENAME=../../fm/maca_trans_tagger_fann.fm
 VOCABS_FILENAME=maca_trans_tagger.vocab 
+VOCABS_FANN_FILENAME=maca_trans_parser_fann.vocab 
 MCD_FILENAME=../../mcd/maca_trans_tagger.mcd
 MODEL_FILENAME=maca_trans_tagger.model 
 NUMBER_OF_SENTENCES=10000000
@@ -18,3 +22,5 @@ FORM_POS_FILENAME=../data/morpho-lexicon/fP
 #include ./maca_trans_tagger.makefile
 include ../../makefiles/maca_trans_tagger.makefile
 
+
+
diff --git a/UD/ud_template.tgz b/UD/ud_template.tgz
index 002ed8421208367c50605e9025a7dae8e2cb125d..f4d2ffaea32cb3e490c6e2457c01a76f60ce6928 100644
GIT binary patch
literal 1379
zcmb2|=3t1bc@x3F{5C2#f0c*Gx7Sa@C*0uw%6!WDrkdL6(41MKrd@aYTeQARVv;&~
zZ`uC3^tN!v#&=9_?^^rM%9v@KKksm5+Br-AZRRgbc%&RAvHUpc!8G|_&xJ&bs700Y
z5+gS{8}lf0$p&RqB?>HGoG;_CDcR!L?Akqg;n)6kCuRR|yUZ34_<zv}j~<Or0sm#r
zC;Z*cxTL<k{Ko$qfd%ceWB>kp;QZd+pZV`+ox1<G532r+u0DRX_wBZImJ6CLR>!{l
z`@G>Ft7nR}O6+x}u7<DkFYnLVGh^D@8Kq|fm(`#6>G5C4`?5^)m-81R{^u^Ro@N>;
zZ?Dt-_n*QKXSKpxLH`b#%;$Ok)Pr$xfsB|&?gqKnlbc`W)(OPsYTBP^Gv9uZVQ*vU
zf&lZw-+rw)-Kmh1|9H+eHOc;WZO2N_M9i(b^=+})qs{eG@3VW{oVZNdileUMKkJGA
zGCLR&{^c`2|F!=1XLgOx|6^YKSDltx;nwur-ce-Z|9ocN-~ZPwTeF4RDf+Xin51_9
zQ6X0qA>B_sb_Vq!Z9lUt;-CEM|Fd7i^Tj{?g#Y)sCj6US_}kFyuf1;TzxXAteE0ra
zd}#k$FRidExUy{jwD(gJAKm^Rx8m~elV0KO4=x@K;oN!fZVtQs_jR9hWPd-f7TB#P
z%B{%M5us|A@#EP=_OoJ)bL-cM{;In%qwr7X_4crCeV)<-re<yLW~|Hp@ojfyMhlmn
z+<(@0^N;@9&zth~K8xs-AN6g|HvbZja69mCKktF$@;43Vm5o>~z76`y(BqO<_D*is
zt6ywu%WJo>>l`e66%%jqMD}B^l6ke!jwfFNYGi|MO^*GY|7J#8_;WskjsMOI{`$YY
zZH4?p=a2Sc$Non!#&8<^-`+6gpY(3lX?OoiOgsO<zW<fI!GH7qlS!+NZQq`K^1*BI
zjYnz<zPz2z611S+V9(-x=6uK9G8U}*pYHm>WX1o44gd4mEB{{iXHNZY?>5EgmpDt~
z@qhbyS1j(7ODXzfXf@?TW1)pch{Ou##Ukv#Ixecr(mW#&DOP1RF@5u9w$mRbUAq`H
zJKcKTpSh`uo!zyat(#;txpOV&zvQ2_Y0B32pE;%?L63PYlNO!0Rkd<WNvU1fvJR&7
zO;W6ey5$q9jekv>{hIap`U|1%)l7fopDX|Tt@U!!{3oT`4d<PR_>j}2{kSdGNctrI
z+jrZRISXyx`YSf_)#uKAL20v$q%~J73h(Q75jo?Vt0Da@Hq+ZR?4rJb%F57_4<FR9
zTEzW6I^oj9d>i|EtN+{wf6kv)@Nd27ldt#f{ui&Aa?yT4J=^Dc=^v;5)X%(o+Mjj$
z&;R`<|BdAz{eN)wNBAH4lC_8aKUDeO_+LM`ypGG`(RF_Ai2<#Lo!+S!%+wF9HH@FT
z_Hj~3qNBjwA9o5edoOw}JoHfMHf#IZ)r%iq;%^c9f8MA@=D%pB;I;o7OxFF+XMbJz
z^pX9H)k?vv4I2tnLQO8oI+{0heP57rX<fznzS<rZAAU}Oz@G^l8ty%K7S=9%L0sd;
z>8nC)rA@7$RxUcY;lqYi7LEs8Vy7(WnbfGFV`3?}k9YIJ#M`o-HM@6)KHWb>PBNFN
z_M1nROI$%&;j_<C+Y45HSnS?*b;7A+qnIV#e;gc?IuFF<`vtqr40+%)IVaaIr+C9A
zr@}?Te@v$Q-n5lpWvhB()#eJ#z*Ac*G{cup+SIBvb9+ef!#tsd_6xlin5};wcEmS;
z|M<TDfBMQ?RniKl`$&ohxo&e)YVQBf9DJzmz=VWp-`?e{6+g?ST3j5|Y5L^W<!uE!
qKi(1QN>LJJmwf7^eS(*HyQJN}od0c&=-{LEV|j;9mm3)Z7#IND?7`Cj

literal 1379
zcmb2|=3r=>{yc(#`E68e{wgn#uV2rsSJ=q^mGSB9J8Ejv7vC`1z9R5VtxCwkc99Dj
z-^~7gEl*2fa6PEzaQ^rjlY>^(ZIa*9bkn}oeLEDlI;}upmDXbI^`~mSd0lwq>i1No
zQ=Db#(T_IkPw2X>t4`wh9h;Ibxh(bfx!v1$=Y(JT*PWE=UUrwUVCjF+*6>Q#i4*?2
zb}s*y&vfJe!K&RqH>O?Wn*RFL&j-Qh_aAZm@6A;EZ}vgU|GTB_H@vyJYE`uY`{U=o
zRy~hz`8Pe(HN!9_R9J)QYyG+ZZGT_wnD;)UwBP8z^4a76!dd%-E8o;lT>LMe>FvU_
ztZN?<ue~p;Znu{a`YkG*EBpK=UwpX^57&M11(&P37n<I(diaSwW47_z#5?<Usxp^U
zy_<IQZjSz&M|#x^dkS8@59gcQBX{KBR_jiy?ESAMnSHpr|I~cuH#v`9&WdBPU;5wZ
zQoUdv!-{|AjmiJR&;DVb@^Qb!=lxn7dycGr@z+00<kWxP1!@2CXaBlmlf0$qp3vll
z|3Y#CRg_XDxqou}`Z&uXf7}1bZ~hzp*Iw22{lC%E|85O`<N4pJYdo#@>-`!3P(<+U
zf0rNbKlgJsd|Pa`FFxaYmHpFi^~;Yv{+p2}$^7H$;l*;<&KqLB{=3ccuY33FL+3BN
z*%X@8!rHLH>(e4Z%iSDiyBQ|duM;h_&$3bcH|1;Kl}o(Nq8x9Q=Kk~(xH0kj!<_7i
zP3PzTclsr+@!z~L`2Tf9R@Fb}i!ZJGpV~NG^S?Rcms@iMK78!tVR&m6&HKRPOp$Dk
zKxnvm#@5fDog_H-#XtS?Nb&sv^J$TJGY+dQZ$5v0;u72HssE4dOKuNc*TD5Uzi-C>
z``MG~di>}9=X~+Uf5O9Q4X^5%PyE=wEjHoizkcy0!hfe1{AIuT|Ciw`JE@cJ-Ub={
z_~fR;D_`~Nn>UlS!L3^l&)vFlJ66}IK==RQ<MC4p|AjxYocO6;_{8u1RgxOF|9|YN
z`mkTgQ|a9Q69QBI&u%c3h-~-q``5##UUb1y#z@O%#o@&w@?RPccHGfCC$LlRU*3}5
z<L<ocWS6aNZnd<0yQBZwx^CTb-&lid%`Pl|mSeO{&Zl<SO#aCq_voh|d)=L|B>P#?
zvh%vhMpxuFvh<lY%Ove`)c>ARa#!Ix_uc<p>%`)O-~2a`%((wNbNV0Od)K^w&-CQ!
z&nrm0v*Ywhr%%r8uctmXPS|6x@3nEZ(WYm=Zd^IeTzl{N<8;9Zhwq$e^6RgC{><k(
z&mZnDl4Wt9iVAF)tKxo7R(!npL|x_2pug_F?L80tI)5_czqI4M|7Ty@t55!A9~cnA
z_j<p_=lb3UufOh(cz8bhXMCjYbNi)F+coqrKDyDt@w(FY%x&e;J!=jfdBG)m&ht%%
zk@1?BDRn0On{R*BWO(Z`;r!W&9v?(k)!u47^)0`Bmbblu*N^xIOI`kp`Z@l#Z(RKK
z|Jek?J-dGJKV3D$hfzUG=A=mP7WR&{1s~-$`P}~c;pq3WBL_G)GAcZ6Zs#niGT$!t
zs=;32u;03A9|F`PGUjVa^>FjAH)Ur~edRG}k_xkANXDk^%KWL$kGEN~)$HDBx~2XL
zpR-uwdf7HvgLDhalQnU##Qa5Lt$bp5SK2Q<EExZUy-DmP=bNX`-pH*v^XkCQ%xz}r
z+wSaI?&i5j_@7D6hh3kx@$60vsM=hirSde&W@>j>=1r%GXRDX(EHGVp@qUNUji<NU
zy;VehCSLE2v-_~hK}xLrrS4=WEum~4z8_EQE4;7pvk9HLbWir|*PI&<Qu5Ae^|r-S
un~9w>mveWT$XobPadC_U>(0arMvvF;pLJgY1N=+*!~f=w`h11}1_l7%e$n0l

diff --git a/fm/maca_trans_parser.fm b/fm/maca_trans_parser.fm
new file mode 100644
index 0000000..6e18e5e
--- /dev/null
+++ b/fm/maca_trans_parser.fm
@@ -0,0 +1,56 @@
+#b0m
+#s0m
+#b0m s0m
+#s0l s0m b0l b0m
+
+
+b0g
+s0g s0p
+s0g b0p
+s0g
+s0sf
+#s1g
+#s1sf
+s0l
+s0p
+s1p
+s2p
+b0l
+b0p
+b1l
+b1p
+b2p
+b3p
+ldep_s0r
+rdep_s0r
+ldep_s1r
+rdep_s1r
+ldep_b0r
+rdep_b0r
+s0l b0l
+
+s0p b0p
+b0p b0l
+b0p ldep_b0r
+s1p b1p
+b1p b2p
+s0p b0p b0l
+s0p ldep_s0r rdep_s0r
+s0p s0l b0p
+s0p b0p dist_s0_b0
+s1p s0p b0p
+b0p b1p b2p
+b1p b2p b3p
+s0p b0p b1p
+b1p b1l b2p b3p
+b1p b1l b2p b2l b3p
+t1
+#t2
+#t3
+#t4
+t1 t2
+#t2 t3
+t1 t2 t3
+
+bm1p
+bm2p
diff --git a/fm/maca_trans_parser_fann.fm b/fm/maca_trans_parser_fann.fm
new file mode 100644
index 0000000..3fdb2ce
--- /dev/null
+++ b/fm/maca_trans_parser_fann.fm
@@ -0,0 +1,25 @@
+b0l
+b0p
+b1l
+b1p
+b2l
+b2p
+b3p
+bm1p
+bm2p
+dist_s0_b0
+ldep_b0r
+ldep_s0r
+ldep_s1r
+rdep_b0r
+rdep_s0r
+rdep_s1r
+s0g
+s0l
+s0p
+s0sf
+s1p
+s2p
+t1
+t2
+t3
diff --git a/fm/maca_trans_tagger.fm b/fm/maca_trans_tagger.fm
new file mode 100644
index 0000000..50af67e
--- /dev/null
+++ b/fm/maca_trans_tagger.fm
@@ -0,0 +1,37 @@
+b0U1
+b0len
+
+b0sgn
+b1sgn
+
+b1f
+b0f
+bm1f
+bm2f
+
+bm1p
+bm2p
+bm3p
+bm2p bm1p
+bm2p bm3p
+bm1p b0sgn
+
+b0s1
+b0s2
+b0s3
+b0s4
+b0s5
+b0s1 b0s2
+b0s1 b0s2 b0s3
+b0s1 b0s2 b0s3 b0s4
+
+b0p1
+b0p2
+b0p3
+b0p4
+b0p5
+b0p1 b0p2
+b0p1 b0p2 b0p3
+b0p1 b0p2 b0p3 b0p4
+
+
diff --git a/fm/maca_trans_tagger_fann.fm b/fm/maca_trans_tagger_fann.fm
new file mode 100644
index 0000000..3f5e2c2
--- /dev/null
+++ b/fm/maca_trans_tagger_fann.fm
@@ -0,0 +1,21 @@
+b0f
+b0len
+b0p1
+b0p2
+b0p3
+b0p4
+b0p5
+b0s1
+b0s2
+b0s3
+b0s4
+b0s5
+b0sgn
+b0U1
+b1f
+b1sgn
+bm1f
+bm1p
+bm2f
+bm2p
+bm3p
diff --git a/makefiles/maca_trans_tagger.makefile b/makefiles/maca_trans_tagger.makefile
index f32b03a..ce3d13e 100644
--- a/makefiles/maca_trans_tagger.makefile
+++ b/makefiles/maca_trans_tagger.makefile
@@ -2,14 +2,21 @@
 ## compile
 ##-----------------------------------------------------------------------
 
-compile: $(MODEL_FILENAME)
+compile: $(MODEL_FILENAME) $(FANN_TRAIN)
 
 $(CFF_TRAIN): $(MCF_TRAIN)
 	maca_trans_tagger_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FILENAME) --vocabs $(VOCABS_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES)  $(STREAM_MODE) -P $(FORM_POS_FILENAME)
 
+$(CFF_FANN_TRAIN): $(MCF_TRAIN)
+	maca_trans_tagger_mcf2cff -C $(MCD_FILENAME) --input $< --mode TRAIN --feat_model $(FEATURES_MODEL_FANN_FILENAME) --vocabs $(VOCABS_FANN_FILENAME) --cff $@ -s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) -P $(FORM_POS_FILENAME)
+
+
 $(CFF_CUTOFF_TRAIN): $(CFF_TRAIN)
 	cff_cutoff --input $< --vocabs $(VOCABS_FILENAME) --cutoff $(CFF_CUTOFF) > $@
 
+$(FANN_TRAIN): $(CFF_FANN_TRAIN)
+	cff2fann --vocabs $(VOCABS_FANN_FILENAME) --cff $< --feat_model $(FEATURES_MODEL_FANN_FILENAME) -C $(MCD_FILENAME) > $@
+
 $(MODEL_FILENAME): $(CFF_CUTOFF_TRAIN)
 #$(MODEL_FILENAME): $(CFF_TRAIN)
 	perceptron_train --cff $< --model $(MODEL_FILENAME) -n $(PERCEPTRON_ITERATIONS)
diff --git a/tools/conllu2mcf.c b/tools/conllu2mcf.c
new file mode 100644
index 0000000..a2fc73b
--- /dev/null
+++ b/tools/conllu2mcf.c
@@ -0,0 +1,158 @@
+#include<stdio.h>
+#include<stdlib.h>
+#include<string.h>
+#include<strings.h>
+#include<math.h>
+#include<getopt.h>
+#include"conll_lib.h" 
+#include"hash_str.h" 
+
+#define NB_COL 7
+
+typedef struct options
+{
+  FILE * fd_parses;                    // parser output
+  int verbose_level;
+  int snum;
+  char *filename;
+  char columns[NB_COL];
+} options;
+
+/*---------------------------------------------------------------------------------*/
+
+options op;
+
+void print_options(options *op)  /* write the effective option values to stderr */
+{ /* op->filename is still NULL when -f was not given; handing NULL to %s is undefined behaviour */
+  fprintf(stderr, "file name = %s\n", op->filename ? op->filename : "(stdin)");
+  fprintf(stderr, "verbose level = %d\n", op->verbose_level);
+  fprintf(stderr, "maximum number of sentences to process = %d\n", op->snum);
+}
+
+void reset_options(options * op)  /* put *op back to its defaults: no file name, input on stdin, verbosity 0 */
+{
+  int i;
+  op->filename = NULL;
+  op->fd_parses = stdin;
+  op->verbose_level = 0;
+  op->snum = 100000000;  /* default upper bound on the number of sentences processed */
+  for(i=0; i < NB_COL; i++)
+    op->columns[i] = '0';  /* the character '0', not NUL; NOTE(review): how print_sentence_mcf3 treats '0' is not visible here */
+}
+
+/*---------------------------------------------------------------------------------*/
+void  print_help_message(char *program_name)
+{
+  fprintf(stderr, "%s usage: %s [options]\n", program_name, program_name);
+  fprintf(stderr, "OPTIONS :\n");
+  fprintf(stderr, "      -f <file>     : hypothesis conll file\n");
+  fprintf(stderr, "      -n <int>      : process n sentences (default is 100 000 000)\n");
+  fprintf(stderr, "      -v 1|2|3      : verbosity level\n");
+  fprintf(stderr, "      -h            : print this message\n");
+
+  fprintf(stderr, "      -1            : content of column 1 in the mcf file produced\n");
+  fprintf(stderr, "      -2            : content of column 2 in the mcf file produced\n");
+  fprintf(stderr, "      -3            : content of column 3 in the mcf file produced\n");
+  fprintf(stderr, "      -4            : content of column 4 in the mcf file produced\n");
+  fprintf(stderr, "      -5            : content of column 5 in the mcf file produced\n");
+  fprintf(stderr, "      -6            : content of column 6 in the mcf file produced\n");
+  fprintf(stderr, "      -7            : content of column 7 in the mcf file produced\n");
+  fprintf(stderr, "                    : values of options -1 to -7 must be one of\n");
+  fprintf(stderr, "                    : I for id\n");
+  fprintf(stderr, "                    : W for form\n");
+  fprintf(stderr, "                    : L for lemma\n");
+  fprintf(stderr, "                    : C for coarse part of speech\n");
+  fprintf(stderr, "                    : P for part of speech\n");
+  fprintf(stderr, "                    : F for features\n");
+  fprintf(stderr, "                    : H for head\n");
+  fprintf(stderr, "                    : D for deprel\n");
+
+}
+
+
+
+
+/*---------------------------------------------------------------------------------*/
+
+void parse_options(int argc, char *argv[], options * op)
+{
+  int c;  /* getopt() returns int; kept in a plain char, the -1 end marker never compares equal where char is unsigned */
+  /* Resets *op to its defaults, then applies every flag; exits on -h (status 0) or when the -f file cannot be opened (status 1). */
+  reset_options(op);
+  /*
+  if(argc ==1){
+    print_help_message(argv[0]);
+    exit(1);
+    }*/
+  
+  while ((c = getopt (argc, argv, "hIWLCPFHDf:n:v:1:2:3:4:5:6:7:8:9:")) != -1)
+    switch (c)
+      {
+      case 'h':
+	print_help_message(argv[0]);
+	exit(0);
+      case 'f':
+	op->filename = strdup(optarg);
+	if((op->fd_parses = fopen(op->filename, "r")) == NULL){
+	  fprintf(stderr, "cannot open hypothesis file %s : aborting\n", op->filename);
+	  exit(1);
+	}
+	break;
+      case '1':
+	op->columns[0] = optarg[0];
+	break;
+      case '2':
+	op->columns[1] = optarg[0];
+	break;
+      case '3':
+	op->columns[2] = optarg[0];
+	break;
+      case '4':
+	op->columns[3] = optarg[0];
+	break;
+      case '5':
+	op->columns[4] = optarg[0];
+	break;
+      case '6':
+	op->columns[5] = optarg[0];
+	break;
+      case '7':
+	op->columns[6] = optarg[0];
+	break;
+      case 'n':
+	op->snum = atoi(optarg);
+	break;
+      case 'v':
+	op->verbose_level = atoi(optarg);
+	break;
+      }
+  
+  /*  if (op->fd_parses == NULL){
+    fprintf(stderr, "error : cannot open parse file: aborting\n");
+    exit(1);
+    }*/
+}
+
+/*---------------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------------*/
+int main(int argc, char *argv[])  /* load sentences from op.fd_parses one by one and print each with print_sentence_mcf3 */
+{
+  sentence *s = allocate_sentence();
+  int snum = 0;
+  int res;
+  parse_options(argc, argv, &op);
+  
+  print_options(&op); 
+  
+  /* the loop ends when load_sentence returns 0 or once op.snum sentences have been printed */
+  for(res = load_sentence(op.fd_parses, s); res && (snum < op.snum); res = load_sentence(op.fd_parses, s)){
+    s->num = snum;
+    snum++;
+    compute_relative_index_of_heads(s);
+    print_sentence_mcf3(s, op.columns, NB_COL);
+  }
+  if(op.filename)  /* a file name is only set by -f; otherwise the input is stdin and is left open */
+    fclose(op.fd_parses);
+  free_sentence(s);
+  return 0;
+}
diff --git a/tools/eval_mcf.pl b/tools/eval_mcf.pl
new file mode 100755
index 0000000..f8fb57b
--- /dev/null
+++ b/tools/eval_mcf.pl
@@ -0,0 +1,324 @@
+#!/usr/bin/perl
+
+$arg = shift;    # command-line scan: each pass handles one flag, and its value if it takes one, taken from @ARGV
+while($arg){
+    if($arg eq "-g"){$ref = shift;}
+    elsif($arg eq "-s"){$hyp = shift;}
+    elsif($arg eq "-G"){$ref_mcd = shift;}
+    elsif($arg eq "-S"){$hyp_mcd = shift;}
+    elsif($arg eq "-tac"){$TAGGING_ACCURACY_PER_CATEGORY = 1;}
+    elsif($arg eq "-tcm"){$TAGGING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-tec"){$TAGGING_ERRORS_PER_CATEGORY = 1;}
+    elsif($arg eq "-paf"){$PARSING_ACCURACY_PER_FUNCTION = 1;}
+    elsif($arg eq "-lcm"){$LABELING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-acm"){$ATTACHEMENT_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-all"){
+	$TAGGING_ACCURACY_PER_CATEGORY = 1;
+	$TAGGING_CONFUSION_MATRIX = 1;
+	$TAGGING_ERRORS_PER_CATEGORY = 1;
+	$PARSING_ACCURACY_PER_FUNCTION = 1;
+	$LABELING_CONFUSION_MATRIX = 1;
+	$ATTACHEMENT_CONFUSION_MATRIX = 1;
+    }
+    elsif($arg eq "-h"){
+	print "usage eval_mcf.pl OPTIONS -g <reference file> -s <system output>\n";
+	print "OPTIONS :\n\t-G <letters> column layout of the reference file, one letter per column (W form, P pos, L lemma, G governor, F function, S segmentation, M morphology)\n\t-S <letters> column layout of the system output, same letters\n";
+	print "\t-tac tagging accuracy per category\n";
+	print "\t-tcm tagging confusion matrix\n";
+	print "\t-tec tagging errors per category\n";
+	print "\t-paf parsing accuracy per function\n";
+	print "\t-lcm labeling confusion matrix\n";
+	print "\t-acm attachment confusion matrix\n";
+	print "\t-all all options\n";
+	exit;
+}
+$arg = shift;
+}
+
+# determine the column in the reference file
+
+$ref_form_col = 0;
+$ref_pos_col = 1;
+$ref_lemma_col = 2;
+$ref_gov_col = 3;
+$ref_fct_col = 4;
+$ref_seg_col = 5;
+$ref_morph_col = 10;
+
+if($ref_mcd)    # -G given: the position of each letter in the string becomes the column index of the matching reference field
+{    
+    for($i=0; $i<length $ref_mcd; $i++){
+	$car = substr($ref_mcd, $i, 1);    # letters other than W P L G F S M are skipped; a field whose letter is absent keeps its default index
+#	print "car $i = $car\n";
+	if($car eq 'W'){$ref_form_col = $i; next;}
+	if($car eq 'P'){$ref_pos_col = $i; next;}
+	if($car eq 'L'){$ref_lemma_col = $i; next;}
+	if($car eq 'G'){$ref_gov_col = $i; next;}
+	if($car eq 'F'){$ref_fct_col = $i; next;}
+	if($car eq 'S'){$ref_seg_col = $i; next;}
+	if($car eq 'M'){$ref_morph_col = $i; next;}
+    }
+}
+
+# determine the column in the hypothesis file
+
+$hyp_form_col = 0;
+$hyp_pos_col = 1;
+$hyp_lemma_col = 2;
+$hyp_gov_col = 3;
+$hyp_fct_col = 4;
+$hyp_seg_col = 5;
+$hyp_morph_col = 10;
+
+if($hyp_mcd)    # -S given: the position of each letter in the string becomes the column index of the matching hypothesis field
+{    
+    for($i=0; $i<length $hyp_mcd; $i++){
+	$car = substr($hyp_mcd, $i, 1);    # letters other than W P L G F S M are skipped; a field whose letter is absent keeps its default index
+#	print "car $i = $car\n";
+	if($car eq 'W'){$hyp_form_col = $i; next;}
+	if($car eq 'P'){$hyp_pos_col = $i; next;}
+	if($car eq 'L'){$hyp_lemma_col = $i; next;}
+	if($car eq 'G'){$hyp_gov_col = $i; next;}
+	if($car eq 'F'){$hyp_fct_col = $i; next;}
+	if($car eq 'S'){$hyp_seg_col = $i; next;}
+	if($car eq 'M'){$hyp_morph_col = $i; next;}
+    }
+}
+
+
+open REF, $ref or die "cannot open file $ref";
+open HYP, $hyp or die "cannot open file $hyp";
+
+
+
+
+my $line_nb;
+my $word_nb;
+my $correct_pos_nb;
+my $correct_gov_nb;
+my $correct_gov_fct_nb;
+
+sub is_punctuation_ptb{    # takes one tag; returns 1 if it is one of the seven punctuation tags tested below, 0 otherwise
+    my $pos = shift(@_);
+
+    if($pos eq "``"){return 1;}
+    if($pos eq ","){return 1;}
+    if($pos eq ":"){return 1;}
+    if($pos eq "."){return 1;}
+    if($pos eq "''"){return 1;}
+    if($pos eq "-LRB-"){return 1;}
+    if($pos eq "-RRB-"){return 1;}
+    return 0;
+}
+sub is_punctuation_ftb{    # takes one tag; returns 1 for PCT, PONCT, ponctw or poncts (case-sensitive), 0 otherwise
+    my $pos = shift(@_);
+
+    if($pos eq "PCT"){return 1;}
+    if($pos eq "PONCT"){return 1;}
+    if($pos eq "ponctw"){return 1;}
+    if($pos eq "poncts"){return 1;}
+    return 0;
+}
+sub is_punctuation_ud{    # takes one tag; returns 1 for the tag PUNCT, 0 otherwise
+    my $pos = shift(@_);    # NOTE(review): the only call to this sub in the scoring loop is commented out, so PUNCT tokens are currently scored
+
+    if($pos eq "PUNCT"){return 1;}
+    return 0;
+}
+
+
+while(<REF>){    # one pass per reference line, scored against the hypothesis line at the same position
+    chomp;    # chomp rather than chop: a last line without "\n" keeps its final character
+    $line_nb++;
+    # $_ now holds one tab-separated line of the reference file
+    @ref_array = split /\t/;
+    # Fetch each field directly by its configured column index: a column missing from
+    # the line yields undef instead of the value left over from the previous line, and
+    # every field is filled even when two of them are configured on the same column.
+    $ref_form = $ref_array[$ref_form_col];
+    $ref_pos = $ref_array[$ref_pos_col];
+    $ref_lemma = $ref_array[$ref_lemma_col];
+    $ref_gov = $ref_array[$ref_gov_col];
+    $ref_fct = $ref_array[$ref_fct_col];
+    $ref_seg = $ref_array[$ref_seg_col];
+    $ref_morph = $ref_array[$ref_morph_col];
+
+    # A hypothesis file with fewer lines than the reference cannot be aligned line by line,
+    # so the read aborts with an explicit message instead of scoring leftover values.
+    defined($_ = <HYP>) or die "hypothesis file $hyp has fewer lines than reference file $ref (line $line_nb)\n";
+    chomp $_;
+
+    @hyp_array = split /\t/;
+    # Hypothesis fields are fetched the same way, with the column indices
+    # derived from -S (or the defaults when -S is absent); a column that is
+    # missing from the line gives undef.
+    $hyp_form = $hyp_array[$hyp_form_col];
+    $hyp_pos = $hyp_array[$hyp_pos_col];
+    $hyp_lemma = $hyp_array[$hyp_lemma_col];
+    $hyp_gov = $hyp_array[$hyp_gov_col];
+    $hyp_fct = $hyp_array[$hyp_fct_col];
+    $hyp_seg = $hyp_array[$hyp_seg_col];
+    $hyp_morph = $hyp_array[$hyp_morph_col];
+
+
+
+
+#    print "ref = $hyp_seg\n";
+
+    if($ref_seg){ $nb_ref_seg++;}
+    if($hyp_seg){ $nb_hyp_seg++;}
+    
+    if(($ref_seg) && ($hyp_seg)){ $nb_hyp_ref_seg++;}
+    
+#    if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+    if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+#    if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))  && (!is_punctuation_ud($ref_pos))){
+	if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";}
+	$word_nb++;
+	$pos_nb{$ref_pos}++;
+	$fct_nb{$ref_fct}++;
+
+	if($ref_pos eq $hyp_pos){
+	    $correct_pos_total_nb++; 
+	    $correct_pos_nb{$ref_pos}++;
+	}
+	else{
+	    $false_pos_form{$ref_pos}{$ref_form}++;
+	    $pos_confusion_matrix{$ref_pos}{$hyp_pos}++;
+
+
+#	    print "$ref_form $ref_pos $hyp_pos\n";
+#	    print "$ref_pos $hyp_pos\n";
+	}
+	
+	if($ref_lemma eq $hyp_lemma){
+	    $correct_lemma_total_nb++; 
+	}
+	else{
+#	    print "$ref_form \t $ref_lemma \t $hyp_lemma\n";
+	}
+	$ref_dist = $ref_gov - $ref_index;
+	$hyp_dist = $hyp_gov - $hyp_index;
+#	if($ref_gov eq $hyp_gov){
+	if($ref_dist eq $hyp_dist){
+	    $correct_gov_nb++;
+	    $correct_gov_total_nb++;
+	    if($ref_fct eq $hyp_fct){
+		$correct_gov_fct_total_nb++; 
+		$correct_gov_fct_nb{$ref_fct}++;
+	    }
+	    else{
+		$labeling_confusion_matrix{$ref_fct}{$hyp_fct}++;
+	    }
+	}
+	else{
+	    $attachement_confusion_matrix{$ref_fct}{$hyp_fct}++;
+	}
+	
+    }
+    
+    $ref_index = "";
+}
+
+
+close REF;
+close HYP;
+
+
+# Global scores, in percent. With no scored word at all they are reported as 0 instead of dying on a division by zero.
+my $pos_acc = $word_nb ? $correct_pos_total_nb / $word_nb * 100 : 0;
+my $lemma_acc = $word_nb ? $correct_lemma_total_nb / $word_nb * 100 : 0;
+my $las = $word_nb ? $correct_gov_fct_total_nb / $word_nb * 100 : 0;
+my $uas = $word_nb ? $correct_gov_total_nb / $word_nb * 100 : 0;
+
+# Segmentation scores are plain ratios. Recall is 0 when the reference never has a true value in its segmentation column.
+# NOTE(review): the "+ 1" in the precision denominator is kept as found; its purpose is not visible here -- confirm.
+my $seg_recall = $nb_ref_seg ? $nb_hyp_ref_seg / $nb_ref_seg : 0;
+my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1);
+
+printf(stderr "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+
+
+
+if($TAGGING_ACCURACY_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "TAGGING ACCURACY PER CATEGORY\n";
+    printf "CAT\tFREQ\tACC\tIMPACT\n";
+    foreach $pos (keys %correct_pos_nb){
+	$acc = $correct_pos_nb{$pos} / $pos_nb{$pos}; 
+	$freq = $pos_nb{$pos} / $word_nb;
+	if($word_nb == $correct_pos_total_nb){
+	    $impact = 0;
+	}
+	else{
+	$impact =  ($pos_nb{$pos} - $correct_pos_nb{$pos}) / ($word_nb - $correct_pos_total_nb);
+	}
+	printf("%s\t%6.2f\t%6.2f\t%6.2f\n", $pos, $freq*100, $acc*100, $impact*100);
+    } 
+}
+
+if($TAGGING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING CONFUSION MATRIX\n";
+    foreach $ref_pos (keys %pos_confusion_matrix){
+	$pos_error_nb = $pos_nb{$ref_pos} - $correct_pos_nb{$ref_pos};
+	print "$ref_pos ($pos_error_nb) :";
+	    foreach $hyp_pos (keys %{$pos_confusion_matrix{$ref_pos}}){
+	    print "\t$hyp_pos ($pos_confusion_matrix{$ref_pos}{$hyp_pos})";
+	}
+	print "\n";
+    }
+}
+
+
+if($TAGGING_ERRORS_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING ERRORS PER CATEGORY\n";
+    foreach $pos (keys %false_pos_form){
+	print "\n$pos\n";
+	foreach $form (keys %{$false_pos_form{$pos}}){
+	    print "\t$form $false_pos_form{$pos}{$form}\n";
+	}
+    }
+}
+
+if($PARSING_ACCURACY_PER_FUNCTION){
+    # Per-label report. IMPACT is the label's share of all labeled-attachment errors (0 when there is no error); the error total is read-only here.
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "LABELED ATTACHMENT SCORE PER LABEL\nLABEL       FREQ           ACC   IMPACT\n";
+    foreach $fct (keys %correct_gov_fct_nb){
+	$acc = $correct_gov_fct_nb{$fct} / $fct_nb{$fct}; 
+	$freq = $fct_nb{$fct}/$word_nb;
+	$impact = ($word_nb == $correct_gov_fct_total_nb) ? 0 : ($fct_nb{$fct} - $correct_gov_fct_nb{$fct}) / ($word_nb - $correct_gov_fct_total_nb);
+	printf("%-10s%6.2f\t%6.2f\t%6.2f\n", $fct, $freq*100, $acc*100, $impact*100);
+    }
+}
+
+if($ATTACHEMENT_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "ATTACHEMENT CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %attachement_confusion_matrix){
+	$attachement_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+	print "$ref_fct ($attachement_error_nb) :";
+	foreach $hyp_fct (keys %{$attachement_confusion_matrix{$ref_fct}}){
+	    print "\t$hyp_fct ($attachement_confusion_matrix{$ref_fct}{$hyp_fct})";
+	}
+	print "\n";
+    }
+
+}
+
+if($LABELING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "LABELING CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %labeling_confusion_matrix){
+	$fct_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+	print "$ref_fct ($fct_error_nb) :";
+	foreach $hyp_fct (keys %{$labeling_confusion_matrix{$ref_fct}}){
+	    print "\t$hyp_fct ($labeling_confusion_matrix{$ref_fct}{$hyp_fct})";
+	}
+	print "\n";
+    }
+
+}
diff --git a/tools/eval_wpmlgfs.pl b/tools/eval_wpmlgfs.pl
new file mode 100755
index 0000000..a6d1097
--- /dev/null
+++ b/tools/eval_wpmlgfs.pl
@@ -0,0 +1,234 @@
+#!/usr/bin/perl
+
+$arg = shift;    # command-line scan: each pass handles one flag, and its value if it takes one, taken from @ARGV
+while($arg){
+    if($arg eq "-g"){$ref = shift;}
+    elsif($arg eq "-s"){$hyp = shift;}
+    elsif($arg eq "-tac"){$TAGGING_ACCURACY_PER_CATEGORY = 1;}
+    elsif($arg eq "-tcm"){$TAGGING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-tec"){$TAGGING_ERRORS_PER_CATEGORY = 1;}
+    elsif($arg eq "-paf"){$PARSING_ACCURACY_PER_FUNCTION = 1;}
+    elsif($arg eq "-lcm"){$LABELING_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-acm"){$ATTACHEMENT_CONFUSION_MATRIX = 1;}
+    elsif($arg eq "-all"){
+	$TAGGING_ACCURACY_PER_CATEGORY = 1;
+	$TAGGING_CONFUSION_MATRIX = 1;
+	$TAGGING_ERRORS_PER_CATEGORY = 1;
+	$PARSING_ACCURACY_PER_FUNCTION = 1;
+	$LABELING_CONFUSION_MATRIX = 1;
+	$ATTACHEMENT_CONFUSION_MATRIX = 1;
+    }
+    elsif($arg eq "-h"){
+	print "usage eval_wpmlgfs.pl OPTIONS -g <gold file> -s <system output>\n";
+	print "OPTIONS :\n";
+	print "\t-tac tagging accuracy per category\n";
+	print "\t-tcm tagging confusion matrix\n";
+	print "\t-tec tagging errors per category\n";
+	print "\t-paf parsing accuracy per function\n";
+	print "\t-lcm labeling confusion matrix\n";
+	print "\t-acm attachment confusion matrix\n";
+	print "\t-all all options\n";
+	exit;
+}
+$arg = shift;
+}
+
+open REF, $ref or die "cannot open file $ref";
+open HYP, $hyp or die "cannot open file $hyp";
+
+
+my $line_nb;
+my $word_nb;
+my $correct_pos_nb;
+my $correct_gov_nb;
+my $correct_gov_fct_nb;
+
+sub is_punctuation_ptb{    # takes one tag; returns 1 if it is one of the seven punctuation tags tested below, 0 otherwise
+    my $pos = shift(@_);
+
+    if($pos eq "``"){return 1;}
+    if($pos eq ","){return 1;}
+    if($pos eq ":"){return 1;}
+    if($pos eq "."){return 1;}
+    if($pos eq "''"){return 1;}
+    if($pos eq "-LRB-"){return 1;}
+    if($pos eq "-RRB-"){return 1;}
+    return 0;
+}
+sub is_punctuation_ftb{    # takes one tag; returns 1 for PCT, PONCT, ponctw or poncts (case-sensitive), 0 otherwise
+    my $pos = shift(@_);
+
+    if($pos eq "PCT"){return 1;}
+    if($pos eq "PONCT"){return 1;}
+    if($pos eq "ponctw"){return 1;}
+    if($pos eq "poncts"){return 1;}
+    return 0;
+}
+
+
+while(<REF>){    # one pass per reference line, scored against the hypothesis line at the same position
+    chomp;    # chomp rather than chop: a last line without "\n" keeps its final character
+    $line_nb++;
+    ($ref_form, $ref_pos, $ref_morpho, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/;
+    defined($_ = <HYP>) or die "hypothesis file $hyp has fewer lines than reference file $ref (line $line_nb)\n";
+    chomp $_;
+    # the third hypothesis column goes to $hyp_morpho, so it no longer overwrites $ref_morpho
+    ($hyp_form, $hyp_pos, $hyp_morpho, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/;
+#    print "ref = $hyp_seg\n";
+
+    if($ref_seg){ $nb_ref_seg++;}
+    if($hyp_seg){ $nb_hyp_seg++;}
+    
+    if(($ref_seg) && ($hyp_seg)){ $nb_hyp_ref_seg++;}
+    
+#    if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+    if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
+	if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";}
+	$word_nb++;
+	$pos_nb{$ref_pos}++;
+	$fct_nb{$ref_fct}++;
+
+	if($ref_pos eq $hyp_pos){
+	    $correct_pos_total_nb++; 
+	    $correct_pos_nb{$ref_pos}++;
+	}
+	else{
+	    $false_pos_form{$ref_pos}{$ref_form}++;
+	    $pos_confusion_matrix{$ref_pos}{$hyp_pos}++;
+
+
+#	    print "$ref_form $ref_pos $hyp_pos\n";
+#	    print "$ref_pos $hyp_pos\n";
+	}
+	
+	if($ref_lemma eq $hyp_lemma){
+	    $correct_lemma_total_nb++; 
+	}
+	else{
+#	    print "$ref_form \t $ref_lemma \t $hyp_lemma\n";
+	}
+	$ref_dist = $ref_gov - $ref_index;
+	$hyp_dist = $hyp_gov - $hyp_index;
+#	if($ref_gov eq $hyp_gov){
+	if($ref_dist eq $hyp_dist){
+	    $correct_gov_nb++;
+	    $correct_gov_total_nb++;
+	    if($ref_fct eq $hyp_fct){
+		$correct_gov_fct_total_nb++; 
+		$correct_gov_fct_nb{$ref_fct}++;
+	    }
+	    else{
+		$labeling_confusion_matrix{$ref_fct}{$hyp_fct}++;
+	    }
+	}
+	else{
+	    $attachement_confusion_matrix{$ref_fct}{$hyp_fct}++;
+	}
+	
+    }
+    
+    $ref_index = "";
+}
+
+
+close REF;
+close HYP;
+
+
+# Global scores, in percent. With no scored word at all they are reported as 0 instead of dying on a division by zero.
+my $pos_acc = $word_nb ? $correct_pos_total_nb / $word_nb * 100 : 0;
+my $lemma_acc = $word_nb ? $correct_lemma_total_nb / $word_nb * 100 : 0;
+my $las = $word_nb ? $correct_gov_fct_total_nb / $word_nb * 100 : 0;
+my $uas = $word_nb ? $correct_gov_total_nb / $word_nb * 100 : 0;
+
+# Segmentation scores are plain ratios. Recall is 0 when the reference never has a true value in its segmentation column.
+# NOTE(review): the "+ 1" in the precision denominator is kept as found; its purpose is not visible here -- confirm.
+my $seg_recall = $nb_ref_seg ? $nb_hyp_ref_seg / $nb_ref_seg : 0;
+my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1);
+
+printf(stderr "pos acc = %.2f lemma acc = %.2f uas = %.2f las = %.2f seg recall = %.2f seg precision = %.2f size = %d\n", $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+printf(stdout "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n", $hyp, $pos_acc, $lemma_acc, $uas, $las, $seg_recall, $seg_precision, $word_nb);
+
+
+
+if($TAGGING_ACCURACY_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "TAGGING ACCURACY PER CATEGORY\n";
+    printf "CAT\tFREQ\tACC\tIMPACT\n";
+    foreach $pos (keys %correct_pos_nb){
+	$acc = $correct_pos_nb{$pos} / $pos_nb{$pos}; 
+	$freq = $pos_nb{$pos} / $word_nb;
+	if($word_nb == $correct_pos_total_nb){
+	    $impact = 0;
+	}
+	else{
+	$impact =  ($pos_nb{$pos} - $correct_pos_nb{$pos}) / ($word_nb - $correct_pos_total_nb);
+	}
+	printf("%s\t%6.2f\t%6.2f\t%6.2f\n", $pos, $freq*100, $acc*100, $impact*100);
+    } 
+}
+
+if($TAGGING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING CONFUSION MATRIX\n";
+    foreach $ref_pos (keys %pos_confusion_matrix){
+	$pos_error_nb = $pos_nb{$ref_pos} - $correct_pos_nb{$ref_pos};
+	print "$ref_pos ($pos_error_nb) :";
+	    foreach $hyp_pos (keys %{$pos_confusion_matrix{$ref_pos}}){
+	    print "\t$hyp_pos ($pos_confusion_matrix{$ref_pos}{$hyp_pos})";
+	}
+	print "\n";
+    }
+}
+
+
+if($TAGGING_ERRORS_PER_CATEGORY){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "TAGGING ERRORS PER CATEGORY\n";
+    foreach $pos (keys %false_pos_form){
+	print "\n$pos\n";
+	foreach $form (keys %{$false_pos_form{$pos}}){
+	    print "\t$form $false_pos_form{$pos}{$form}\n";
+	}
+    }
+}
+
+if($PARSING_ACCURACY_PER_FUNCTION){
+    # Per-label report. IMPACT is the label's share of all labeled-attachment errors (0 when there is no error); the error total is read-only here.
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    print "LABELED ATTACHMENT SCORE PER LABEL\nLABEL       FREQ           ACC   IMPACT\n";
+    foreach $fct (keys %correct_gov_fct_nb){
+	$acc = $correct_gov_fct_nb{$fct} / $fct_nb{$fct}; 
+	$freq = $fct_nb{$fct}/$word_nb;
+	$impact = ($word_nb == $correct_gov_fct_total_nb) ? 0 : ($fct_nb{$fct} - $correct_gov_fct_nb{$fct}) / ($word_nb - $correct_gov_fct_total_nb);
+	printf("%-10s%6.2f\t%6.2f\t%6.2f\n", $fct, $freq*100, $acc*100, $impact*100);
+    }
+}
+
+if($ATTACHEMENT_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "ATTACHEMENT CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %attachement_confusion_matrix){
+	$attachement_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+	print "$ref_fct ($attachement_error_nb) :";
+	foreach $hyp_fct (keys %{$attachement_confusion_matrix{$ref_fct}}){
+	    print "\t$hyp_fct ($attachement_confusion_matrix{$ref_fct}{$hyp_fct})";
+	}
+	print "\n";
+    }
+
+}
+
+if($LABELING_CONFUSION_MATRIX){
+    print "\n\n--------------------------------------------------------------------------------------\n";
+    printf "LABELING CONFUSION MATRIX\n";
+    foreach $ref_fct (keys %labeling_confusion_matrix){
+	$fct_error_nb = $fct_nb{$ref_fct} - $correct_gov_fct_nb{$ref_fct};
+	print "$ref_fct ($fct_error_nb) :";
+	foreach $hyp_fct (keys %{$labeling_confusion_matrix{$ref_fct}}){
+	    print "\t$hyp_fct ($labeling_confusion_matrix{$ref_fct}{$hyp_fct})";
+	}
+	print "\n";
+    }
+
+}
diff --git a/tools/fplm2fP_ud.pl b/tools/fplm2fP_ud.pl
new file mode 100755
index 0000000..0758ed5
--- /dev/null
+++ b/tools/fplm2fP_ud.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+
+
+$postag{"ADJ"} = 1;
+$postag{"ADP"} = 1;
+$postag{"ADV"} = 1;
+$postag{"AUX"} = 1;
+$postag{"CCONJ"} = 1;
+$postag{"DET"} = 1;
+$postag{"INTJ"} = 1;
+$postag{"NOUN"} = 1;
+$postag{"NUM"} = 1;
+$postag{"PART"} = 1;
+$postag{"PRON"} = 1;
+$postag{"PROPN"} = 1;
+$postag{"PUNCT"} = 1;
+$postag{"SCONJ"} = 1;
+$postag{"SYM"} = 1;
+$postag{"VERB"} = 1;
+$postag{"X"} = 1;
+
+while(<>){    # reads tab-separated lines from the files named on the command line, or from STDIN when there are none
+    ($form, $pos, $lemma, $morpho) = split /\t/;    # no chomp: the last field of each line keeps its "\n"
+    if($postag{$pos}){    # lines whose second field is not one of the tags registered in %postag are ignored
+	$h_form2pos{$form}{$pos} = 1;    # this form has been seen with this tag
+	$h_pos{$pos} += 1;    # number of accepted lines carrying this tag
+    }
+}
+
+$nbelem = keys %h_form2pos;
+print "$nbelem\n";
+
+$nbelem = keys %h_pos;
+print "$nbelem\n";
+$first = 1;
+foreach $pos (keys %h_pos){
+    if($first){
+	$first = 0;
+    }
+    else{
+	print "\t";
+    }
+    print $pos;
+}
+print "\n";
+
+foreach $form (keys %h_form2pos){    # one output line per form: the form, a tab, then one digit per tag
+    print "$form\t";
+    foreach $pos (keys %h_pos){    # iterates over %h_pos, as the loop printing the tag header line does
+	if($h_form2pos{$form}{$pos}){
+	    print "1";    # the form was seen with this tag
+	}
+	else{
+	    print "0";    # digits are written back to back, with no separator between them
+	}
+    }
+    print "\n";
+}
-- 
GitLab