From 9f27e7bb9d28d3238e25b46ddad5ab36073a1897 Mon Sep 17 00:00:00 2001
From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr>
Date: Thu, 26 Jan 2017 09:37:51 +0100
Subject: [PATCH] Split fields on tabs in eval_wplgfs.pl and conll_lib; build MCF files for orfeo

---
 orfeo/data/treebank/Makefile     | 50 ++++++--------------------------
 orfeo/maca_trans_parser/Makefile | 15 +++++-----
 orfeo/maca_trans_tagger/Makefile | 13 ++++-----
 tools/conll_lib.c                |  7 +++--
 tools/eval_wplgfs.pl             |  4 +--
 5 files changed, 29 insertions(+), 60 deletions(-)

diff --git a/orfeo/data/treebank/Makefile b/orfeo/data/treebank/Makefile
index 802e4dc..2bd0183 100644
--- a/orfeo/data/treebank/Makefile
+++ b/orfeo/data/treebank/Makefile
@@ -1,46 +1,22 @@
-DECODA_DIR=../../../data/decoda
-TOOLS_GEN=../../../tools
-#TOOLS_ORFEO=../../tools
-TOOLS_ORFEO=$(TOOLS_GEN)
+# Convert the Orfeo CoNLL-07 train/test files into MCF files with conll2mcf.
+TOOLS=../../../tools
+TRAIN=orfeo.train.conll07
+TEST=orfeo.test.conll07
 
-CORPUS=$(DECODA_DIR)/corpus_decoda_lot1_2_silver.tsv
-TEST=$(DECODA_DIR)/corpus_decoda_gold1_checked_tbaz11decembre2013.tsv
+# compile and clean name commands, not files on disk.
+.PHONY: compile clean
+# The recipes redirect stdout straight into the target: delete the partial
+# file when the tool fails, otherwise a truncated .mcf looks up to date.
+.DELETE_ON_ERROR:
+
+# Default goal: build the MCF files (dev.mcf is currently disabled).
+compile: train.mcf  test.mcf #dev.mcf
 
-TRAIN=decoda.train.conll07
-TRAIN_NODISF=decoda.train.nodisf.conll07
-TRAIN_NOTRONC=decoda.train.notronc.conll07
+train.mcf: $(TRAIN)
+	$(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
 
-compile: train.conll07 test.conll07
-
-#elimine les mots tronques
-train.conll07: decoda.train.conll07 
-	$(TOOLS_ORFEO)/decoda2orfeo -t -f $< > $@ 
-
-#elimine les mots tronques et les disfluences
-#train.conll07: decoda.train.conll07 
-#	$(TOOLS_ORFEO)/decoda2orfeo -td -f $< > $@ 
-
-#train.conll07: decoda.train.conll07
-#	$(TOOLS_ORFEO)/decoda2orfeo -f $< > $@ 
-
-decoda.train.conll07: $(CORPUS) ./split_decoda.pl
-	./split_decoda.pl < $<
-	cat decoda.train.tsv |$(TOOLS_ORFEO)/process_decoda_tsv -lemma list_mot_pos_lemme_lefff.txt -fmtout conll07 -addlinkdisf  > $@
-
-test.conll07: decoda.test.conll07
-	$(TOOLS_ORFEO)/decoda2orfeo -t -f $< > $@ 
-
-#test.nodisf.conll07: decoda.test.conll07
-#	$(TOOLS_ORFEO)/decoda2orfeo -td -f $< > $@ 
-
-#test.conll07: decoda.test.conll07
-#	$(TOOLS_ORFEO)/decoda2orfeo -f $< > $@ 
-
-decoda.test.conll07: $(TEST)
-	cat $< |$(TOOLS_ORFEO)/process_decoda_tsv -lemma list_mot_pos_lemme_lefff.txt -fmtout conll07 -addlinkdisf > $@
+test.mcf: $(TEST)
+	$(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
 
 clean: 
-	- rm decoda.train.conll07
-	- rm decoda.test.conll07
-	- rm test.conll07
-	- rm train.conll07
+	rm -f test.mcf train.mcf
diff --git a/orfeo/maca_trans_parser/Makefile b/orfeo/maca_trans_parser/Makefile
index 08836b6..1cbd002 100644
--- a/orfeo/maca_trans_parser/Makefile
+++ b/orfeo/maca_trans_parser/Makefile
@@ -1,16 +1,19 @@
-CONLL07TRAIN=../data/treebank/train.conll07
-CONLL07DEV=../data/treebank/dev.conll07
-CONLL07TEST=../data/treebank/test.conll07
+# Settings for the shared parser rules included at the bottom of this file.
+# NOTE(review): ../data/treebank/Makefile has dev.mcf commented out of its
+# compile goal; confirm the included makefile does not need MCF_DEV to exist.
+MCF_TRAIN=../data/treebank/train.mcf
+MCF_DEV=../data/treebank/dev.mcf
+MCF_TEST=../data/treebank/test.mcf
 
 CFF_TRAIN=train.cff
 CFF_CUTOFF_TRAIN=train.cutoff.cff
 PERCEPTRON_ITERATIONS=5
-CFF_CUTOFF=1
+CFF_CUTOFF=3
 FEATURES_MODEL_FILENAME=maca_trans_parser.fm
 VOCABS_FILENAME=maca_trans_parser.vocab 
 MODEL_FILENAME=maca_trans_parser.model 
-NUMBER_OF_SENTENCES=10000000
-STREAM_MODE=
-#STREAM_MODE= -S
+NUMBER_OF_SENTENCES=4218
+MCD_FILENAME=wplgfs.mcd
+STREAM_MODE= -S
 
 include ../../makefiles/maca_trans_parser.makefile
diff --git a/orfeo/maca_trans_tagger/Makefile b/orfeo/maca_trans_tagger/Makefile
index b7f0632..baa1fbd 100644
--- a/orfeo/maca_trans_tagger/Makefile
+++ b/orfeo/maca_trans_tagger/Makefile
@@ -1,22 +1,23 @@
-CONLL07TRAIN=../data/treebank/train.conll07
-CONLL07DEV=../data/treebank/dev.conll07
-CONLL07TEST=../data/treebank/test.conll07
+# Settings for the shared tagger rules included at the bottom of this file.
+# NOTE(review): ../data/treebank/Makefile has dev.mcf commented out of its
+# compile goal; confirm the included makefile does not need MCF_DEV to exist.
+MCF_TRAIN=../data/treebank/train.mcf
+MCF_DEV=../data/treebank/dev.mcf
+MCF_TEST=../data/treebank/test.mcf
 
 
 CFF_TRAIN=train.cff
 CFF_CUTOFF_TRAIN=train.cutoff.cff
-PERCEPTRON_ITERATIONS=5
+PERCEPTRON_ITERATIONS=9
 CFF_CUTOFF=1
-
 FEATURES_MODEL_FILENAME=maca_trans_tagger.fm
 VOCABS_FILENAME=maca_trans_tagger.vocab 
 MCD_FILENAME=maca_trans_tagger.mcd
 MODEL_FILENAME=maca_trans_tagger.model 
 NUMBER_OF_SENTENCES=10000000
-STREAM_MODE=
-#STREAM_MODE= -S
+STREAM_MODE= -S
 
 FORM_POS_FILENAME=../data/morpho-lexicon/fP
 
 include ../../makefiles/maca_trans_tagger.makefile
 
diff --git a/tools/conll_lib.c b/tools/conll_lib.c
index b55d86a..fd05b6c 100644
--- a/tools/conll_lib.c
+++ b/tools/conll_lib.c
@@ -164,9 +164,10 @@ int parse_line(FILE *f, sentence *s)
     /* 3	storm	storm	_	NN	_	4	nsubj	_	_ */
     /* 4	swept	sweep	_	VBD	_	26	ccomp	_	_ */
     /* 5	through	through	_	IN	_	4	prep	_	_ */
-    
-    sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel);
-    /*    printf("form = %s\n", w->form);
+     
+    /* sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); */
+    sscanf(buff, "%d\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel);
+    /*       printf("form = %s\n", w->form);
     printf("lemma = %s\n", w->lemma);
     printf("cpostag = %s\n", w->cpostag);
     printf("postag = %s\n", w->postag);
diff --git a/tools/eval_wplgfs.pl b/tools/eval_wplgfs.pl
index bdc0eeb..de3c408 100755
--- a/tools/eval_wplgfs.pl
+++ b/tools/eval_wplgfs.pl
@@ -68,9 +68,13 @@ sub is_punctuation_ftb{
 
 while(<REF>){
     $line_nb++;
-    ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split;
+    # Unlike bare split, split /\t/ keeps the line terminator on the last
+    # field, and a value such as "0\n" tests true below; strip it first.
+    chomp;
+    ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/;
     $_ = <HYP>;
-    ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split;
+    chomp if defined $_;
+    ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/;
 
     if($ref_seg){ $nb_ref_seg++;}
     if($hyp_seg){ $nb_hyp_seg++;}
-- 
GitLab