From 9f27e7bb9d28d3238e25b46ddad5ab36073a1897 Mon Sep 17 00:00:00 2001 From: Alexis Nasr <alexis.nasr@lif.univ-mrs.fr> Date: Thu, 26 Jan 2017 09:37:51 +0100 Subject: [PATCH] bug fixed in eval_wplgfs.pl and in conll_lib --- orfeo/data/treebank/Makefile | 50 ++++++-------------------------- orfeo/maca_trans_parser/Makefile | 15 +++++----- orfeo/maca_trans_tagger/Makefile | 13 ++++----- tools/conll_lib.c | 7 +++-- tools/eval_wplgfs.pl | 4 +-- 5 files changed, 29 insertions(+), 60 deletions(-) diff --git a/orfeo/data/treebank/Makefile b/orfeo/data/treebank/Makefile index 802e4dc..2bd0183 100644 --- a/orfeo/data/treebank/Makefile +++ b/orfeo/data/treebank/Makefile @@ -1,46 +1,14 @@ -DECODA_DIR=../../../data/decoda -TOOLS_GEN=../../../tools -#TOOLS_ORFEO=../../tools -TOOLS_ORFEO=$(TOOLS_GEN) +TOOLS=../../../tools +TRAIN=orfeo.train.conll07 +TEST=orfeo.test.conll07 -CORPUS=$(DECODA_DIR)/corpus_decoda_lot1_2_silver.tsv -TEST=$(DECODA_DIR)/corpus_decoda_gold1_checked_tbaz11decembre2013.tsv +compile: train.mcf test.mcf #dev.mcf -TRAIN=decoda.train.conll07 -TRAIN_NODISF=decoda.train.nodisf.conll07 -TRAIN_NOTRONC=decoda.train.notronc.conll07 +train.mcf: $(TRAIN) + $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@ -compile: train.conll07 test.conll07 - -#elimine les mots tronques -train.conll07: decoda.train.conll07 - $(TOOLS_ORFEO)/decoda2orfeo -t -f $< > $@ - -#elimine les mots tronques et les disfluences -#train.conll07: decoda.train.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -td -f $< > $@ - -#train.conll07: decoda.train.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -f $< > $@ - -decoda.train.conll07: $(CORPUS) ./split_decoda.pl - ./split_decoda.pl < $< - cat decoda.train.tsv |$(TOOLS_ORFEO)/process_decoda_tsv -lemma list_mot_pos_lemme_lefff.txt -fmtout conll07 -addlinkdisf > $@ - -test.conll07: decoda.test.conll07 - $(TOOLS_ORFEO)/decoda2orfeo -t -f $< > $@ - -#test.nodisf.conll07: decoda.test.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -td -f $< > $@ - -#test.conll07: decoda.test.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -f $< > $@ - -decoda.test.conll07: $(TEST) - cat $< |$(TOOLS_ORFEO)/process_decoda_tsv -lemma list_mot_pos_lemme_lefff.txt -fmtout conll07 -addlinkdisf > $@ +test.mcf: $(TEST) + $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@ clean: - - rm decoda.train.conll07 - - rm decoda.test.conll07 - - rm test.conll07 - - rm train.conll07 + - rm test.mcf train.mcf diff --git a/orfeo/maca_trans_parser/Makefile b/orfeo/maca_trans_parser/Makefile index 08836b6..1cbd002 100644 --- a/orfeo/maca_trans_parser/Makefile +++ b/orfeo/maca_trans_parser/Makefile @@ -1,16 +1,17 @@ -CONLL07TRAIN=../data/treebank/train.conll07 -CONLL07DEV=../data/treebank/dev.conll07 -CONLL07TEST=../data/treebank/test.conll07 +MCF_TRAIN=../data/treebank/train.mcf +MCF_DEV=../data/treebank/dev.mcf +MCF_TEST=../data/treebank/test.mcf CFF_TRAIN=train.cff CFF_CUTOFF_TRAIN=train.cutoff.cff PERCEPTRON_ITERATIONS=5 -CFF_CUTOFF=1 +CFF_CUTOFF=3 FEATURES_MODEL_FILENAME=maca_trans_parser.fm VOCABS_FILENAME=maca_trans_parser.vocab MODEL_FILENAME=maca_trans_parser.model -NUMBER_OF_SENTENCES=10000000 -STREAM_MODE= -#STREAM_MODE= -S +NUMBER_OF_SENTENCES=4218 +#NUMBER_OF_SENTENCES=1000 +MCD_FILENAME=wplgfs.mcd +STREAM_MODE= -S include ../../makefiles/maca_trans_parser.makefile diff --git a/orfeo/maca_trans_tagger/Makefile b/orfeo/maca_trans_tagger/Makefile index b7f0632..baa1fbd 100644 --- a/orfeo/maca_trans_tagger/Makefile +++ b/orfeo/maca_trans_tagger/Makefile @@ -1,22 +1,21 @@ -CONLL07TRAIN=../data/treebank/train.conll07 -CONLL07DEV=../data/treebank/dev.conll07 -CONLL07TEST=../data/treebank/test.conll07 +MCF_TRAIN=../data/treebank/train.mcf +MCF_DEV=../data/treebank/dev.mcf +MCF_TEST=../data/treebank/test.mcf CFF_TRAIN=train.cff CFF_CUTOFF_TRAIN=train.cutoff.cff -PERCEPTRON_ITERATIONS=5 +PERCEPTRON_ITERATIONS=9 CFF_CUTOFF=1 - FEATURES_MODEL_FILENAME=maca_trans_tagger.fm VOCABS_FILENAME=maca_trans_tagger.vocab MCD_FILENAME=maca_trans_tagger.mcd MODEL_FILENAME=maca_trans_tagger.model NUMBER_OF_SENTENCES=10000000 -STREAM_MODE= -#STREAM_MODE= -S +STREAM_MODE= -S FORM_POS_FILENAME=../data/morpho-lexicon/fP +#include ./maca_trans_tagger.makefile include ../../makefiles/maca_trans_tagger.makefile diff --git a/tools/conll_lib.c b/tools/conll_lib.c index b55d86a..fd05b6c 100644 --- a/tools/conll_lib.c +++ b/tools/conll_lib.c @@ -164,9 +164,10 @@ int parse_line(FILE *f, sentence *s) /* 3 storm storm _ NN _ 4 nsubj _ _ */ /* 4 swept sweep _ VBD _ 26 ccomp _ _ */ /* 5 through through _ IN _ 4 prep _ _ */ - - sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); - /* printf("form = %s\n", w->form); + + /* sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); */ + sscanf(buff, "%d\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); + /* printf("form = %s\n", w->form); printf("lemma = %s\n", w->lemma); printf("cpostag = %s\n", w->cpostag); printf("postag = %s\n", w->postag); diff --git a/tools/eval_wplgfs.pl b/tools/eval_wplgfs.pl index bdc0eeb..de3c408 100755 --- a/tools/eval_wplgfs.pl +++ b/tools/eval_wplgfs.pl @@ -68,9 +68,9 @@ sub is_punctuation_ftb{ while(<REF>){ $line_nb++; - ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split; + ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/; $_ = <HYP>; - ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split; + ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/; if($ref_seg){ $nb_ref_seg++;} if($hyp_seg){ $nb_hyp_seg++;} -- GitLab