diff --git a/orfeo/data/treebank/Makefile b/orfeo/data/treebank/Makefile index 802e4dc563909f6690ab9aa6d84921f606259cbd..2bd0183d59d46fd923edb8c19e70e1988999f188 100644 --- a/orfeo/data/treebank/Makefile +++ b/orfeo/data/treebank/Makefile @@ -1,46 +1,14 @@ -DECODA_DIR=../../../data/decoda -TOOLS_GEN=../../../tools -#TOOLS_ORFEO=../../tools -TOOLS_ORFEO=$(TOOLS_GEN) +TOOLS=../../../tools +TRAIN=orfeo.train.conll07 +TEST=orfeo.test.conll07 -CORPUS=$(DECODA_DIR)/corpus_decoda_lot1_2_silver.tsv -TEST=$(DECODA_DIR)/corpus_decoda_gold1_checked_tbaz11decembre2013.tsv +compile: train.mcf test.mcf #dev.mcf -TRAIN=decoda.train.conll07 -TRAIN_NODISF=decoda.train.nodisf.conll07 -TRAIN_NOTRONC=decoda.train.notronc.conll07 +train.mcf: $(TRAIN) + $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@ -compile: train.conll07 test.conll07 - -#elimine les mots tronques -train.conll07: decoda.train.conll07 - $(TOOLS_ORFEO)/decoda2orfeo -t -f $< > $@ - -#elimine les mots tronques et les disfluences -#train.conll07: decoda.train.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -td -f $< > $@ - -#train.conll07: decoda.train.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -f $< > $@ - -decoda.train.conll07: $(CORPUS) ./split_decoda.pl - ./split_decoda.pl < $< - cat decoda.train.tsv |$(TOOLS_ORFEO)/process_decoda_tsv -lemma list_mot_pos_lemme_lefff.txt -fmtout conll07 -addlinkdisf > $@ - -test.conll07: decoda.test.conll07 - $(TOOLS_ORFEO)/decoda2orfeo -t -f $< > $@ - -#test.nodisf.conll07: decoda.test.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -td -f $< > $@ - -#test.conll07: decoda.test.conll07 -# $(TOOLS_ORFEO)/decoda2orfeo -f $< > $@ - -decoda.test.conll07: $(TEST) - cat $< |$(TOOLS_ORFEO)/process_decoda_tsv -lemma list_mot_pos_lemme_lefff.txt -fmtout conll07 -addlinkdisf > $@ +test.mcf: $(TEST) + $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@ clean: - - rm decoda.train.conll07 - - rm decoda.test.conll07 - - rm test.conll07 - - rm train.conll07 + - rm test.mcf train.mcf diff --git a/orfeo/maca_trans_parser/Makefile b/orfeo/maca_trans_parser/Makefile index 08836b6f4bca6579022affa3c3f92cc05baa0e39..1cbd002e475b1346554c41a2a9196c08c091bcf9 100644 --- a/orfeo/maca_trans_parser/Makefile +++ b/orfeo/maca_trans_parser/Makefile @@ -1,16 +1,17 @@ -CONLL07TRAIN=../data/treebank/train.conll07 -CONLL07DEV=../data/treebank/dev.conll07 -CONLL07TEST=../data/treebank/test.conll07 +MCF_TRAIN=../data/treebank/train.mcf +MCF_DEV=../data/treebank/dev.mcf +MCF_TEST=../data/treebank/test.mcf CFF_TRAIN=train.cff CFF_CUTOFF_TRAIN=train.cutoff.cff PERCEPTRON_ITERATIONS=5 -CFF_CUTOFF=1 +CFF_CUTOFF=3 FEATURES_MODEL_FILENAME=maca_trans_parser.fm VOCABS_FILENAME=maca_trans_parser.vocab MODEL_FILENAME=maca_trans_parser.model -NUMBER_OF_SENTENCES=10000000 -STREAM_MODE= -#STREAM_MODE= -S +NUMBER_OF_SENTENCES=4218 +#NUMBER_OF_SENTENCES=1000 +MCD_FILENAME=wplgfs.mcd +STREAM_MODE= -S include ../../makefiles/maca_trans_parser.makefile diff --git a/orfeo/maca_trans_tagger/Makefile b/orfeo/maca_trans_tagger/Makefile index b7f0632281d0ad8cf39882bab33bb20851239722..baa1fbdc6149fa78935b2be6131675411f9fd59f 100644 --- a/orfeo/maca_trans_tagger/Makefile +++ b/orfeo/maca_trans_tagger/Makefile @@ -1,22 +1,21 @@ -CONLL07TRAIN=../data/treebank/train.conll07 -CONLL07DEV=../data/treebank/dev.conll07 -CONLL07TEST=../data/treebank/test.conll07 +MCF_TRAIN=../data/treebank/train.mcf +MCF_DEV=../data/treebank/dev.mcf +MCF_TEST=../data/treebank/test.mcf CFF_TRAIN=train.cff CFF_CUTOFF_TRAIN=train.cutoff.cff -PERCEPTRON_ITERATIONS=5 +PERCEPTRON_ITERATIONS=9 CFF_CUTOFF=1 - FEATURES_MODEL_FILENAME=maca_trans_tagger.fm VOCABS_FILENAME=maca_trans_tagger.vocab MCD_FILENAME=maca_trans_tagger.mcd MODEL_FILENAME=maca_trans_tagger.model NUMBER_OF_SENTENCES=10000000 -STREAM_MODE= -#STREAM_MODE= -S +STREAM_MODE= -S FORM_POS_FILENAME=../data/morpho-lexicon/fP +#include ./maca_trans_tagger.makefile include ../../makefiles/maca_trans_tagger.makefile diff --git a/tools/conll_lib.c b/tools/conll_lib.c index b55d86ad6ae86652fd923eee57ee13063d48ee0c..fd05b6cb91b093b60877635a9f42e1d245a71b3e 100644 --- a/tools/conll_lib.c +++ b/tools/conll_lib.c @@ -164,9 +164,10 @@ int parse_line(FILE *f, sentence *s) /* 3 storm storm _ NN _ 4 nsubj _ _ */ /* 4 swept sweep _ VBD _ 26 ccomp _ _ */ /* 5 through through _ IN _ 4 prep _ _ */ - - sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); - /* printf("form = %s\n", w->form); + + /* sscanf(buff, "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); */ + sscanf(buff, "%d\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%[^\t]\t%s",&(w->id), w->form, w->lemma, w->cpostag, w->postag, w->feats, head_str, w->deprel); + /* printf("form = %s\n", w->form); printf("lemma = %s\n", w->lemma); printf("cpostag = %s\n", w->cpostag); printf("postag = %s\n", w->postag); diff --git a/tools/eval_wplgfs.pl b/tools/eval_wplgfs.pl index bdc0eeb9d866088d591254cae06cff52e8aeff05..de3c408598018a9311217717a4aafe472af433c1 100755 --- a/tools/eval_wplgfs.pl +++ b/tools/eval_wplgfs.pl @@ -68,9 +68,9 @@ sub is_punctuation_ftb{ while(<REF>){ $line_nb++; - ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split; + ($ref_form, $ref_pos, $ref_lemma, $ref_gov, $ref_fct, $ref_seg) = split /\t/; $_ = <HYP>; - ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split; + ($hyp_form, $hyp_pos, $hyp_lemma, $hyp_gov, $hyp_fct, $hyp_seg) = split /\t/; if($ref_seg){ $nb_ref_seg++;} if($hyp_seg){ $nb_hyp_seg++;}