Skip to content
Snippets Groups Projects
Commit 32168603 authored by Alexis Nasr
Browse files

new version of orfeo gold corpus, with speakers and duration

parent 6743e5fb
No related branches found
No related tags found
No related merge requests found
TOOLS=../../../tools TOOLS=../../../tools
#TRAIN=orfeo.train.conll07 #TRAIN=orfeo.train.conll07
#TEST=orfeo.test.conll07 #TEST=orfeo.test.conll07
#TRAIN=corpus_orfeo_gold_v1.train TEST=corpus_orfeo_gold_aligned_v1.test.mcf
#TEST=corpus_orfeo_gold_v1.test TRAIN=corpus_orfeo_gold_aligned_v1.train.mcf
TRAIN=corpus_orfeo_gold_v2.train
TEST=corpus_orfeo_gold_v2.test
compile: train.mcf test.mcf #dev.mcf compile: train.mcf test.mcf #dev.mcf
train.mcf: $(TRAIN) train.mcf: $(TRAIN)
$(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@ # $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
./add_time_and_speaker.perl $< > $@
test.mcf: $(TEST) test.mcf: $(TEST)
$(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@ # $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
./add_time_and_speaker.perl $< > $@
clean: clean:
- rm test.mcf train.mcf - rm test.mcf train.mcf
#!/usr/bin/perl
# Replaces the timing and speaker columns of a tab-separated corpus with two
# derived columns: a discretised pause length and a speaker-change flag.
#
# Input (stdin, or files named on the command line), one token per line,
# nine tab-separated fields:
#   word  pos  lemma  head  label  eos  start  end  speaker
# Example (fields shown space-separated):
#   bonjour INT bonjour -1 dm 0 5.45 5.59 appelant
#
# Output (stdout), one token per line, eight tab-separated fields:
#   word  interval  speaker_change  pos  lemma  head  label  eos
# where
#   interval       = int((start - previous token's end) * 10), clamped to
#                    the range [-10, 10]; 0 for the very first token.
#                    NOTE(review): start/end look like seconds, which would
#                    make the unit tenths of a second -- confirm.
#   speaker_change = 1 when the speaker differs from the previous token's
#                    speaker, 0 otherwise; 0 for the very first token.
use strict;

my $first = 1;                   # true only until the first token is read
my ($end_prec, $speaker_prec);   # end time and speaker of the previous token

while (my $line = <>) {
    # chomp rather than chop: chop removes the last character whatever it
    # is, so a final line without a trailing newline would lose the last
    # letter of its speaker field and be reported as a speaker change.
    chomp $line;

    my ($word, $pos, $lemma, $head, $label, $eos, $start, $end, $speaker)
        = split /\t/, $line;

    # Values used for the first token, which has no predecessor.
    my $intervalle_int = 0;
    my $change_speaker = 0;

    if ($first) {
        $first = 0;
    }
    else {
        # Gap between the previous token's end and this token's start;
        # int() truncates toward zero.
        $intervalle_int = int(($start - $end_prec) * 10);

        # Clamp long pauses and large overlaps to a small closed range.
        $intervalle_int = -10 if $intervalle_int < -10;
        $intervalle_int =  10 if $intervalle_int >  10;

        $change_speaker = ($speaker ne $speaker_prec) ? 1 : 0;
    }

    print "$word\t$intervalle_int\t$change_speaker\t$pos\t$lemma\t$head\t$label\t$eos\n";

    $end_prec     = $end;
    $speaker_prec = $speaker;
}
This diff is collapsed.
This diff is collapsed.
...@@ -9,17 +9,19 @@ EVAL_WPLSGF=../../tools/eval_wplsgf.pl ...@@ -9,17 +9,19 @@ EVAL_WPLSGF=../../tools/eval_wplsgf.pl
EVAL_WPMLGFS=../../tools/eval_wpmlgfs.pl EVAL_WPMLGFS=../../tools/eval_wpmlgfs.pl
EVAL_MCF=../../tools/eval_mcf.pl EVAL_MCF=../../tools/eval_mcf.pl
WORD_COLUMN=1 WORD_COLUMN=1
POS_COLUMN=2 INTERVAL_COLUMN=2
SPKR_CHANGE_COLUMN=3
POS_COLUMN=4
#MORPHO_COLUMN=3 #MORPHO_COLUMN=3
LEMMA_COLUMN=3 LEMMA_COLUMN=5
GOV_COLUMN=4 GOV_COLUMN=6
LABEL_COLUMN=5 LABEL_COLUMN=7
SENT_SEG_COLUMN=6 SENT_SEG_COLUMN=8
RESULT_FILE=$(LANGUAGE).res RESULT_FILE=$(LANGUAGE).res
WPMLGFS_MCD_FILE=../../mcd/wpmlgfs.mcd WPMLGFS_MCD_FILE=../../mcd/wpmlgfs.mcd
WPLGFS_MCD_FILE=../../mcd/wplgfs.mcd WPLGFS_MCD_FILE=../../mcd/wplgfs.mcd
WPLSGF_MCD_FILE=../../mcd/wplsgf.mcd WPLSGF_MCD_FILE=../../mcd/wplsgf.mcd
WABPLGFS_MCD_FILE=./wABplgfs.mcd
all: eval all: eval
test_W: $(TEST) test_W: $(TEST)
...@@ -43,6 +45,20 @@ test_F:$(TEST) ...@@ -43,6 +45,20 @@ test_F:$(TEST)
test_S:$(TEST) test_S:$(TEST)
cut -f $(SENT_SEG_COLUMN) $< > $@ cut -f $(SENT_SEG_COLUMN) $< > $@
test_INTERVAL:$(TEST)
cut -f $(INTERVAL_COLUMN) $< > $@
test_SPKR_CHANGE:$(TEST)
cut -f $(SPKR_CHANGE_COLUMN) $< > $@
test_WAB:test_W test_INTERVAL test_SPKR_CHANGE
paste test_W test_INTERVAL test_SPKR_CHANGE > $@
test_WABP:test_WAB test_P
paste test_WAB test_P > $@
test_WP:test_W test_P test_WP:test_W test_P
paste test_W test_P > $@ paste test_W test_P > $@
...@@ -52,12 +68,18 @@ test_WP:test_W test_P ...@@ -52,12 +68,18 @@ test_WP:test_W test_P
test_WPL:test_W test_P test_L test_WPL:test_W test_P test_L
paste test_W test_P test_L > $@ paste test_W test_P test_L > $@
test_WABPL:test_WAB test_P test_L
paste test_WAB test_P test_L > $@
test_WPLGFS:test_W test_P test_L test_G test_F test_S test_WPLGFS:test_W test_P test_L test_G test_F test_S
paste test_W test_P test_L test_G test_F test_S > $@ paste test_W test_P test_L test_G test_F test_S > $@
test_WABPLGFS:test_WAB test_P test_L test_G test_F test_S
paste test_WAB test_P test_L test_G test_F test_S > $@
test_Wp: test_W test_WABp: test_WAB
$(TAGGER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< -S > $@ $(TAGGER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< -S > $@
# cat $< | $(CRF_TAGGER) -L $(LANGUAGE) > $@ # cat $< | $(CRF_TAGGER) -L $(LANGUAGE) > $@
#test_WPm: test_WP #test_WPm: test_WP
...@@ -69,46 +91,46 @@ test_Wp: test_W ...@@ -69,46 +91,46 @@ test_Wp: test_W
#test_WPMl: test_WPM #test_WPMl: test_WPM
# $(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@ # $(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_WPl: test_WP test_WABPl: test_WABP
$(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@ $(LEMMATIZER) -C $(WABPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_Wpl: test_Wp test_WABpl: test_WABp
$(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@ $(LEMMATIZER) -C $(WABPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_WPLgfs: test_WPL test_WABPLgfs: test_WABPL
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ $(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_WPlgfs: test_WPl test_WABPlgfs: test_WABPl
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ $(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_WPlgfs: test_WPl test_WABPlgfs: test_WABPl
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ $(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_Wplgfs: test_Wpl test_WABplgfs: test_WABpl
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@ $(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
eval_header: eval_header:
echo "file pos morpho lemma uas las srec sacc" > $(RESULT_FILE) echo "file pos morpho lemma uas las srec sacc" > $(RESULT_FILE)
eval: eval_header test_WPLGFS test_WPLgfs test_WPlgfs test_WPlgfs test_Wplgfs eval: eval_header test_WABPLGFS test_WABPLgfs test_WABPlgfs test_WABPlgfs test_WABplgfs
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPLGFS >> $(RESULT_FILE) $(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPLGFS >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPLgfs >> $(RESULT_FILE) $(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPLgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPlgfs >> $(RESULT_FILE) $(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPlgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPlgfs >> $(RESULT_FILE) $(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPlgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_Wplgfs >> $(RESULT_FILE) $(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABplgfs >> $(RESULT_FILE)
test_WPLGFS.conll: test_WPLGFS test_WABPLGFS.conll: test_WABPLGFS
mcf2conll -i $< > $@ mcf2conll -i $< > $@
test_Wplgfs.conll: test_Wplgfs test_WABplgfs.conll: test_WABplgfs
mcf2conll -i $< > $@ mcf2conll -i $< > $@
test_WPLSgf.conll: test_WPLSgf test_WABPLSgf.conll: test_WABPLSgf
mcf2conll -C ../../mcd/wplsgf.mcd -i $< > $@ mcf2conll -C ../../mcd/wplsgf.mcd -i $< > $@
eval_ud: test_WPLGFS.conll test_WPLSgf.conll eval_ud: test_WABPLGFS.conll test_WABPLSgf.conll
python ../../tools/conll17_ud_eval.py test_WPLGFS.conll test_WPLSgf.conll python ../../tools/conll17_ud_eval.py test_WABPLGFS.conll test_WABPLSgf.conll
clean: clean:
# -rm $(RESULT_FILE) # -rm $(RESULT_FILE)
-rm test_* -rm test_*
...@@ -9,9 +9,10 @@ CFF_CUTOFF=3 ...@@ -9,9 +9,10 @@ CFF_CUTOFF=3
FEATURES_MODEL_FILENAME=maca_trans_parser.fm FEATURES_MODEL_FILENAME=maca_trans_parser.fm
VOCABS_FILENAME=maca_trans_parser.vocab VOCABS_FILENAME=maca_trans_parser.vocab
MODEL_FILENAME=maca_trans_parser.model MODEL_FILENAME=maca_trans_parser.model
NUMBER_OF_SENTENCES=4218 #NUMBER_OF_SENTENCES=4218
#NUMBER_OF_SENTENCES=1000 #NUMBER_OF_SENTENCES=1000
MCD_FILENAME=../../mcd/wplgfs.mcd #MCD_FILENAME=../../mcd/wplgfs.mcd
MCD_FILENAME=./wABplgfs.mcd
STREAM_MODE= -S STREAM_MODE= -S
include ../../makefiles/maca_trans_parser.makefile include ../../makefiles/maca_trans_parser.makefile
...@@ -70,3 +70,37 @@ t1 t2 t3 ...@@ -70,3 +70,37 @@ t1 t2 t3
bm1p bm1p
bm2p bm2p
#features taking into account speaker change and duration
b1A
b1B
b1A b0p
b1B b0p
b1A s0p b0p
b1B s0p b0p
b1A b1B
b1A b1B b0p
b1A b1B s0p b0p
b0A
b0B
b0A b0p
b0B b0p
b0A s0p b0p
b0B s0p b0p
b0A b0B
b0A b0B b0p
b0A b0B s0p b0p
b0A b0f
b0B b0f
b1A b0f
b1B b0f
b0A b1f
b0B b1f
b1A b1f
b1B b1f
...@@ -7,9 +7,11 @@ CFF_TRAIN=train.cff ...@@ -7,9 +7,11 @@ CFF_TRAIN=train.cff
CFF_CUTOFF_TRAIN=train.cutoff.cff CFF_CUTOFF_TRAIN=train.cutoff.cff
PERCEPTRON_ITERATIONS=9 PERCEPTRON_ITERATIONS=9
CFF_CUTOFF=1 CFF_CUTOFF=1
FEATURES_MODEL_FILENAME=../../fm/maca_trans_tagger.fm #FEATURES_MODEL_FILENAME=../../fm/maca_trans_tagger.fm
FEATURES_MODEL_FILENAME=./maca_trans_tagger.fm
VOCABS_FILENAME=maca_trans_tagger.vocab VOCABS_FILENAME=maca_trans_tagger.vocab
MCD_FILENAME=../../mcd/wplgfs.mcd #MCD_FILENAME=../../mcd/wplgfs.mcd
MCD_FILENAME=./wABplgfs.mcd
MODEL_FILENAME=maca_trans_tagger.model MODEL_FILENAME=maca_trans_tagger.model
NUMBER_OF_SENTENCES=10000000 NUMBER_OF_SENTENCES=10000000
STREAM_MODE= -S STREAM_MODE= -S
......
b0U1 b0U1
b0len
b0sgn b0sgn
b1sgn b1sgn
b0f
b1f b1f
b2f b0f
s0f bm1f
s1f bm2f
s0p
s1p bm1p
s2p bm2p
s0p s1p bm3p
s0p s1p s2p bm2p bm1p
s1p s2p bm2p bm3p
bm1p b0sgn
b0s1
#b0s2
#b0s3
#b0s4
#b0s5
b0s1 b0s2
b0s1 b0s2 b0s3
b0s1 b0s2 b0s3 b0s4
b0p1
b0p2
b0p3
b0p4
b0p5
b0p1 b0p2
b0p1 b0p2 b0p3
b0p1 b0p2 b0p3 b0p4
b0A
b0B
b0A b0B
b1A
b1B
b1A b1B
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment