Skip to content
Snippets Groups Projects
Commit 32168603 authored by Alexis Nasr's avatar Alexis Nasr
Browse files

new version of orfeo gold corpus, with speakers and duration

parent 6743e5fb
No related branches found
No related tags found
No related merge requests found
# Directory that holds the conversion tools; conll2mcf is invoked from here.
TOOLS=../../../tools
#TRAIN=orfeo.train.conll07
#TEST=orfeo.test.conll07
#TRAIN=corpus_orfeo_gold_v1.train
#TEST=corpus_orfeo_gold_v1.test
# NOTE(review): TRAIN and TEST are each assigned twice below. make keeps the
# last assignment, so the corpus_orfeo_gold_aligned_v1 *.mcf files are the
# effective inputs. This text looks like a rendered diff, so the v2 pair may
# simply be removed lines -- confirm against the real file.
TRAIN=corpus_orfeo_gold_v2.train
TEST=corpus_orfeo_gold_v2.test
TEST=corpus_orfeo_gold_aligned_v1.test.mcf
TRAIN=corpus_orfeo_gold_aligned_v1.train.mcf
# Builds both output files; the dev.mcf prerequisite is commented out.
compile: train.mcf test.mcf #dev.mcf
# train.mcf is generated from $(TRAIN).
# NOTE(review): two commands below redirect into $@ (conll2mcf, then
# add_time_and_speaker.perl), so the second would overwrite the first; the
# conll2mcf line is presumably the one removed by this commit -- confirm.
train.mcf: $(TRAIN)
$(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
# $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
./add_time_and_speaker.perl $< > $@
# test.mcf is generated from $(TEST) in the same way as train.mcf
# (same NOTE(review) about the two commands writing to $@ applies).
test.mcf: $(TEST)
$(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
# $(TOOLS)/conll2mcf -f $< -1W -2C -3L -4H -5D > $@
./add_time_and_speaker.perl $< > $@
# Removes the two generated files; the leading "-" makes make ignore an rm
# failure (e.g. when the files do not exist).
clean:
- rm test.mcf train.mcf
#!/usr/bin/perl
# Adds an inter-token interval column and a speaker-change column to a
# tab-separated corpus file.
#
# Input (files named on the command line, or standard input), one token per
# line, nine tab-separated fields:
#   word  pos  lemma  head  label  eos  start  end  speaker
# e.g.
#   bonjour INT bonjour -1 dm 0 5.45 5.59 appelant
#
# Output, one line per input line, eight tab-separated fields:
#   word  interval  speaker_change  pos  lemma  head  label  eos
# where
#   interval       = int((start - end of the previous line) * 10), clamped to
#                    the range [-10, 10]; 0 on the very first line
#   speaker_change = 1 if the speaker field differs from the previous line's
#                    speaker field, otherwise 0; 0 on the very first line
use strict;
use warnings;

my $first = 1;       # true until the first input line has been handled
my $end_prec;        # "end" field of the previous line
my $speaker_prec;    # "speaker" field of the previous line

while (my $line = <>) {
    # chomp (not chop): strip only a trailing newline, so a final line that
    # lacks one does not lose the last character of its speaker field.
    chomp $line;
    my ($word, $pos, $lemma, $head, $label, $eos, $start, $end, $speaker)
        = split /\t/, $line;

    my $intervalle_int = 0;
    my $change_speaker = 0;
    if ($first) {
        # No previous token to compare with: both extra columns stay at 0.
        $first = 0;
    }
    else {
        # Gap between this token's start and the previous token's end,
        # scaled by 10, truncated to an integer and clamped to [-10, 10].
        $intervalle_int = int(($start - $end_prec) * 10);
        $intervalle_int = -10 if $intervalle_int < -10;
        $intervalle_int =  10 if $intervalle_int >  10;

        $change_speaker = ($speaker ne $speaker_prec) ? 1 : 0;
    }

    print "$word\t$intervalle_int\t$change_speaker\t$pos\t$lemma\t$head\t$label\t$eos\n";

    $end_prec     = $end;
    $speaker_prec = $speaker;
}
This diff is collapsed.
This diff is collapsed.
......@@ -9,17 +9,19 @@ EVAL_WPLSGF=../../tools/eval_wplsgf.pl
# Paths of the evaluation scripts.
EVAL_WPMLGFS=../../tools/eval_wpmlgfs.pl
EVAL_MCF=../../tools/eval_mcf.pl
# Column indices (as handed to `cut -f`) of each field in $(TEST).
# NOTE(review): POS, LEMMA, GOV, LABEL and SENT_SEG are each assigned twice;
# make keeps the last value (4, 5, 6, 7, 8). The earlier values (2..6) look
# like the pre-change layout shown by the diff -- confirm they are gone from
# the real file.
WORD_COLUMN=1
POS_COLUMN=2
INTERVAL_COLUMN=2
SPKR_CHANGE_COLUMN=3
POS_COLUMN=4
#MORPHO_COLUMN=3
LEMMA_COLUMN=3
GOV_COLUMN=4
LABEL_COLUMN=5
SENT_SEG_COLUMN=6
LEMMA_COLUMN=5
GOV_COLUMN=6
LABEL_COLUMN=7
SENT_SEG_COLUMN=8
# Evaluation results are appended to this file by the eval rules.
RESULT_FILE=$(LANGUAGE).res
# Multi-column description (.mcd) files passed to the tools with -C.
WPMLGFS_MCD_FILE=../../mcd/wpmlgfs.mcd
WPLGFS_MCD_FILE=../../mcd/wplgfs.mcd
WPLSGF_MCD_FILE=../../mcd/wplsgf.mcd
WABPLGFS_MCD_FILE=./wABplgfs.mcd
# `all` just runs the evaluation.
all: eval
test_W: $(TEST)
......@@ -43,6 +45,20 @@ test_F:$(TEST)
# Extract the sentence-segmentation column of $(TEST).
test_S:$(TEST)
cut -f $(SENT_SEG_COLUMN) $< > $@
# Extract the interval column of $(TEST).
test_INTERVAL:$(TEST)
cut -f $(INTERVAL_COLUMN) $< > $@
# Extract the speaker-change column of $(TEST).
test_SPKR_CHANGE:$(TEST)
cut -f $(SPKR_CHANGE_COLUMN) $< > $@
# Word + interval + speaker-change columns, pasted side by side.
test_WAB:test_W test_INTERVAL test_SPKR_CHANGE
paste test_W test_INTERVAL test_SPKR_CHANGE > $@
# test_WAB followed by the test_P column.
test_WABP:test_WAB test_P
paste test_WAB test_P > $@
# test_W followed by the test_P column.
test_WP:test_W test_P
paste test_W test_P > $@
......@@ -52,12 +68,18 @@ test_WP:test_W test_P
# The rules below assemble multi-column files by pasting the single-column
# test_* files side by side, in the order the target name spells out.
test_WPL:test_W test_P test_L
paste test_W test_P test_L > $@
test_WABPL:test_WAB test_P test_L
paste test_WAB test_P test_L > $@
test_WPLGFS:test_W test_P test_L test_G test_F test_S
paste test_W test_P test_L test_G test_F test_S > $@
test_WABPLGFS:test_WAB test_P test_L test_G test_F test_S
paste test_WAB test_P test_L test_G test_F test_S > $@
# Run $(TAGGER) on the word-only file, using the wplgfs column description.
test_Wp: test_W
$(TAGGER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< -S > $@
# Run $(TAGGER) on word + interval + speaker-change, using the wABplgfs
# column description.
test_WABp: test_WAB
$(TAGGER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< -S > $@
# cat $< | $(CRF_TAGGER) -L $(LANGUAGE) > $@
#test_WPm: test_WP
......@@ -69,46 +91,46 @@ test_Wp: test_W
#test_WPMl: test_WPM
# $(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
# Lemmatizer rules: each runs $(LEMMATIZER) on its prerequisite. The WAB*
# variants differ only in using $(WABPLGFS_MCD_FILE) as column description.
test_WPl: test_WP
$(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_WABPl: test_WABP
$(LEMMATIZER) -C $(WABPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_Wpl: test_Wp
$(LEMMATIZER) -C $(WPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
test_WABpl: test_WABp
$(LEMMATIZER) -C $(WABPLGFS_MCD_FILE) -L $(LANGUAGE) -i $< > $@
# Parser rules: each runs $(PARSER) with $(PARSER_OPTIONS) on its prerequisite.
test_WPLgfs: test_WPL
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_WABPLgfs: test_WABPL
$(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_WPlgfs: test_WPl
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_WABPlgfs: test_WABPl
$(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
# NOTE(review): the next two rules repeat test_WPlgfs and test_WABPlgfs
# verbatim. A target defined twice with a recipe makes GNU make warn and keep
# the later recipe; one copy of each is probably a copy-paste leftover.
test_WPlgfs: test_WPl
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_WABPlgfs: test_WABPl
$(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_Wplgfs: test_Wpl
$(PARSER) -L $(LANGUAGE) -C $(WPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
test_WABplgfs: test_WABpl
$(PARSER) -L $(LANGUAGE) -C $(WABPLGFS_MCD_FILE) -i $< $(PARSER_OPTIONS) > $@
# (Re)creates $(RESULT_FILE) containing only the column-header line.
eval_header:
echo "file pos morpho lemma uas las srec sacc" > $(RESULT_FILE)
# Appends one $(EVAL_MCF) result line per system file to $(RESULT_FILE),
# always scoring against the test_WPLGFS reference.
# NOTE(review): two `eval` rules with recipes follow (WPLGFS and WABPLGFS);
# in a single makefile the later recipe replaces the earlier one. This is
# presumably the old/new pair of a rendered diff -- confirm.
# NOTE(review): test_WPlgfs / test_WABPlgfs appear twice in the prerequisite
# lists and their evaluation line is repeated, so that result is written
# twice; looks like a copy-paste leftover.
eval: eval_header test_WPLGFS test_WPLgfs test_WPlgfs test_WPlgfs test_Wplgfs
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPLGFS >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPLgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPlgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_WPlgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WPLGFS -g test_WPLGFS -S WPLGFS -s test_Wplgfs >> $(RESULT_FILE)
# Same evaluation for the files that carry the interval and speaker-change
# columns, scored against the test_WABPLGFS reference.
eval: eval_header test_WABPLGFS test_WABPLgfs test_WABPlgfs test_WABPlgfs test_WABplgfs
$(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPLGFS >> $(RESULT_FILE)
$(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPLgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPlgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABPlgfs >> $(RESULT_FILE)
$(EVAL_MCF) -G WABPLGFS -g test_WABPLGFS -S WABPLGFS -s test_WABplgfs >> $(RESULT_FILE)
# Conversion of MCF files to CoNLL with mcf2conll.
# NOTE(review): each recipe below is preceded by two rule headers (a WP* one
# and a WAB* one). As written, only the second header of each pair owns the
# recipe; the first is presumably the removed line of a rendered diff --
# confirm against the real file.
test_WPLGFS.conll: test_WPLGFS
test_WABPLGFS.conll: test_WABPLGFS
mcf2conll -i $< > $@
test_Wplgfs.conll: test_Wplgfs
test_WABplgfs.conll: test_WABplgfs
mcf2conll -i $< > $@
# NOTE(review): this conversion passes ../../mcd/wplsgf.mcd even for the
# WAB* input -- verify that column description matches the WAB layout.
test_WPLSgf.conll: test_WPLSgf
test_WABPLSgf.conll: test_WABPLSgf
mcf2conll -C ../../mcd/wplsgf.mcd -i $< > $@
# Runs conll17_ud_eval.py on the two .conll files.
# NOTE(review): two eval_ud rules with recipes follow; the later one would
# replace the earlier one in a single makefile (same diff caveat as above).
eval_ud: test_WPLGFS.conll test_WPLSgf.conll
python ../../tools/conll17_ud_eval.py test_WPLGFS.conll test_WPLSgf.conll
eval_ud: test_WABPLGFS.conll test_WABPLSgf.conll
python ../../tools/conll17_ud_eval.py test_WABPLGFS.conll test_WABPLSgf.conll
# Deletes every file matching test_* in the current directory; the leading
# "-" makes make ignore an rm failure. Removal of $(RESULT_FILE) is disabled.
clean:
# -rm $(RESULT_FILE)
-rm test_*
......@@ -9,9 +9,10 @@ CFF_CUTOFF=3
# File names for the parser's feature model, vocabularies and trained model.
FEATURES_MODEL_FILENAME=maca_trans_parser.fm
VOCABS_FILENAME=maca_trans_parser.vocab
MODEL_FILENAME=maca_trans_parser.model
# NOTE(review): the active NUMBER_OF_SENTENCES=4218 line is immediately
# followed by a commented-out copy; in this rendered diff the active line is
# presumably the removed one -- confirm whether the variable is still set.
NUMBER_OF_SENTENCES=4218
#NUMBER_OF_SENTENCES=4218
#NUMBER_OF_SENTENCES=1000
# NOTE(review): MCD_FILENAME is assigned twice; make keeps the last value,
# ./wABplgfs.mcd (same diff caveat as above).
MCD_FILENAME=../../mcd/wplgfs.mcd
#MCD_FILENAME=../../mcd/wplgfs.mcd
MCD_FILENAME=./wABplgfs.mcd
STREAM_MODE= -S
# The variables above are set before pulling in the shared parser makefile,
# which presumably reads them -- that file is not visible here.
include ../../makefiles/maca_trans_parser.makefile
......@@ -70,3 +70,37 @@ t1 t2 t3
bm1p
bm2p
#features taking into account speaker change and duration
b1A
b1B
b1A b0p
b1B b0p
b1A s0p b0p
b1B s0p b0p
b1A b1B
b1A b1B b0p
b1A b1B s0p b0p
b0A
b0B
b0A b0p
b0B b0p
b0A s0p b0p
b0B s0p b0p
b0A b0B
b0A b0B b0p
b0A b0B s0p b0p
b0A b0f
b0B b0f
b1A b0f
b1B b0f
b0A b1f
b0B b1f
b1A b1f
b1B b1f
......@@ -7,9 +7,11 @@ CFF_TRAIN=train.cff
# Tagger training settings.
CFF_CUTOFF_TRAIN=train.cutoff.cff
PERCEPTRON_ITERATIONS=9
CFF_CUTOFF=1
# NOTE(review): FEATURES_MODEL_FILENAME and MCD_FILENAME are each assigned
# twice; make keeps the last value (./maca_trans_tagger.fm and
# ./wABplgfs.mcd). The ../../ values are presumably the lines removed by this
# commit, as this text looks like a rendered diff -- confirm.
FEATURES_MODEL_FILENAME=../../fm/maca_trans_tagger.fm
#FEATURES_MODEL_FILENAME=../../fm/maca_trans_tagger.fm
FEATURES_MODEL_FILENAME=./maca_trans_tagger.fm
VOCABS_FILENAME=maca_trans_tagger.vocab
MCD_FILENAME=../../mcd/wplgfs.mcd
#MCD_FILENAME=../../mcd/wplgfs.mcd
MCD_FILENAME=./wABplgfs.mcd
MODEL_FILENAME=maca_trans_tagger.model
NUMBER_OF_SENTENCES=10000000
STREAM_MODE= -S
......
b0U1
b0len
b0sgn
b1sgn
b0f
b1f
b2f
s0f
s1f
s0p
s1p
s2p
s0p s1p
s0p s1p s2p
s1p s2p
b0f
bm1f
bm2f
bm1p
bm2p
bm3p
bm2p bm1p
bm2p bm3p
bm1p b0sgn
b0s1
#b0s2
#b0s3
#b0s4
#b0s5
b0s1 b0s2
b0s1 b0s2 b0s3
b0s1 b0s2 b0s3 b0s4
b0p1
b0p2
b0p3
b0p4
b0p5
b0p1 b0p2
b0p1 b0p2 b0p3
b0p1 b0p2 b0p3 b0p4
b0A
b0B
b0A b0B
b1A
b1B
b1A b1B
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment