Skip to content
Snippets Groups Projects
Commit 1c005dbf authored by Alexis Nasr
Browse files

new architecture for datcha

parent 97e89ac4
No related branches found
No related tags found
No related merge requests found
......@@ -3,17 +3,17 @@
all: compile install evaluation
compile:
$(MAKE) -C data/morpho-lexicon compile
$(MAKE) -C data/treebank compile
$(MAKE) -C maca_trans_parser compile
$(MAKE) -C data/morpho-lexicon compile
# $(MAKE) -C maca_trans_parser compile
$(MAKE) -C maca_trans_tagger compile
# $(MAKE) -C maca_crf_tagger compile
install:
-mkdir -p bin
$(MAKE) -C maca_trans_parser install
# $(MAKE) -C maca_trans_parser install
$(MAKE) -C maca_trans_tagger install
$(MAKE) -C maca_lemmatizer install
# $(MAKE) -C maca_lemmatizer install
# $(MAKE) -C maca_crf_tagger install
# @tar -cvzf ./maca_datas.tgz bin
......@@ -23,7 +23,7 @@ evaluation:
clean:
$(MAKE) -C data/morpho-lexicon clean
$(MAKE) -C data/treebank clean
$(MAKE) -C maca_trans_parser clean
# $(MAKE) -C maca_trans_parser clean
$(MAKE) -C maca_trans_tagger clean
# $(MAKE) -C maca_crf_tagger clean
$(MAKE) -C eval clean
......@@ -2,14 +2,25 @@ LEXIQUE_ORFEO_DIR=../../../data/lexique-orfeo
TOOLS_MACA_DATA=../../../tools/
TOOLS=../../tools/
TOOLS_MACA_DATA=../../../tools
CORPUS=../treebank/train.mcf
compile: fplm fP
fplm: ./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR)/*.sfplm
./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR) | sort | uniq | ./fplm_change_pos.pl > fplm
fplm: fplm_datcha fplm_orfeo
cat fplm_datcha fplm_orfeo | sort | uniq > fplm
fplm_datcha: $(CORPUS)
./tchat2fplm.perl $(CORPUS) | sort | uniq > $@
fplm_orfeo: ./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR)/*.sfplm
./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR) | sort | uniq | ./fplm_change_pos.pl > $@
fP: fplm
$(TOOLS_MACA_DATA)fplm2fP.pl $< > $@
$(TOOLS_MACA_DATA)/fplm2fP.pl $< > $@
clean:
-rm fplm_datcha
-rm fplm_orfeo
-rm fplm
-rm fP
#!/usr/bin/perl
# Turn a tab-separated corpus (files named on the command line, or standard
# input) into lexicon entries, one output line per input line:
#
#   <corrected form> TAB <pos> TAB <lemma> TAB #####
#
# Input columns, named as in the original version of this script:
#   1 mot, 2 mot_corrige, 3 err, 4 pos, 5 lemme, 6 num, 7 locuteur, 8 timestamp
# Only columns 2, 4 and 5 are used.
use strict;

while (my $line = <>) {
    # Remove the line terminator first: without this, a line with fewer
    # columns than expected carries its newline into the last field that is
    # present and splits the output record in two.
    chomp $line;

    my (undef, $mot_corrige, undef, $pos, $lemme) = split /\t/, $line;

    # Columns missing from a short line are written as empty fields.
    for my $field ($mot_corrige, $pos, $lemme) {
        $field = '' unless defined $field;
    }

    # The corrected form (column 2) is emitted rather than the raw form
    # (column 1).
    print "$mot_corrige\t$pos\t$lemme\t#####\n";
}
TOOLS_DIR=../../../tools
TOOLS_DATCHA_DIR=../../tools
FTB_DIR=../../../data/ftb
TRAIN=$(FTB_DIR)/ftb.train.conll07
TEST=$(FTB_DIR)/ftb.test.conll07
DEV=$(FTB_DIR)/ftb.dev.conll07
TCHAT91=../../../data/orange-tchat91/tchat91.refMan.norm
compile: dev.mcf test.mcf train.mcf tchat91.mcf
train.mcf: $(TRAIN)
$(TOOLS_DIR)/ftb2datcha -f $< > $@
tchat91.mcf: $(TCHAT91)
$(TOOLS_DATCHA_DIR)/datcha2mcf.pl < $< > $@
dev.mcf: $(DEV)
$(TOOLS_DIR)/ftb2datcha -f $< > $@
test.mcf: $(TEST)
$(TOOLS_DIR)/ftb2datcha -f $< > $@
CORPUS=tchat91.corpus_aligne.lc
compile: $(CORPUS)
../../tools/datcha2mcf.pl $< > tt
./split.perl tt
clean:
- rm test.mcf train.mcf dev.mcf tmp tchat91.mcf
- rm test.mcf train.mcf tt
#!/usr/bin/perl
# Split a tab-separated corpus (files named on the command line, or standard
# input) into two files written in the current directory:
#
#   train.mcf  lines whose sixth column contains "tchat<N>" with N < 47
#   test.mcf   all other lines
#
# Every input line is copied unchanged to exactly one of the two files.
use strict;

# Fail loudly if an output file cannot be created; an unchecked open would
# silently discard every line printed to that handle.
open(my $test_fh, '>', 'test.mcf')
    or die "cannot open test.mcf for writing: $!\n";
open(my $train_fh, '>', 'train.mcf')
    or die "cannot open train.mcf for writing: $!\n";

while (<>) {
    # Only the sixth column (called $num in the original script) is needed.
    my (undef, undef, undef, undef, undef, $num) = split /\t/;

    # NOTE(review): $1 is read without checking whether the match succeeded,
    # exactly as in the original; what a non-matching line (e.g. a blank
    # line) sees in $1 should be confirmed before relying on it.
    $num =~ /tchat([0-9]*)/;
    my $n = $1;

    if ($n < 47) {
        print {$train_fh} $_;
    }
    else {
        print {$test_fh} $_;
    }
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($train_fh) or die "error while writing train.mcf: $!\n";
close($test_fh)  or die "error while writing test.mcf: $!\n";
This diff is collapsed.
# Settings for the tagger evaluation rules that follow.
#
# Of the variables below, the rules visible in this file reference only
# MCF_TEST, EVAL_MCF, FEATURES_MODEL_FILENAME, VOCABS_FILENAME, MCD_FILENAME,
# MODEL_FILENAME and FORM_POS_FILENAME.
# NOTE(review): the remaining ones are presumably consumed elsewhere (for
# instance by an included makefile) -- confirm before removing any of them.
TEST=../data/treebank/tchat91.mcf
#TEST=../data/treebank/test.mcf
DEV=../data/treebank/dev.mcf
LANGUAGE=datcha
MCD_FILE=eval.mcd
# Corpora. MCF_DEV and MCF_TEST name the same file (test.mcf).
MCF_TRAIN=../data/treebank/train.mcf
MCF_DEV=../data/treebank/test.mcf
MCF_TEST=../data/treebank/test.mcf
# Scoring script. This is an absolute path inside one user's home directory;
# on another machine override it on the command line: make EVAL_MCF=/path/to/eval_mcf.pl
EVAL_MCF=/home/alexis/gitlab/maca_data2/tools/eval_mcf.pl
CFF_TRAIN=train.cff
CFF_CUTOFF_TRAIN=train.cutoff.cff
PERCEPTRON_ITERATIONS=5
CFF_CUTOFF=1
# Feature model, vocabulary and model are read from ../bin; the column
# description (MCD_FILENAME) is read from the current directory.
FEATURES_MODEL_FILENAME=../bin/maca_trans_tagger.fm
VOCABS_FILENAME=../bin/maca_trans_tagger.vocab
MCD_FILENAME=./maca_trans_tagger.mcd
MODEL_FILENAME=../bin/maca_trans_tagger.model
NUMBER_OF_SENTENCES=10000000
# Whitespace after '=' is not part of the value: STREAM_MODE is "-S".
STREAM_MODE= -S
# File passed to the tagger with -P.
FORM_POS_FILENAME=../data/morpho-lexicon/fP
# `eval` names an action, not a file. Declaring it phony keeps a stray file
# called "eval" from making the target look up to date.
.PHONY: eval

# Default evaluation: tag the three input variants and score each of them.
eval: test_Wp test_cp test_wp

# --- Single columns cut out of the test corpus $(MCF_TEST) -----------------
# Each file test_X holds one column of the corpus (column number after -f).

test_W: $(MCF_TEST)
	cut -f 2 $< > $@

test_w: $(MCF_TEST)
	cut -f 1 $< > $@

test_c: $(MCF_TEST)
	cut -f 9 $< > $@

test_P: $(MCF_TEST)
	cut -f 4 $< > $@

test_S: $(MCF_TEST)
	cut -f 3 $< > $@

test_L: $(MCF_TEST)
	cut -f 7 $< > $@

# Reference file used for scoring: columns 2 and 4 side by side.
test_WP: test_W test_P
	paste test_W test_P > $@

# --- Tagging and scoring -----------------------------------------------------
# Shared recipe: run the tagger on the first prerequisite ($<), write its
# output to the target ($@), then score that output against test_WP.
define tag-and-score
maca_trans_tagger -m $(MODEL_FILENAME) -V $(VOCABS_FILENAME) --feat_model $(FEATURES_MODEL_FILENAME) -C $(MCD_FILENAME) -i $< -P $(FORM_POS_FILENAME) > $@
$(EVAL_MCF) -G WP -g test_WP -S WP -s $@
endef

# The input file must stay the FIRST prerequisite: the recipe reads it via $<.
test_Wp: test_W test_WP
	$(tag-and-score)

test_wp: test_w test_WP
	$(tag-and-score)

test_cp: test_c test_WP
	$(tag-and-score)

# --- Summary table -----------------------------------------------------------
# Pastes the extracted columns and the second column of each tagger output
# into one file. Besides `total` itself, this recipe also leaves the three
# helper files 'test_p[W]', 'test_p[w]' and 'test_p[c]' behind.
total: test_Wp test_WP test_wp test_cp test_S test_L
	cut -f 2 test_Wp > 'test_p[W]'
	cut -f 2 test_wp > 'test_p[w]'
	cut -f 2 test_cp > 'test_p[c]'
	paste test_w test_c test_W test_S test_P 'test_p[w]' 'test_p[c]' 'test_p[W]' test_L > $@
##-----------------------------------------------------------------------
## clean
##-----------------------------------------------------------------------
# `clean` is an action, not a file: declare it phony so that a file named
# "clean" cannot disable it.
.PHONY: clean

# Remove every file the evaluation rules of this makefile create: the
# extracted columns, the reference file, the tagger outputs, the helper
# files written by the `total` recipe, and `total` itself.
# $(RM) is make's predefined "rm -f", so missing files are not an error.
clean:
	$(RM) test_W test_w test_c test_P test_S test_L
	$(RM) test_WP
	$(RM) test_Wp test_wp test_cp
	$(RM) 'test_p[W]' 'test_p[w]' 'test_p[c]'
	$(RM) total
include ../../makefiles/eval.makefile
0 INDEX INT _
1 FORM VOCAB _
2 POS VOCAB _
3 LEMMA VOCAB _
1 FORM VOCAB _
4 POS VOCAB _
MCF_TRAIN=../data/treebank/train.mcf
MCF_DEV=../data/treebank/dev.mcf
MCF_DEV=../data/treebank/test.mcf
MCF_TEST=../data/treebank/test.mcf
EVAL_MCF=/home/alexis/gitlab/maca_data2/tools/eval_mcf.pl
CFF_TRAIN=train.cff
CFF_CUTOFF_TRAIN=train.cutoff.cff
PERCEPTRON_ITERATIONS=5
CFF_CUTOFF=1
FEATURES_MODEL_FILENAME=maca_trans_tagger.fm
VOCABS_FILENAME=maca_trans_tagger.vocab
MCD_FILENAME=maca_trans_tagger.mcd
......@@ -17,5 +16,47 @@ STREAM_MODE= -S
FORM_POS_FILENAME=../data/morpho-lexicon/fP
include ../../makefiles/maca_trans_tagger.makefile
##-----------------------------------------------------------------------
## compile
##-----------------------------------------------------------------------

# Building this directory means producing the perceptron model and the file
# named by $(FANN_TRAIN).
compile: $(MODEL_FILENAME) $(FANN_TRAIN)

# Model: perceptron_train run on the output of cff_cutoff.
# (To train on the unfiltered features instead, make this rule depend on
# $(CFF_TRAIN) rather than $(CFF_CUTOFF_TRAIN).)
$(MODEL_FILENAME): $(CFF_CUTOFF_TRAIN)
	perceptron_train \
		--cff $< \
		--model $(MODEL_FILENAME) \
		-n $(PERCEPTRON_ITERATIONS)

# Feature file after applying the cutoff $(CFF_CUTOFF).
$(CFF_CUTOFF_TRAIN): $(CFF_TRAIN)
	cff_cutoff \
		--input $< \
		--vocabs $(VOCABS_FILENAME) \
		--cutoff $(CFF_CUTOFF) \
		> $@

# Feature file extracted from the training corpus with the main feature
# model and vocabulary files.
$(CFF_TRAIN): $(MCF_TRAIN)
	maca_trans_tagger_mcf2cff \
		-C $(MCD_FILENAME) \
		--input $< \
		--mode TRAIN \
		--feat_model $(FEATURES_MODEL_FILENAME) \
		--vocabs $(VOCABS_FILENAME) \
		--cff $@ \
		-s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) \
		-P $(FORM_POS_FILENAME)

# Same extraction, using the FANN feature model and vocabulary files.
$(CFF_FANN_TRAIN): $(MCF_TRAIN)
	maca_trans_tagger_mcf2cff \
		-C $(MCD_FILENAME) \
		--input $< \
		--mode TRAIN \
		--feat_model $(FEATURES_MODEL_FANN_FILENAME) \
		--vocabs $(VOCABS_FANN_FILENAME) \
		--cff $@ \
		-s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) \
		-P $(FORM_POS_FILENAME)

# Output of cff2fann on the FANN feature file.
$(FANN_TRAIN): $(CFF_FANN_TRAIN)
	cff2fann \
		--vocabs $(VOCABS_FANN_FILENAME) \
		--cff $< \
		--feat_model $(FEATURES_MODEL_FANN_FILENAME) \
		-C $(MCD_FILENAME) \
		> $@
##-----------------------------------------------------------------------
## install
##-----------------------------------------------------------------------
# `install` is an action, not a file.
.PHONY: install

# Copy the feature model, vocabulary, model and form/POS file into ../bin.
#
# The directory is created first: if ../bin did not exist, "cp FILE ../bin"
# would create a regular FILE named "bin" and each later copy would
# overwrite it. The trailing slash makes cp fail rather than do that.
#
# The copies keep their "-" prefix (errors ignored), as in the original
# recipe, so a missing file does not abort the remaining copies.
install:
	mkdir -p ../bin
	-cp $(FEATURES_MODEL_FILENAME) ../bin/
	-cp $(VOCABS_FILENAME) ../bin/
	-cp $(MODEL_FILENAME) ../bin/
	-cp $(FORM_POS_FILENAME) ../bin/
##-----------------------------------------------------------------------
## clean
##-----------------------------------------------------------------------
# `clean` is an action, not a file: declare it phony so that a file named
# "clean" cannot disable it.
.PHONY: clean

# Remove the generated vocabulary, model and feature files.
# $(RM) is make's predefined "rm -f", so missing files are not an error.
clean:
	$(RM) $(VOCABS_FILENAME)
	$(RM) $(MODEL_FILENAME)
	$(RM) $(CFF_TRAIN)
	$(RM) $(CFF_CUTOFF_TRAIN)
b0U1
b0sgn
b1sgn
b2sgn
b0f
b1f
b2f
s0f
s1f
s0p
s1p
s2p
s0p s1p
s0p s1p s2p
s1p s2p
#b1f
#b2f
b0len
bm1f
bm2f
bm1p
bm2p
bm3p
bm2p bm1p
bm2p bm3p
bm1p b0sgn
b0s1
b0s1 b0s2
b0s1 b0s2 b0s3
b0s1 b0s2 b0s3 b0s4
1 FORM VOCAB _
2 POS VOCAB _
9 FORM VOCAB _
4 POS VOCAB _
......@@ -173,7 +173,7 @@ while(<REF>){
# if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
# if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos)) && (!is_punctuation_ud($ref_pos))){
if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";}
# if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";}
$word_nb++;
$pos_nb{$ref_pos}++;
$fct_nb{$ref_fct}++;
......@@ -240,7 +240,7 @@ my $uas = $correct_gov_total_nb / $word_nb * 100 ;
my $seg_recall = $nb_hyp_ref_seg / $nb_ref_seg;
my $seg_recall = $nb_hyp_ref_seg / ($nb_ref_seg + 1);
my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment