Skip to content
Snippets Groups Projects
Commit 1c005dbf authored by Alexis Nasr's avatar Alexis Nasr
Browse files

new architecture for datcha

parent 97e89ac4
No related branches found
No related tags found
No related merge requests found
...@@ -3,17 +3,17 @@ ...@@ -3,17 +3,17 @@
all: compile install evaluation all: compile install evaluation
compile: compile:
$(MAKE) -C data/morpho-lexicon compile
$(MAKE) -C data/treebank compile $(MAKE) -C data/treebank compile
$(MAKE) -C maca_trans_parser compile $(MAKE) -C data/morpho-lexicon compile
# $(MAKE) -C maca_trans_parser compile
$(MAKE) -C maca_trans_tagger compile $(MAKE) -C maca_trans_tagger compile
# $(MAKE) -C maca_crf_tagger compile # $(MAKE) -C maca_crf_tagger compile
install: install:
-mkdir -p bin -mkdir -p bin
$(MAKE) -C maca_trans_parser install # $(MAKE) -C maca_trans_parser install
$(MAKE) -C maca_trans_tagger install $(MAKE) -C maca_trans_tagger install
$(MAKE) -C maca_lemmatizer install # $(MAKE) -C maca_lemmatizer install
# $(MAKE) -C maca_crf_tagger install # $(MAKE) -C maca_crf_tagger install
# @tar -cvzf ./maca_datas.tgz bin # @tar -cvzf ./maca_datas.tgz bin
...@@ -23,7 +23,7 @@ evaluation: ...@@ -23,7 +23,7 @@ evaluation:
clean: clean:
$(MAKE) -C data/morpho-lexicon clean $(MAKE) -C data/morpho-lexicon clean
$(MAKE) -C data/treebank clean $(MAKE) -C data/treebank clean
$(MAKE) -C maca_trans_parser clean # $(MAKE) -C maca_trans_parser clean
$(MAKE) -C maca_trans_tagger clean $(MAKE) -C maca_trans_tagger clean
# $(MAKE) -C maca_crf_tagger clean # $(MAKE) -C maca_crf_tagger clean
$(MAKE) -C eval clean $(MAKE) -C eval clean
...@@ -2,14 +2,25 @@ LEXIQUE_ORFEO_DIR=../../../data/lexique-orfeo ...@@ -2,14 +2,25 @@ LEXIQUE_ORFEO_DIR=../../../data/lexique-orfeo
TOOLS_MACA_DATA=../../../tools/ TOOLS_MACA_DATA=../../../tools/
TOOLS=../../tools/ TOOLS=../../tools/
TOOLS_MACA_DATA=../../../tools
CORPUS=../treebank/train.mcf
compile: fplm fP compile: fplm fP
fplm: ./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR)/*.sfplm fplm: fplm_datcha fplm_orfeo
./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR) | sort | uniq | ./fplm_change_pos.pl > fplm cat fplm_datcha fplm_orfeo | sort | uniq > fplm
fplm_datcha: $(CORPUS)
./tchat2fplm.perl $(CORPUS) | sort | uniq > $@
fplm_orfeo: ./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR)/*.sfplm
./purge_sfplm.pl $(LEXIQUE_ORFEO_DIR) | sort | uniq | ./fplm_change_pos.pl > $@
fP: fplm fP: fplm
$(TOOLS_MACA_DATA)fplm2fP.pl $< > $@ $(TOOLS_MACA_DATA)/fplm2fP.pl $< > $@
clean: clean:
-rm fplm_datcha
-rm fplm_orfeo
-rm fplm -rm fplm
-rm fP -rm fP
#!/usr/bin/perl
# Reads tab-separated lines from the files named on the command line (or from
# standard input) and prints, for every line, columns 2, 4 and 5 followed by
# a literal "#####" field, all separated by tabs.
# NOTE(review): judging by the original variable names these columns are the
# corrected word form, the part of speech and the lemma -- confirm against
# the corpus format.
while (defined(my $row = <>)) {
    # Lines are not chomped, so a line with five or fewer columns keeps its
    # newline inside the last extracted value.
    my (undef, $fixed_form, undef, $tag, $lemma) = split /\t/, $row;
    print join("\t", $fixed_form, $tag, $lemma, '#####'), "\n";
}
TOOLS_DIR=../../../tools CORPUS=tchat91.corpus_aligne.lc
TOOLS_DATCHA_DIR=../../tools
FTB_DIR=../../../data/ftb
TRAIN=$(FTB_DIR)/ftb.train.conll07
TEST=$(FTB_DIR)/ftb.test.conll07
DEV=$(FTB_DIR)/ftb.dev.conll07
TCHAT91=../../../data/orange-tchat91/tchat91.refMan.norm
compile: dev.mcf test.mcf train.mcf tchat91.mcf
train.mcf: $(TRAIN)
$(TOOLS_DIR)/ftb2datcha -f $< > $@
tchat91.mcf: $(TCHAT91)
$(TOOLS_DATCHA_DIR)/datcha2mcf.pl < $< > $@
dev.mcf: $(DEV)
$(TOOLS_DIR)/ftb2datcha -f $< > $@
test.mcf: $(TEST)
$(TOOLS_DIR)/ftb2datcha -f $< > $@
compile: $(CORPUS)
../../tools/datcha2mcf.pl $< > tt
./split.perl tt
clean: clean:
- rm test.mcf train.mcf dev.mcf tmp tchat91.mcf - rm test.mcf train.mcf tt
#!/usr/bin/perl
# Split a tab-separated corpus (files named on the command line, or standard
# input) into two files in the current directory: train.mcf and test.mcf.
#
# The sixth column of each line is searched for "tchat<digits>". Lines whose
# number is below $first_test_id go to train.mcf, all others to test.mcf.
# A line without such a number is written to the same file as the most recent
# line that had one (train.mcf if none has been seen yet). The original got
# this carry-over only implicitly, by reading $1 after an unchecked match;
# it is spelled out here.
use strict;
use warnings;

# First "tchat" number that belongs to the test set.
my $first_test_id = 47;

# Fail loudly instead of silently discarding the whole corpus when an output
# file cannot be created.
open(my $test_fh,  '>', 'test.mcf')  or die "cannot create test.mcf: $!\n";
open(my $train_fh, '>', 'train.mcf') or die "cannot create train.mcf: $!\n";

# Number taken from the most recent line that carried one.
my $current_id = 0;

while (defined(my $line = <>)) {
    my $id_column = (split /\t/, $line)[5];

    # Only trust the capture when the match actually succeeded.
    if (defined $id_column && $id_column =~ /tchat([0-9]*)/) {
        # "tchat" with no digits counts as 0, as the numeric comparison in
        # the original treated the empty capture.
        $current_id = length($1) ? $1 : 0;
    }

    if ($current_id < $first_test_id) {
        print {$train_fh} $line;
    }
    else {
        print {$test_fh} $line;
    }
}

# A failed close means buffered data could not be written.
close($test_fh)  or die "error while writing test.mcf: $!\n";
close($train_fh) or die "error while writing train.mcf: $!\n";
Source diff could not be displayed: it is too large. Options to address this: view the blob.
TEST=../data/treebank/tchat91.mcf MCF_TRAIN=../data/treebank/train.mcf
#TEST=../data/treebank/test.mcf MCF_DEV=../data/treebank/test.mcf
DEV=../data/treebank/dev.mcf MCF_TEST=../data/treebank/test.mcf
LANGUAGE=datcha EVAL_MCF=/home/alexis/gitlab/maca_data2/tools/eval_mcf.pl
# Settings used by the evaluation rules.
# NOTE(review): MCD_FILE, CFF_TRAIN, CFF_CUTOFF_TRAIN, PERCEPTRON_ITERATIONS,
# CFF_CUTOFF, NUMBER_OF_SENTENCES and STREAM_MODE are not referenced by the
# test_* / total / clean rules of this makefile; they may only matter to an
# included makefile -- confirm before removing them.
MCD_FILE=eval.mcd
CFF_TRAIN=train.cff
CFF_CUTOFF_TRAIN=train.cutoff.cff
PERCEPTRON_ITERATIONS=5
CFF_CUTOFF=1
# Files handed to maca_trans_tagger by the test_Wp, test_wp and test_cp
# rules: feature model (--feat_model), vocabularies (-V), column description
# (-C) and model (-m). All but the column description are read from ../bin.
FEATURES_MODEL_FILENAME=../bin/maca_trans_tagger.fm
VOCABS_FILENAME=../bin/maca_trans_tagger.vocab
MCD_FILENAME=./maca_trans_tagger.mcd
MODEL_FILENAME=../bin/maca_trans_tagger.model
NUMBER_OF_SENTENCES=10000000
STREAM_MODE= -S
# Passed to maca_trans_tagger with -P by the same rules.
FORM_POS_FILENAME=../data/morpho-lexicon/fP
# Entry point of the evaluation: builds the three tagger outputs below.
# Declared phony because "eval" names a command, not a file, so a stray file
# called "eval" cannot make the target look up to date.
.PHONY: eval
eval: test_Wp test_cp test_wp

# Single-column extracts of $(MCF_TEST), one file per column.
# NOTE(review): the meaning of each column is not documented here; the file
# names suggest W/w/c are three variants of the word form, P the part of
# speech, S and L further annotations -- confirm against the MCF layout.
test_W: $(MCF_TEST)
	cut -f 2 $< > $@

test_w: $(MCF_TEST)
	cut -f 1 $< > $@

test_c: $(MCF_TEST)
	cut -f 9 $< > $@

test_P: $(MCF_TEST)
	cut -f 4 $< > $@

test_S: $(MCF_TEST)
	cut -f 3 $< > $@

test_L: $(MCF_TEST)
	cut -f 7 $< > $@

# Two-column file (test_W then test_P) used as the reference by the
# tagger rules.
test_WP: test_W test_P
	paste $^ > $@
# Shared recipe of the three tagger runs: tag the first prerequisite ($<)
# into the target ($@), then hand the target to $(EVAL_MCF) together with the
# reference file test_WP.
define run_tagger_and_eval
maca_trans_tagger -m $(MODEL_FILENAME) -V $(VOCABS_FILENAME) --feat_model $(FEATURES_MODEL_FILENAME) -C $(MCD_FILENAME) -i $< -P $(FORM_POS_FILENAME) > $@
$(EVAL_MCF) -G WP -g test_WP -S WP -s $@
endef

# Tagger run on test_W.
test_Wp: test_W test_WP
	$(run_tagger_and_eval)

# Tagger run on test_w.
test_wp: test_w test_WP
	$(run_tagger_and_eval)

# Tagger run on test_c.
test_cp: test_c test_WP
	$(run_tagger_and_eval)
# Side-by-side table of the extracted columns and the three predicted
# columns. Column 2 of each tagger output is first saved as test_p[W],
# test_p[w] and test_p[c]; those file names contain literal brackets, hence
# the quoting.
total: test_Wp test_WP test_wp test_cp test_S test_L
	for v in W w c; do \
	  cut -f 2 "test_$${v}p" > "test_p[$$v]" || exit 1; \
	done
	paste test_w test_c test_W test_S test_P 'test_p[w]' 'test_p[c]' 'test_p[W]' test_L > $@
##-----------------------------------------------------------------------
## clean
##-----------------------------------------------------------------------
# Remove every file written by the test_* and total rules. The previous list
# left test_w, test_c, test_wp, test_S, test_L and the three test_p[...]
# files behind. Declared phony so that a file named "clean" cannot disable
# the target. The leading "-" keeps the clean-up best-effort.
.PHONY: clean
clean:
	-rm -f test_W test_w test_c test_P test_S test_L
	-rm -f test_WP test_Wp test_wp test_cp
	-rm -f 'test_p[W]' 'test_p[w]' 'test_p[c]' total
include ../../makefiles/eval.makefile
0 INDEX INT _
1 FORM VOCAB _
2 POS VOCAB _
3 LEMMA VOCAB _
1 FORM VOCAB _
4 POS VOCAB _
MCF_TRAIN=../data/treebank/train.mcf MCF_TRAIN=../data/treebank/train.mcf
MCF_DEV=../data/treebank/dev.mcf MCF_DEV=../data/treebank/test.mcf
MCF_TEST=../data/treebank/test.mcf MCF_TEST=../data/treebank/test.mcf
EVAL_MCF=/home/alexis/gitlab/maca_data2/tools/eval_mcf.pl
CFF_TRAIN=train.cff CFF_TRAIN=train.cff
CFF_CUTOFF_TRAIN=train.cutoff.cff CFF_CUTOFF_TRAIN=train.cutoff.cff
PERCEPTRON_ITERATIONS=5 PERCEPTRON_ITERATIONS=5
CFF_CUTOFF=1 CFF_CUTOFF=1
FEATURES_MODEL_FILENAME=maca_trans_tagger.fm FEATURES_MODEL_FILENAME=maca_trans_tagger.fm
VOCABS_FILENAME=maca_trans_tagger.vocab VOCABS_FILENAME=maca_trans_tagger.vocab
MCD_FILENAME=maca_trans_tagger.mcd MCD_FILENAME=maca_trans_tagger.mcd
...@@ -17,5 +16,47 @@ STREAM_MODE= -S ...@@ -17,5 +16,47 @@ STREAM_MODE= -S
FORM_POS_FILENAME=../data/morpho-lexicon/fP FORM_POS_FILENAME=../data/morpho-lexicon/fP
include ../../makefiles/maca_trans_tagger.makefile ##-----------------------------------------------------------------------
## compile
##-----------------------------------------------------------------------
# NOTE(review): FANN_TRAIN, CFF_FANN_TRAIN, FEATURES_MODEL_FANN_FILENAME and
# VOCABS_FANN_FILENAME are not assigned in the visible part of this makefile
# -- confirm where (or whether) they are defined.
compile: $(MODEL_FILENAME) $(FANN_TRAIN)

# Run maca_trans_tagger_mcf2cff in TRAIN mode on the training corpus and
# write the result to the target.
$(CFF_TRAIN): $(MCF_TRAIN)
	maca_trans_tagger_mcf2cff \
	  -C $(MCD_FILENAME) \
	  --input $< \
	  --mode TRAIN \
	  --feat_model $(FEATURES_MODEL_FILENAME) \
	  --vocabs $(VOCABS_FILENAME) \
	  --cff $@ \
	  -s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) \
	  -P $(FORM_POS_FILENAME)

# Same command with the FANN feature model and vocabulary files.
$(CFF_FANN_TRAIN): $(MCF_TRAIN)
	maca_trans_tagger_mcf2cff \
	  -C $(MCD_FILENAME) \
	  --input $< \
	  --mode TRAIN \
	  --feat_model $(FEATURES_MODEL_FANN_FILENAME) \
	  --vocabs $(VOCABS_FANN_FILENAME) \
	  --cff $@ \
	  -s $(NUMBER_OF_SENTENCES) $(STREAM_MODE) \
	  -P $(FORM_POS_FILENAME)

# Apply cff_cutoff with threshold $(CFF_CUTOFF) to the training .cff file.
$(CFF_CUTOFF_TRAIN): $(CFF_TRAIN)
	cff_cutoff \
	  --input $< \
	  --vocabs $(VOCABS_FILENAME) \
	  --cutoff $(CFF_CUTOFF) > $@

# Convert the FANN .cff file with cff2fann.
$(FANN_TRAIN): $(CFF_FANN_TRAIN)
	cff2fann \
	  --vocabs $(VOCABS_FANN_FILENAME) \
	  --cff $< \
	  --feat_model $(FEATURES_MODEL_FANN_FILENAME) \
	  -C $(MCD_FILENAME) > $@

# Train the perceptron model on the cut-off .cff file.
# Alternative kept from the original, training on the full .cff file:
#$(MODEL_FILENAME): $(CFF_TRAIN)
$(MODEL_FILENAME): $(CFF_CUTOFF_TRAIN)
	perceptron_train --cff $< --model $@ -n $(PERCEPTRON_ITERATIONS)
##-----------------------------------------------------------------------
## install
##-----------------------------------------------------------------------
# Copy the feature model, vocabularies, model and form/POS lexicon into
# ../bin. The directory is created first and addressed with a trailing
# slash, so that running this target on its own cannot turn "../bin" into a
# regular file. Each copy stays best-effort ("-"), as before. Declared phony
# because "install" names a command, not a file.
.PHONY: install
install:
	mkdir -p ../bin
	-cp $(FEATURES_MODEL_FILENAME) ../bin/
	-cp $(VOCABS_FILENAME) ../bin/
	-cp $(MODEL_FILENAME) ../bin/
	-cp $(FORM_POS_FILENAME) ../bin/
##-----------------------------------------------------------------------
## clean
##-----------------------------------------------------------------------
# Remove the vocabulary, model and the two training .cff files. Declared
# phony so that a file named "clean" cannot disable the target.
.PHONY: clean
clean:
	-rm -f $(VOCABS_FILENAME) $(MODEL_FILENAME) $(CFF_TRAIN) $(CFF_CUTOFF_TRAIN)
b0U1 b0U1
b0sgn b0sgn
b1sgn b1sgn
b2sgn
b0f b0f
b1f #b1f
b2f #b2f
s0f b0len
s1f bm1f
s0p bm2f
s1p bm1p
s2p bm2p
s0p s1p bm3p
s0p s1p s2p bm2p bm1p
s1p s2p bm2p bm3p
bm1p b0sgn
b0s1
b0s1 b0s2
b0s1 b0s2 b0s3
b0s1 b0s2 b0s3 b0s4
1 FORM VOCAB _ 9 FORM VOCAB _
2 POS VOCAB _ 4 POS VOCAB _
...@@ -173,7 +173,7 @@ while(<REF>){ ...@@ -173,7 +173,7 @@ while(<REF>){
# if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){ # if(($ref_index) && (!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){ if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos))){
# if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos)) && (!is_punctuation_ud($ref_pos))){ # if((!is_punctuation_ftb($ref_pos)) && (!is_punctuation_ptb($ref_pos)) && (!is_punctuation_ud($ref_pos))){
if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";} # if($ref_form ne $hyp_form){die "mismatch line $line_nb\n";}
$word_nb++; $word_nb++;
$pos_nb{$ref_pos}++; $pos_nb{$ref_pos}++;
$fct_nb{$ref_fct}++; $fct_nb{$ref_fct}++;
...@@ -240,7 +240,7 @@ my $uas = $correct_gov_total_nb / $word_nb * 100 ; ...@@ -240,7 +240,7 @@ my $uas = $correct_gov_total_nb / $word_nb * 100 ;
my $seg_recall = $nb_hyp_ref_seg / $nb_ref_seg; my $seg_recall = $nb_hyp_ref_seg / ($nb_ref_seg + 1);
my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1); my $seg_precision = $nb_hyp_ref_seg / ($nb_hyp_seg + 1);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment