#!/bin/bash
# Usage: <this script> <input.xml> <output.dir>
#
# Both arguments are resolved to absolute paths up front because the script
# then changes into its own directory: path.sh, base/, utils/, phonetize/ and
# the ./tmp scratch directory are all addressed relative to that directory.
if [ $# != 2 ]
then
  echo "usage: $0 <input.xml> <output.dir>" >&2
  exit 1
fi

# Reject an unusable input right away. Strict mode is not active yet at this
# point, so a failing readlink would otherwise just leave $input empty.
if [ ! -f "$1" ] || [ ! -r "$1" ]
then
  echo "error: cannot read input file: $1" >&2
  exit 1
fi

# These commands run before `set -e`, so their failures are checked by hand.
mkdir -p -- "$2" || exit 1
input=$(readlink -f -- "$1")
output=$(readlink -f -- "$2")
script_path=$(readlink -f -- "$0")
dirname=$(dirname -- "$script_path")
cd -- "$dirname" || exit 1

# exit on error with a nice error line
# The trap prints the script name, the failing line number, the exit status
# and the text of that line, read back from the resolved script file.
trap 'ret=$?; printf "FAILED: %s:%s (ret=%s): %s\n" "$0" "$LINENO" "$ret" "$(sed -n "${LINENO}p" "$script_path")" >&2' ERR
set -e -u -o pipefail

tmp=./tmp

# import kaldi environment
source path.sh

mkdir -p "$tmp"
# create corpus

#######################################
# Turn the XML input into one normalized sentence per line.
# Stages, in order: replace every <...> tag by a space; squeeze runs of
# spaces into one and trim one leading/trailing space; delete carriage
# returns; drop blank lines; wrap each line in "<s> ... </s>"; delete
# hyphens; lowercase everything.
# Arguments: $1 - path of the XML file to read
# Outputs:   the corpus lines on stdout
#######################################
build_corpus() {
  # The squeeze pattern must require at least one space: a bare ' *' also
  # matches the empty string and would put a space between every character.
  # The bracket form keeps the two spaces from being merged by accident.
  sed 's/<[^>]*>/ /g' "$1" \
    | sed 's/[ ][ ]*/ /g;s/^ //;s/ $//;' \
    | sed 's/\r//g' \
    | grep -P -v "^\s*$" \
    | awk '{print "<s> "$0" </s>"}' \
    | sed 's/-//g' \
    | perl -npe '$_=lc'
}

build_corpus "$input" > "$tmp/corpus.txt"

# Append the value of every trigger="..." attribute of the input, as is.
# grep exits non-zero when nothing matches; `|| true` keeps that from
# aborting the script.
grep -o 'trigger="[^"]*"' "$input" | cut -f2 -d'"' >> "$tmp/corpus.txt" || true
# create word list
# words.txt maps each symbol to an integer id: <eps> is 0, the sorted unique
# corpus tokens are numbered from 1, then !sil and #0 take the next two ids.
(
  echo "<eps> 0"
  tr " " "\n" < "$tmp/corpus.txt" \
    | sort -u \
    | grep -v "^\s*$" \
    | awk '{print $0, NR}END{print "!sil", NR + 1;print "#0", NR + 2}'
) > "$tmp/words.txt"

# get basic lexicon
# Uppercase everything from the first space onwards in each cmudict line,
# then append the extra lexicon file unchanged.
perl -npe 's/ .*/uc($&)/e' < base/cmudict.txt > "$tmp/base-lexicon.txt"
cat base/lexicon-rocio.txt >> "$tmp/base-lexicon.txt"
#$lang/L.fst

# get specific lexicon and words without phonetizations
# words-noids.txt: first column of words.txt without entries containing '<'
# and without !sil.
cut -f1 -d " " "$tmp/words.txt" | grep -v '<' | grep -v "!sil" > "$tmp/words-noids.txt"

# Keep only the base-lexicon lines whose first field is in the word list.
# The word list is passed as an argument rather than spliced into the Perl
# source, so the path never goes through a second round of shell parsing.
perl -e '
  open(my $fh, "<", shift @ARGV) or die "cannot open word list: $!\n";
  my %words = map { chomp(); $_ => 1 } <$fh>;
  close($fh);
  while (<>) { my @a = split(); exists $words{$a[0]} and print }
' "$tmp/words-noids.txt" "$tmp/base-lexicon.txt" > "$tmp/lexicon.txt"

cp "$tmp/lexicon.txt" "$output/words.phon"
echo '!sil SIL' >> "$tmp/lexicon.txt"
echo

# `uniq -u` keeps lines that occur exactly once in the merged list, i.e.
# words present in the word list but absent from the lexicon. The greps may
# legitimately output nothing, hence `|| true`.
(cut -f1 -d" " "$tmp/lexicon.txt"; cat "$tmp/words-noids.txt") \
  | sort \
  | uniq -u \
  | grep -v "^#" \
  | grep -v "!sil" > "$tmp/to-phonetize.txt" || true
# phonetize words
# The list of words without a lexicon entry is published as words.autophon;
# when it is non-empty, the words are run through the phonetizer and the
# result (uppercased from the first space onwards) is appended to the lexicon.
cp "$tmp/to-phonetize.txt" "$output/words.autophon"
NUM_TO_PHONETIZE=$(wc -l < "$tmp/to-phonetize.txt")
# Numeric comparison, so padding around the wc output cannot matter.
if (( NUM_TO_PHONETIZE != 0 ))
then
  phonetize/phonetize-cantab.sh < "$tmp/to-phonetize.txt" \
    | perl -npe 's/( .*)/uc($1)/e;' >> "$tmp/lexicon.txt"
fi
# add BIO-like labels to phonemes
# A single-phone entry gets the suffix _S; otherwise the first phone gets _B,
# the last one _E and every phone in between _I. Entries without any phone
# make perl die.
perl -ane '@A=split(" ",$_); $w = shift @A; @A>0||die ("failed at ".$_);
  if(@A==1) {
    print "$w $A[0]_S\n";
  } else {
    print "$w $A[0]_B ";
    for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; }
    print "$A[$n]_E\n";
  }
' < "${tmp}/lexicon.txt" > "${tmp}/lexicon1.txt"

# make lexicon fst L_disambig.fst
# add_lex_disambig.pl writes lexicon_disambig.txt and prints a number on
# stdout; that number plus one is used as the suffix of the '#N' symbol
# handed to make_lexicon_fst.pl.
ndisambig=$(utils/add_lex_disambig.pl "${tmp}/lexicon1.txt" "${tmp}/lexicon_disambig.txt")
ndisambig=$((${ndisambig}+1))

# Integer ids of the '#0' symbol in the phone table and in the word table.
phone_disambig_symbol=$(grep '#0' base/phones.txt | awk '{print $2}')
word_disambig_symbol=$(grep '#0' "${tmp}/words.txt" | awk '{print $2}')

utils/make_lexicon_fst.pl "${tmp}/lexicon_disambig.txt" 0.5 SIL "#${ndisambig}" \
  | fstcompile --isymbols=base/phones.txt --osymbols="${tmp}/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstaddselfloops "echo ${phone_disambig_symbol} |" "echo ${word_disambig_symbol} |" \
  | fstarcsort --sort_type=olabel > "${tmp}/L_disambig.fst"
# make language model G.fst
# $order is passed to estimate-ngram as -o. It is not assigned in the visible
# part of this script (presumably path.sh or the environment provides it --
# TODO confirm), so stop with an explicit message when it is missing instead
# of relying on a bare "unbound variable" error.
: "${order:?variable 'order' (value for estimate-ngram -o) must be set}"
estimate-ngram -t "$tmp/corpus.txt" -o "$order" -wl "$tmp/lm.arpa"
#cp ../../tools/modele_lex_lm_rocio.new/modele_homeostasis_25nov.n3.arpa $tmp/lm.arpa

# Drop every ARPA line containing '<s> <s>', '</s> <s>' or '</s> </s>', then
# convert the rest to a text FST, post-process it with the utils/ filters and
# compile it against words.txt for both input and output symbols.
grep -v -e '<s> <s>' -e '</s> <s>' -e '</s> </s>' "$tmp/lm.arpa" \
  | arpa2fst - \
  | fstprint \
  | utils/remove_oovs.pl /dev/null \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | fstcompile --isymbols="$tmp/words.txt" --osymbols="$tmp/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon > "$tmp/G.fst"
# Complete the scratch directory: it receives a copy of phones.txt, an L.fst
# link pointing at L_disambig.fst and a phones link pointing at ../base/phones.
cp base/phones.txt "${tmp}/"
(
  cd "${tmp}" && {
    ln --relative --symbolic --force L_disambig.fst L.fst
    ln --relative --symbolic --force ../base/phones .
  }
)

# make whole search space from HMM targets to words, HCLG.fst
utils/mkgraph.sh "${tmp}/" base/tri3b/ base/tri3b/graph
# Publish the results. $output, $input and $dirname are absolute paths that
# derive from user-supplied arguments, so they are quoted to survive spaces
# and glob characters.
cp base/tri3b/graph/HCLG.fst "$output/"
cp "$tmp/words.txt" "$output/"

# add acoustic model
# Only the variable part is quoted: the brace list and the adaptation* glob
# must stay unquoted so that the shell still expands them.
(cd "$output" && ln --relative -sf "$dirname"/base/acoustic-model/{conf,ivector_extractor,final.mdl,adaptation*} .)

cp "$input" "$output/"

# TODO: generate new config

# ${tmp:?} aborts instead of expanding to nothing if tmp is empty or unset.
rm -rf -- "${tmp:?}"