#!/bin/bash
#
# Build a decoding graph and lexicon from an XML input file.
#
# Usage: <this script> <input.xml> <output.dir>
#
# Steps (all intermediate files live in ./tmp next to this script):
#   1. Strip the XML markup from the input, lowercase it and wrap each line
#      in <s> ... </s>; append the values of all trigger="..." attributes.
#   2. Build the word symbol table (words.txt).
#   3. Look the words up in base/cmudict.txt + base/lexicon-rocio.txt and run
#      phonetize/phonetize-cantab.sh on the words that were not found.
#   4. Tag phonemes with _B/_I/_E/_S suffixes and compile L_disambig.fst.
#   5. Estimate a bigram language model and compile G.fst.
#   6. Run utils/mkgraph.sh and copy the results to the output directory.
#
# Files written to <output.dir>:
#   words.phon       lexicon entries found in the base lexicon
#   words.autophon   words handed to the automatic phonetizer
#   HCLG.fst         graph produced by utils/mkgraph.sh
#   words.txt        word symbol table
#   conf, ivector_extractor, final.mdl   symlinks into base/acoustic-model
#   a copy of the input file
#
# Requires GNU readlink (-f) and GNU grep (-P). The FST / language-model
# commands (fstcompile, fstaddselfloops, fstarcsort, fstprint, fstrmepsilon,
# arpa2fst, estimate-ngram) must be reachable after sourcing path.sh.

if [[ $# -ne 2 ]]; then
  echo "usage: $0 <input.xml> <output.dir>" >&2
  exit 1
fi

# Resolve the script location before any cd: the ERR trap below reads the
# script file, and a relative $0 would no longer resolve after the cd.
script_path=$(readlink -f "$0")
dirname=$(dirname "$script_path")

# exit on error with a nice error line
trap 'ret=$?; echo "FAILED: $0:${LINENO} (ret=${ret}): $(sed -n "${LINENO}p" "$script_path")" >&2' ERR
set -e -u -o pipefail

# Create the output directory first so that readlink -f can resolve it.
mkdir -p "$2"
input=$(readlink -f "$1")
output=$(readlink -f "$2")

cd "$dirname"

# tmp must stay a direct child of the script directory: the phones symlink
# created below points at ../base/phones relative to it.
tmp=./tmp
dir=../
export LD_LIBRARY_PATH=$dir/libs:$dir/libs/fst:$dir/libs/atlas

# import kaldi environment
source path.sh

mkdir -p "$tmp"

# create corpus
# The space-squeezing expression must require at least one space; a bare
# "s/ */ /g" also matches the empty string and would insert a space between
# every character.
sed 's/<[^>]*>/ /g' "$input" \
  | sed 's/[ ][ ]*/ /g;s/^ //;s/ $//;' \
  | sed 's/\r//g' \
  | grep -P -v "^\s*$" \
  | awk '{print "<s> "$0" </s>"}' \
  | perl -npe '$_=lc' > "$tmp/corpus.txt"
# An input without any trigger attribute is fine, hence the "|| true".
grep -o 'trigger="[^"]*"' "$input" | cut -f2 -d'"' >> "$tmp/corpus.txt" || true

# create word list
# <eps> gets id 0, corpus words get 1..N, then !sil and #0 are appended.
(
  echo -e "<eps> 0"
  tr " " "\n" < "$tmp/corpus.txt" \
    | sort -u \
    | grep -v "^\s*$" \
    | awk '{print $0, NR}END{print "!sil", NR + 1;print "#0", NR + 2}'
) > "$tmp/words.txt"

# get basic lexicon
# Upper-case everything after the first space (the pronunciation part).
perl -npe 's/ .*/uc($&)/e' < base/cmudict.txt > "$tmp/base-lexicon.txt"
cat base/lexicon-rocio.txt >> "$tmp/base-lexicon.txt"

# get specific lexicon and words without phonetizations
cut -f1 -d " " "$tmp/words.txt" | grep -v '<' | grep -v '!sil' > "$tmp/words-noids.txt"
# Keep only the base-lexicon entries whose first field is one of our words.
perl -e '%words=map{chomp();$_=>1}`cat '"$tmp"'/words-noids.txt`;while(<>){@a=split();exists $words{$a[0]} and print}' \
  "$tmp/base-lexicon.txt" > "$tmp/lexicon.txt"
cp "$tmp/lexicon.txt" "$output/words.phon"
echo -e "!sil SIL" >> "$tmp/lexicon.txt"
echo
# Words that occur exactly once across (lexicon words + wanted words) are
# the ones that did not get a pronunciation from the base lexicon.
(
  cut -f1 -d" " "$tmp/lexicon.txt"
  cat "$tmp/words-noids.txt"
) | sort | uniq -u | grep -v "^#" | grep -v '!sil' > "$tmp/to-phonetize.txt" || true

# phonetize words
cp "$tmp/to-phonetize.txt" "$output/words.autophon"
if [[ -s "$tmp/to-phonetize.txt" ]]; then
  phonetize/phonetize-cantab.sh < "$tmp/to-phonetize.txt" \
    | perl -npe 's/( .*)/uc($1)/e;' >> "$tmp/lexicon.txt"
fi

# add BIO-like labels to phonemes
# A single-phoneme word gets _S; otherwise the first phoneme gets _B, the
# last one _E and every phoneme in between _I.
perl -ane '
  @A = split(" ", $_);
  $w = shift @A;
  @A > 0 || die("failed at " . $_);
  if (@A == 1) {
    print "$w $A[0]_S\n";
  } else {
    print "$w $A[0]_B ";
    for ($n = 1; $n < @A - 1; $n++) {
      print "$A[$n]_I ";
    }
    print "$A[$n]_E\n";
  }
' < "$tmp/lexicon.txt" > "$tmp/lexicon1.txt"

# make lexicon fst L_disambig.fst
ndisambig=$(utils/add_lex_disambig.pl "$tmp/lexicon1.txt" "$tmp/lexicon_disambig.txt")
# NOTE(review): the extra symbol is presumably the one used for the silence
# word, as in Kaldi's prepare_lang.sh -- confirm.
ndisambig=$((ndisambig + 1))
phone_disambig_symbol=$(grep '#0' base/phones.txt | awk '{print $2}')
word_disambig_symbol=$(grep '#0' "$tmp/words.txt" | awk '{print $2}')
utils/make_lexicon_fst.pl "$tmp/lexicon_disambig.txt" 0.5 SIL "#$ndisambig" \
  | fstcompile --isymbols=base/phones.txt --osymbols="$tmp/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" \
  | fstarcsort --sort_type=olabel > "$tmp/L_disambig.fst"

# make language model G.fst
order=2
estimate-ngram -t "$tmp/corpus.txt" -o "$order" -wl "$tmp/lm.arpa"
# Drop the n-grams that pair sentence boundary markers with each other
# before converting the ARPA model to an FST.
grep -v '<s> <s>' "$tmp/lm.arpa" \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | arpa2fst - \
  | fstprint \
  | utils/remove_oovs.pl /dev/null \
  | utils/eps2disambig.pl \
  | utils/s2eps.pl \
  | fstcompile --isymbols="$tmp/words.txt" --osymbols="$tmp/words.txt" \
      --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon > "$tmp/G.fst"

# Lay out $tmp the way utils/mkgraph.sh is invoked on it below: phones.txt,
# L.fst (alias of L_disambig.fst) and the phones directory.
cp base/phones.txt "$tmp/"
(cd "$tmp" && ln -sf L_disambig.fst L.fst)
(cd "$tmp" && ln -sf ../base/phones .)

# make whole search space from HMM targets to words, HCLG.fst
utils/mkgraph.sh "$tmp/" base/tri3b/ base/tri3b/graph

cp base/tri3b/graph/HCLG.fst "$output/"
cp "$tmp/words.txt" "$output/"

# add acoustic model
(cd "$output" && ln -sf "$dirname"/base/acoustic-model/{conf,ivector_extractor,final.mdl} .)

cp "$input" "$output"

# TODO: generate new config

# Only reached on success; after a failure ./tmp is left in place.
rm -rf -- "${tmp:?}"