#!/bin/bash
#
# create-asr-models.sh
#
# Generates the word list, the pronunciation lists and the HCLG.fst decoding
# graph for the text of <input.xml> and places them, together with links to
# the acoustic model and a copy of the input, in <output.dir>.
#
# usage: create-asr-models.sh <input.xml> <output.dir>

if [[ $# -ne 2 ]]; then
    echo "usage: $0 <input.xml> <output.dir>" >&2
    exit 1
fi

# Strict mode is only switched on further down, so every step here is checked
# explicitly. Both paths are made absolute because the script changes
# directory right afterwards.
mkdir -p -- "$2" || { echo "error: cannot create output directory: $2" >&2; exit 1; }
input=$(readlink -f -- "$1") || { echo "error: cannot resolve input path: $1" >&2; exit 1; }
output=$(readlink -f -- "$2") || { echo "error: cannot resolve output path: $2" >&2; exit 1; }

# Work from the directory holding this script: the remaining steps refer to
# path.sh, base/, utils/ and phonetize/ through relative paths.
dirname=$(dirname -- "$(readlink -f -- "$0")") || { echo "error: cannot locate script directory" >&2; exit 1; }
cd -- "$dirname" || { echo "error: cannot change to $dirname" >&2; exit 1; }

# exit on error with a nice error line
#
# On a failing command the trap reports, on stderr, the script name, the line
# number, the exit status and the text of that source line. The script has
# already changed into its own directory, so its basename is enough to find
# the file. Everything is quoted so that the echoed source line is neither
# word-split nor glob-expanded.
trap 'echo "FAILED: $0:${LINENO} (ret=$?): $(sed -n "${LINENO}p" "$(basename -- "$0")")" >&2' ERR
set -e -u -o pipefail

# Scratch directory for intermediate files (relative to the current, i.e.
# script, directory).
tmp=./tmp

# import kaldi environment
source path.sh
mkdir -p -- "$tmp"

# create corpus
#
# Turn the input XML into one normalised sentence per line:
#   1. drop carriage returns (done first, so that CRLF input does not leave a
#      trailing space behind after trimming)
#   2. replace every <...> tag by a space
#   3. squeeze runs of spaces and trim both ends
#   4. drop blank lines
#   5. wrap each line in "<s> ... </s>"
#   6. remove hyphens and lowercase everything
sed -e 's/\r//g' \
    -e 's/<[^>]*>/ /g' \
    -e 's/  */ /g;s/^ //;s/ $//' \
    < "$input" \
    | grep -P -v '^\s*$' \
    | awk '{print "<s> "$0" </s>"}' \
    | sed 's/-//g' \
    | perl -pe '$_=lc' \
    > "$tmp/corpus.txt"

# Append the value of every trigger="..." attribute as an extra corpus line.
# These are added as-is (no <s> wrapping, no lowercasing).
# grep exits non-zero when the input has no trigger attribute; that is not an
# error here, hence "|| true" (the builtin, not a hard-coded /bin/true path).
grep -o 'trigger="[^"]*"' "$input" | cut -f2 -d'"' >> "$tmp/corpus.txt" || true

# create word list
#
# words.txt pairs every symbol with an integer id: "<eps>" is 0, the sorted
# unique space-separated tokens of the corpus are numbered from 1, and "!sil"
# and "#0" receive the two ids after the last token.
{
    printf '%s\n' '<eps> 0'
    tr ' ' '\n' < "$tmp/corpus.txt" \
        | sort -u \
        | grep -v '^\s*$' \
        | awk '
            { print $0, NR }
            END {
                print "!sil", NR + 1
                print "#0", NR + 2
            }'
} > "$tmp/words.txt"

# get basic lexicon
#
# Each line of cmudict.txt is copied with everything from its first space
# onward upper-cased; the lines of lexicon-rocio.txt follow unchanged.
{
    perl -pe 's/ .*/uc($&)/e' < base/cmudict.txt
    cat base/lexicon-rocio.txt
} > "$tmp/base-lexicon.txt"

#$lang/L.fst 

# get specific lexicon and words without phonetizations
#
# words-noids.txt: the first column of words.txt, minus every entry containing
# "<" (such as <eps>, <s>, </s>) or "!sil".
cut -f1 -d ' ' "$tmp/words.txt" | grep -v '<' | grep -v '!sil' > "$tmp/words-noids.txt"

# lexicon.txt: the base-lexicon lines whose first field is one of those words.
# The word list is handed to perl as a regular argument and opened there, so a
# missing or unreadable list aborts instead of silently selecting nothing.
perl -e '
    my $wordlist = shift @ARGV;
    open(my $fh, "<", $wordlist) or die "cannot open $wordlist: $!\n";
    my %words = map { chomp($_); ($_ => 1) } <$fh>;
    close($fh);
    while (<>) {
        my @a = split;
        print if @a && exists $words{$a[0]};
    }
' "$tmp/words-noids.txt" "$tmp/base-lexicon.txt" > "$tmp/lexicon.txt"

# Publish the dictionary-backed pronunciations before the silence entry is
# added to the working lexicon.
cp -- "$tmp/lexicon.txt" "$output/words.phon"
printf '%s\n' '!sil SIL' >> "$tmp/lexicon.txt"

# to-phonetize.txt: entries occurring exactly once across the lexicon's first
# column and the word list, i.e. words that got no lexicon line. Lines starting
# with "#" and lines containing "!sil" are left out. The pipeline ends non-zero
# when nothing remains, which is not an error, hence "|| true".
{ cut -f1 -d ' ' "$tmp/lexicon.txt"; cat "$tmp/words-noids.txt"; } \
    | sort \
    | uniq -u \
    | grep -v '^#' \
    | grep -v '!sil' \
    > "$tmp/to-phonetize.txt" || true

# phonetize words
#
# Publish the list of words lacking a lexicon entry, then, only when that list
# is non-empty, feed it to phonetize-cantab.sh and append the result to the
# lexicon with everything from the first space onward upper-cased.
cp -- "$tmp/to-phonetize.txt" "$output/words.autophon"
NUM_TO_PHONETIZE=$(wc -l < "$tmp/to-phonetize.txt")
if (( NUM_TO_PHONETIZE != 0 )); then
    phonetize/phonetize-cantab.sh < "$tmp/to-phonetize.txt" \
        | perl -pe 's/( .*)/uc($1)/e;' \
        >> "$tmp/lexicon.txt"
fi

# add BIO-like labels to phonemes
#
# Every phoneme receives a position suffix: _S when it is the only phoneme of
# the word, otherwise _B for the first, _I for each inner one and _E for the
# last. A line without any phoneme aborts with "failed at <line>".
perl -ane '
    my ($word, @phones) = @F;
    @phones > 0 || die("failed at " . $_);
    if (@phones == 1) {
        print "$word $phones[0]_S\n";
    } else {
        my @inner = map { $_ . "_I" } @phones[1 .. $#phones - 1];
        print join(" ", $word, $phones[0] . "_B", @inner, $phones[-1] . "_E"), "\n";
    }
' < "$tmp/lexicon.txt" > "$tmp/lexicon1.txt"

# make lexicon fst L_disambig.fst
#
# add_lex_disambig.pl writes lexicon_disambig.txt; its stdout is captured as a
# number and incremented by one (presumably the count of disambiguation
# symbols plus one extra -- confirm against the Kaldi utility).
ndisambig=$(utils/add_lex_disambig.pl "$tmp/lexicon1.txt" "$tmp/lexicon_disambig.txt")
ndisambig=$((ndisambig + 1))

# Integer ids of the "#0" symbol in the phone table and in the word table.
# The first column is compared exactly so that an entry merely containing "#0"
# cannot be picked up as well.
phone_disambig_symbol=$(awk '$1 == "#0" { print $2 }' base/phones.txt)
word_disambig_symbol=$(awk '$1 == "#0" { print $2 }' "$tmp/words.txt")
if [[ -z "$phone_disambig_symbol" || -z "$word_disambig_symbol" ]]; then
    echo "error: symbol #0 not found in base/phones.txt or $tmp/words.txt" >&2
    exit 1
fi

# Build the lexicon FST text, compile it against the phone/word tables, add
# self-loops for the two "#0" ids and sort arcs by output label.
utils/make_lexicon_fst.pl "$tmp/lexicon_disambig.txt" 0.5 SIL "#$ndisambig" \
    | fstcompile --isymbols=base/phones.txt --osymbols="$tmp/words.txt" \
        --keep_isymbols=false --keep_osymbols=false \
    | fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" \
    | fstarcsort --sort_type=olabel \
    > "$tmp/L_disambig.fst"

# make language model G.fst
#
# Estimate an n-gram model of the given order on the corpus and write it as
# lm.arpa. ARPA lines containing "<s> <s>", "</s> <s>" or "</s> </s>" are then
# dropped and the remainder is converted and compiled into G.fst, using
# words.txt for both input and output symbols.
order=3
estimate-ngram -t "$tmp/corpus.txt" -o "$order" -wl "$tmp/lm.arpa"
#cp ../../tools/modele_lex_lm_rocio.new/modele_homeostasis_25nov.n3.arpa $tmp/lm.arpa
grep -v -e '<s> <s>' -e '</s> <s>' -e '</s> </s>' "$tmp/lm.arpa" \
    | arpa2fst - \
    | fstprint \
    | utils/remove_oovs.pl /dev/null \
    | utils/eps2disambig.pl \
    | utils/s2eps.pl \
    | fstcompile --isymbols="$tmp/words.txt" --osymbols="$tmp/words.txt" \
        --keep_isymbols=false --keep_osymbols=false \
    | fstrmepsilon \
    > "$tmp/G.fst"

# Complete the scratch directory before it is handed to mkgraph.sh: a copy of
# the phone table, L.fst as a link to L_disambig.fst, and a link to
# base/phones.
cp -- base/phones.txt "$tmp/"
(
    cd -- "$tmp" \
        && ln --relative -sf L_disambig.fst L.fst \
        && ln --relative -sf ../base/phones .
)

# make whole search space from HMM targets to words, HCLG.fst
utils/mkgraph.sh "$tmp/" base/tri3b/ base/tri3b/graph

# Deliver the decoding graph and the word table to the output directory.
cp -- base/tri3b/graph/HCLG.fst "$output/"
cp -- "$tmp/words.txt" "$output/"

# add acoustic model
#
# Link the acoustic-model files into the output directory. Only the variable
# part of the path is quoted: the brace list and the adaptation* glob must
# stay unquoted to be expanded by the shell.
(
    cd -- "$output" \
        && ln --relative -sf "$dirname"/base/acoustic-model/{conf,ivector_extractor,final.mdl,adaptation*} .
)

# Keep a copy of the input next to the generated models.
cp -- "$input" "$output"

# TODO: generate new config

# Remove the scratch directory; ":?" aborts rather than expanding to an empty
# path should tmp ever be unset or empty.
rm -rf -- "${tmp:?}"