Anand Venkataraman anand at speech.sri.com
Tue Jul 16 12:50:25 PDT 2002

```Bing,

I hope you found the info you want in the man page that Andreas pointed you
at.  If you want an example of how to use the compute-best-mix program, the
following script may be useful.  This script will probably be included in
the toolkit with the next release.  It computes the mixed log probability
and perplexity on a given corpus according to a dynamic mixture of up to 6
language models by jack-knifing, i.e., the mixture coefficients for one
half of the corpus are those estimated using compute-best-mix on the other
half.

cheers.

&

#!/bin/ksh
#
# Computes the "fairly" (word level) interpolated probability of the
# given data set using all of the (up to 6) given language models.  The
# procedure is to estimate lambdas on one half, mix in this proportion
# on the second half, and vice versa.  Usage example:
# compute-mixed-logprob -lm lm1 -lm lm2 ... -text text -sets set1 set2
#
#
# Defaults for the settings that the command-line options can override.
LMS=""      # space-separated list of language model files (from -lm)
TEXT="-"    # corpus to score; "-" makes cat read standard input
# The experiment prefix defaults to the name of the current directory.
# PWD is not assigned here: the shell maintains that variable itself.
EXPT=$(basename "$(pwd)")

# split_lines [-prefix PREFIX] N1 N2
#
# Deals the lines of standard input into two files in alternating runs:
# the first N1 lines go to PREFIX.set1, the next N2 lines to PREFIX.set2,
# then N1 lines to PREFIX.set1 again, and so on.  PREFIX defaults to
# "lines".  Fails with a message on stderr, without writing anything,
# when N1 + N2 is not positive (the cycle length would be zero).
function split_lines
{
    typeset prefix="lines"
    if [ "x$1" = "x-prefix" ]; then
        prefix=$2
        shift; shift
    fi

    gawk -v f1="$prefix.set1" -v f2="$prefix.set2" -v n1="$1" -v n2="$2" '
    BEGIN {
        n = n1 + n2
        if (n <= 0) {
            print "split_lines: N1 + N2 must be positive" > "/dev/stderr"
            exit 1
        }
    }
    # Position of the line within the current cycle of n lines decides
    # which file receives it.
    (NR - 1) % n < n1 { print > f1; next }
    { print > f2 }'
}

#----------------------------------------------------------------------
# Main
#

# parse_args ARG...
#
# Parses the script's options into the globals LMS, LMFLAGS, TEXT, EXPT,
# set1 and set2:
#   -lm FILE          append FILE to the list of language models
#   -lm?flags FLAGS   ("?" is any single character, e.g. -lm-flags)
#                     store FLAGS in LMFLAGS
#   -text FILE        set TEXT
#   -expt PREFIX      set EXPT
#   -sets SET1 SET2   set set1 and set2
# An unknown option, or an option without its value(s), prints a usage
# hint on stderr and exits the script with status 1.
function parse_args
{
    typeset usage="Incorrect usage.  Refer to man page ppl-scripts(1)."
    typeset needed

    while [ $# -gt 0 ]; do
        # Every option consumes one value, except -sets which takes two.
        case $1 in
        -sets)  needed=3;;
        *)      needed=2;;
        esac
        if [ $# -lt "$needed" ]; then
            echo "$usage" 1>&2
            exit 1
        fi

        case $1 in
        -lm)        LMS="$LMS $2"; shift 2;;
        -lm?flags)  LMFLAGS="$2"; shift 2;;
        -text)      TEXT=$2; shift 2;;
        -expt)      EXPT=$2; shift 2;;
        -sets)      set1=$2; set2=$3; shift 3;;
        *)          echo "$usage" 1>&2; exit 1;;
        esac
    done
}

parse_args "$@"

# From here on, everything written to stderr -- including the "set -x"
# command trace -- is appended to $EXPT.log.  The directory part of the
# experiment prefix is created first so that the log can be opened.
LOG=$EXPT.log
EXPTDIR=$(dirname "$EXPT")
mkdir -p "$EXPTDIR" || exit 1

exec 2>>"$LOG"
echo "The following is the log of $0 starting at $(date)" 1>&2
set -x

# Divide input text into two chunks.  This will produce
# $EXPT.set1 and $EXPT.set2
#
# The split is skipped only when both set1 and set2 are already set
# (the -sets option).  Lines are dealt alternately (1 and 1) to the
# two halves.
if [ -z "$set1" ] || [ -z "$set2" ]; then
    # cat is kept on purpose: TEXT may be "-", which cat reads as stdin.
    cat "$TEXT" | split_lines -prefix "$EXPT" 1 1
    set1=$EXPT.set1
    set2=$EXPT.set2
fi

# Compute logprobs according to each lm on each half.
#
# The "ngram -debug 2 -ppl" output for every (LM, half) pair is stored in
# <half>-<basename of LM>.ppl.  $LMS and $LMFLAGS are left unquoted on
# purpose: both are space-separated lists that must be split into words.
for lm in $LMS; do
    for set in "$set1" "$set2"; do
        ngram $LMFLAGS -debug 2 -lm "$lm" -ppl "$set" \
            >"$set-$(basename "$lm").ppl"
    done
done

# Compute best mix
#
# For each half, hand that half's .ppl files (one per LM, in $LMS order)
# to compute-best-mix and keep its output in <half>-lambdas.
for set in "$set1" "$set2"; do
    # Collect the file names in the positional parameters, so that a half
    # whose name contains blanks still yields one argument per file.
    set --
    for lm in $LMS; do
        set -- "$@" "$set-$(basename "$lm").ppl"
    done
    compute-best-mix "$@" >"$set-lambdas"
done

# Interpolate each set, with lambdas from the other set.
#

# nth_word N WORD...
#
# Prints the N-th WORD (counting from 1); prints nothing when fewer than
# N words were given.
function nth_word
{
    typeset n=$1
    shift
    if [ "$n" -le $# ]; then
        shift $((n - 1))
        printf '%s\n' "$1"
    fi
}

# interpolate_pair LAMBDA_SET EVAL_SET
#
# Runs "ngram -ppl" on EVAL_SET with the models listed in $LMS, mixed with
# the weights read from the file LAMBDA_SET-lambdas.  Returns 1, with a
# message on stderr, when that file is missing or empty, or when it holds
# a weight for which $LMS has no model.
function interpolate_pair
{
    typeset lambda_file="$1-lambdas" eval_set=$2
    typeset lm_flags lambdas="" mix_lm i

    if [ ! -s "$lambda_file" ]; then
        echo "Could not read $lambda_file" 1>&2
        return 1
    fi

    lm_flags="$LMFLAGS -lm $(nth_word 1 $LMS)"

    # Keep only the text between the parentheses of the lambdas file and
    # split it into one positional parameter per weight.
    set -- $(sed 's/^.*(\(.*\))/\1/' "$lambda_file")

    # The first weight is discarded and the second one is passed as
    # -lambda.
    # NOTE(review): the ngram man page describes -lambda as the weight of
    # the main (-lm) model, which would call for the FIRST weight here;
    # as written, the weights of the first two models look swapped.
    # Behaviour is left unchanged -- confirm against compute-best-mix
    # output before altering it.
    if [ $# -gt 0 ]; then
        shift
    fi
    if [ $# -gt 0 ]; then
        mix_lm=$(nth_word 2 $LMS)
        if [ -z "$mix_lm" ]; then
            echo "No mix lm found for lambda $1" 1>&2
            return 1
        fi
        lambdas="-lambda $1"
        lm_flags="$lm_flags -mix-lm $mix_lm"
        shift
    fi

    # Remaining weights pair up with the 3rd to 6th models through the
    # numbered -mix-lm<i> / -mix-lambda<i> options.
    for i in 2 3 4 5; do
        [ $# -gt 0 ] || break
        mix_lm=$(nth_word $((i + 1)) $LMS)
        if [ -z "$mix_lm" ]; then
            echo "No mix lm found for lambda $1" 1>&2
            return 1
        fi
        lambdas="$lambdas -mix-lambda$i $1"
        lm_flags="$lm_flags -mix-lm$i $mix_lm"
        shift
    done

    # $lm_flags and $lambdas are option lists and must be word-split.
    ngram $lm_flags $lambdas -ppl "$eval_set"
    # An ngram failure is not fatal: the other half is still scored.
    return 0
}

# summarize_ppl
#
# Copies its input (the output of the ngram -ppl runs) to stdout
# unchanged, then appends totals over all runs in the same two-line
# layout: sentence/word/OOV counts from the "file ..." lines, zeroprobs
# and logprob from the "N zeroprobs, ..." lines, and the perplexities
# recomputed from those totals.  A perplexity whose denominator is not
# positive is printed as "undefined" instead of dividing by zero.
function summarize_ppl
{
    gawk '
    { print }

    $1 ~ /^file$/ {
        nsents += $3
        nwords += $5
        noovs += $7
        next
    }

    $2 ~ /^zeroprobs,$/ {
        nzeroprobs += $1
        logprob += $4
        next
    }

    END {
        printf "file both: %d sentences, %d words, %d OOVs\n",
            nsents, nwords, noovs
        printf "%d zeroprobs, logprob= %g", nzeroprobs, logprob

        # NOTE(review): zeroprobs are not subtracted from either
        # denominator; check whether that matches how ngram itself
        # computes ppl.
        denom = nsents + nwords - noovs
        if (denom > 0) {
            printf " ppl= %g", 10 ^ (-logprob / denom)
        } else {
            printf " ppl= undefined"
        }

        denom = nwords - noovs
        if (denom > 0) {
            printf " ppl1= %g", 10 ^ (-logprob / denom)
        } else {
            printf " ppl1= undefined"
        }
        printf "\n"
    }'
}

# Score set2 with the lambdas estimated on set1, then the reverse.  The
# second run is skipped when the first cannot assemble its ngram options.
{
    interpolate_pair "$set1" "$set2" &&
    interpolate_pair "$set2" "$set1"
} | summarize_ppl

```