inquiry about SRI Toolkit

Tue Jul 16 12:50:25 PDT 2002

Bing,

I hope you found the info you wanted in the man page that Andreas pointed you
to.  If you want an example of how to use the compute-best-mix program, the
following script may be useful.  This script will probably be included in
the toolkit with the next release.  It computes the mixed log probability
and perplexity on a given corpus according to a dynamic mixture of up to 6
language models by jack-knifing, i.e., the mixture coefficients for one
half of the corpus are those estimated using compute-best-mix on the other
half.

cheers.

&

#!/bin/ksh
#
# Computes the "fairly" (word level) interpolated probability of the
# given data set using all of the (up to 6) given language models.  The
# procedure is to estimate lambdas on one half, mix in that proportion
# on the second half, and vice versa.  Usage example:
# compute-mixed-logprob -lm lm1 -lm lm2 ... -text text -sets set1 set2
#
# $Header: $
#
# Defaults, overridable from the command line:
#   LMS   space-separated list of language model files (filled by -lm)
#   TEXT  input text; "-" makes cat read standard input
#   EXPT  prefix for every file this script writes; defaults to the
#         name of the current directory
LMS=""
TEXT="-"
PWD=$(pwd)
# Quoted so that a working directory containing whitespace still yields
# a single basename argument.
EXPT=$(basename "$PWD")

# split_lines [-prefix PREFIX] [N1 [N2]]
#
# Reads lines from standard input and deals them out in a repeating
# pattern: the first N1 lines of every group of N1+N2 go to
# PREFIX.set1, the remaining N2 lines go to PREFIX.set2.
#
#   -prefix PREFIX  output file prefix (default "lines")
#   N1, N2          chunk sizes; each defaults to 1 when omitted
#
# Both output files are always created (and truncated), so a file left
# over from a previous run is never mistaken for fresh output.
# Returns non-zero if N1+N2 is not positive.
function split_lines
{
  # typeset keeps these out of the caller's namespace (ksh and bash).
  typeset prefix n1 n2

  prefix="lines"
  if [ "x$1" = "x-prefix" ]; then
    prefix=$2
    shift; shift
  fi
  n1=${1:-1}
  n2=${2:-1}

  gawk -v f1="$prefix.set1" -v f2="$prefix.set2" -v n1="$n1" -v n2="$n2" '
  BEGIN {
    n = n1 + n2;
    if (n <= 0) {
      # Avoid a modulo-by-zero abort further down.
      print "split_lines: chunk sizes must sum to a positive number" > "/dev/stderr";
      exit 1;
    }
    # Open both outputs up front so that each exists even if it
    # receives no lines.
    printf "" > f1;
    printf "" > f2;
  }
  (NR - 1) % n < n1 {
    print > f1;
    next;
  }
  {
    print > f2;
  }'
}

#----------------------------------------------------------------------
# Main
#

# Report a command-line error on stderr and abort the script.
function usage_error
{
  echo "Incorrect usage.  Refer to man page ppl-scripts(1)." 1>&2
  exit 1
}

# parse_args ARG...
#
# Fills the script's global settings from the command line:
#   -lm FILE          append FILE to LMS (repeatable)
#   -lm?flags FLAGS   extra ngram options, stored in LMFLAGS; the "?"
#                     matches any one character, so both -lm-flags and
#                     -lm_flags are accepted
#   -text FILE        input text, stored in TEXT
#   -expt PREFIX      output prefix, stored in EXPT
#   -sets SET1 SET2   pre-split halves, stored in set1 and set2
# Any other argument, or an option missing its value(s), is a usage
# error.
function parse_args
{
  while [ $# -gt 0 ]; do
    case "$1" in
      -lm)        [ $# -ge 2 ] || usage_error
                  LMS="$LMS $2"; shift; shift;;
      -lm?flags)  [ $# -ge 2 ] || usage_error
                  LMFLAGS="$2"; shift; shift;;
      -text)      [ $# -ge 2 ] || usage_error
                  TEXT=$2; shift; shift;;
      -expt)      [ $# -ge 2 ] || usage_error
                  EXPT=$2; shift; shift;;
      -sets)      [ $# -ge 3 ] || usage_error
                  set1=$2; set2=$3; shift; shift; shift;;
      *)          usage_error;;
    esac
  done
}

parse_args "$@"
# Leave no positional parameters behind, exactly as an in-line parsing
# loop would; later code re-uses them via "set".
shift $#

# All diagnostics from here on are appended to $EXPT.log.  EXPT may
# carry a directory component, so make sure that directory exists
# before opening the log.
LOG="$EXPT.log"
EXPTDIR=$(dirname "$EXPT")
mkdir -p "$EXPTDIR" || exit 1

# Redirect stderr to the log, stamp the run, then trace every command
# (the set -x trace is written to stderr, i.e. into the log).
exec 2>>"$LOG"
echo "The following is the log of $0 starting at $(date)" 1>&2
set -x

# Divide input text into two chunks unless both halves were supplied
# with -sets.  This will produce $EXPT.set1 and $EXPT.set2 by sending
# alternate lines to each file.
#
if [ -z "$set1" ] || [ -z "$set2" ]; then
  # cat is kept (rather than a redirect) so that TEXT="-" reads stdin.
  cat "$TEXT" | split_lines -prefix "$EXPT" 1 1 || {
    echo "Could not split $TEXT" 1>&2
    exit 1
  }
  set1="$EXPT.set1"
  set2="$EXPT.set2"
fi

# Compute logprobs according to each lm on each half.  The -debug 2
# output of ngram is saved as <set>-<basename of lm>.ppl.
# Note: two models with the same basename write to the same .ppl file.
#
# $LMS and $LMFLAGS are deliberately unquoted: both are
# whitespace-separated lists.
for lm in $LMS; do
  lm_name=$(basename "$lm")
  for set in "$set1" "$set2"; do
    ngram $LMFLAGS -debug 2 -lm "$lm" -ppl "$set" >"$set-$lm_name.ppl" || {
      echo "ngram failed for $lm on $set" 1>&2
      exit 1
    }
  done
done

# Compute best mix: for each half, run compute-best-mix over that
# half's .ppl files (one per model, in $LMS order) and save its output
# as <set>-lambdas.
#
for set in "$set1" "$set2"; do
  # Collect the file names as positional parameters so that each one
  # stays a single argument even if the set name contains whitespace.
  set --
  for lm in $LMS; do
    set -- "$@" "$set-$(basename "$lm").ppl"
  done
  compute-best-mix "$@" >"$set-lambdas" || {
    echo "compute-best-mix failed for $set" 1>&2
    exit 1
  }
done
set --

# summarize_ppl [EXPECTED]
#
# Copies ngram -ppl output from stdin to stdout and appends a combined
# "file both:" summary in the same two-line layout.
#
# Only the two summary lines of each file are accumulated: the line
# starting with "file" and the "zeroprobs," line directly after it.
# Any other "zeroprobs," line (for example per-sentence statistics) is
# passed through but not counted.
#
# Perplexities are normalised by the tokens that received a
# probability, i.e. OOVs and zeroprobs are both excluded.  When there
# is nothing to normalise by, "undefined" is printed instead of
# dividing by zero.
#
# If EXPECTED is given and positive, the number of "file" summaries
# seen must equal it; otherwise an error is written to stderr and the
# function returns 1.  This is how a failure of the producer side of
# the pipeline is detected.
function summarize_ppl
{
  gawk -v expected="${1:-0}" '
  {
    print;
  }

  $1 == "file" {
    nfiles ++;
    nsents += $3;
    nwords += $5;
    noovs += $7;
    after_file = 1;
    next;
  }

  $2 == "zeroprobs," {
    if (after_file) {
      nzeroprobs += $1;
      logprob += $4;
    }
    after_file = 0;
    next;
  }

  {
    after_file = 0;
  }

  END {
    if (expected > 0 && nfiles != expected) {
      printf "expected %d ngram -ppl summaries, got %d\n",
             expected, nfiles > "/dev/stderr";
      exit 1;
    }

    ppl1_denom = nwords - noovs - nzeroprobs;
    ppl_denom = ppl1_denom + nsents;

    printf "file both: %d sentences, %d words, %d OOVs\n",
           nsents, nwords, noovs;
    printf "%d zeroprobs, logprob= %g", nzeroprobs, logprob;
    if (ppl_denom > 0) {
      printf " ppl= %g", 10^(-logprob/ppl_denom);
    } else {
      printf " ppl= undefined";
    }
    if (ppl1_denom > 0) {
      printf " ppl1= %g\n", 10^(-logprob/ppl1_denom);
    } else {
      printf " ppl1= undefined\n";
    }
  }'
}

# Interpolate each set, with lambdas from the other set.
#
# The loop runs in a subshell because it feeds a pipe, so "exit" inside
# it cannot stop the script directly.  Instead summarize_ppl is told to
# expect two summaries and fails when one is missing; as the last
# command of the script, its status becomes the script's exit status.
for s1 in "$set1" "$set2"; do

  # s1 supplies the lambdas, s2 is the half that gets scored.
  if [ "$s1" = "$set1" ]; then
    s2=$set2
  else
    s2=$set1
  fi

  if [ ! -s "$s1-lambdas" ]; then
    echo "Could not read $s1-lambdas" 1>&2
    exit 1
  fi

  main_lm=$(echo $LMS | gawk '{print $1}')
  lm_flags="$LMFLAGS -lm $main_lm"
  lambdas=""

  # Keep only the text between the last pair of parentheses, i.e. the
  # list of weights, and load it into the positional parameters.
  set -- $(sed 's/^.*(\(.*\))/\1/' "$s1-lambdas")
  if [ $# -eq 0 ]; then
    echo "No lambdas found in $s1-lambdas" 1>&2
    exit 1
  fi

  # NOTE(review): the first weight is dropped here and the second one
  # is passed as -lambda below.  ngram(1) documents -lambda as the
  # weight of the main (-lm) model, so this may swap the weights of the
  # first two models -- confirm against compute-best-mix output.
  shift
  if [ $# -gt 0 ]; then
    mix_lm=$(echo $LMS | gawk '{print $2}')
    lambdas="-lambda $1"
    lm_flags="$lm_flags -mix-lm $mix_lm"
    shift
  fi

  # Remaining weights map to -mix-lm2 .. -mix-lm5, which are the 3rd to
  # 6th entries of $LMS.
  for i in 2 3 4 5; do
    if [ $# -gt 0 ]; then
      mix_lm=$(echo $LMS | gawk -v i=$i '{print $(i+1)}')
      if [ -z "$mix_lm" ]; then
        echo "No mix lm found for lambda $1" 1>&2
        exit 1
      fi
      lambdas="$lambdas -mix-lambda$i $1"
      lm_flags="$lm_flags -mix-lm$i $mix_lm"
      shift
    fi
  done

  # $lm_flags and $lambdas are option lists and must be word-split.
  ngram $lm_flags $lambdas -ppl "$s2" || exit 1

done | summarize_ppl 2