[SRILM User List] arpa header number of 4g too big for int

Andreas Stolcke stolcke at icsi.berkeley.edu
Thu Sep 19 13:27:35 PDT 2013


The attached patch should fix it.  Note that this still doesn't support
vocabularies larger than 2^32, but the number of higher-order n-grams can
now be as large as 2^64.

Thanks for reporting this problem!

Andreas


On 9/19/2013 4:13 AM, Juan Pino wrote:
> Hello,
>
> I am running this command with version 1.7.0 (the purpose is to fix 
> the format of my input lm):
>
> srilm1.7.0/bin/i686-m64/ngram -debug 1 -order 4 -lm 
> MY_LM_IN_ARPA_FORMAT -write-lm MY_OUTPUT_LM
>
> I get this error:
>
> line 6: ngram number -1840328771 out of range
>
> This is because I have this header in my input lm:
> ngram 4=2454638525
>
> So the number of 4grams is bigger than the maximum 32-bit int.
>
> I've fixed it by replacing
> int nNgrams;
> by
> long nNgrams;
> at line 497 in lm/src/NgramLM.cc and by replacing
> } else if (sscanf(line, "ngram %d=%d", &thisOrder, &nNgrams) == 2) {
> by
> } else if (sscanf(line, "ngram %d=%ld", &thisOrder, &nNgrams) == 2) {
> at line 515 in lm/src/NgramLM.cc
>
> Are there other places in the code that I should change? Is there a
> better solution for my problem?
>
> Thanks very much,
>
> Juan
>
>
> _______________________________________________
> SRILM-User site list
> SRILM-User at speech.sri.com
> http://www.speech.sri.com/mailman/listinfo/srilm-user

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.speech.sri.com/pipermail/srilm-user/attachments/20130919/8262e686/attachment.html>
-------------- next part --------------
*** lm/src/Ngram.h.dist	2013-07-02 20:23:07.385694200 -0700
--- lm/src/Ngram.h	2013-09-19 12:13:48.378147500 -0700
***************
*** 99,105 ****
      /*
       * Statistics
       */
!     virtual unsigned int numNgrams(unsigned int n) const;
      virtual void memStats(MemStats &stats);
  
      /*
--- 99,105 ----
      /*
       * Statistics
       */
!     virtual Count numNgrams(unsigned int n) const;
      virtual void memStats(MemStats &stats);
  
      /*
*** lm/src/NgramLM.cc.dist	2013-09-19 12:15:13.124134400 -0700
--- lm/src/NgramLM.cc	2013-09-19 12:16:15.577406400 -0700
***************
*** 407,417 ****
  {
      char *line;
      unsigned maxOrder = 0;	/* maximal n-gram order in this model */
!     unsigned numNgrams[maxNgramOrder + 1];
  				/* the number of n-grams for each order */
!     unsigned numRead[maxNgramOrder + 1];
  				/* Number of n-grams actually read */
!     unsigned numOOVs = 0;	/* Numer of n-gram skipped due to OOVs */
      int state = -1 ;		/* section of file being read:
  				 * -1 - pre-header, 0 - header,
  				 * 1 - unigrams, 2 - bigrams, ... */
--- 407,417 ----
  {
      char *line;
      unsigned maxOrder = 0;	/* maximal n-gram order in this model */
!     Count numNgrams[maxNgramOrder + 1];
  				/* the number of n-grams for each order */
!     Count numRead[maxNgramOrder + 1];
  				/* Number of n-grams actually read */
!     Count numOOVs = 0;	/* Numer of n-gram skipped due to OOVs */
      int state = -1 ;		/* section of file being read:
  				 * -1 - pre-header, 0 - header,
  				 * 1 - unigrams, 2 - bigrams, ... */
***************
*** 487,493 ****
  
  	case 0:		/* ngram header */
  	    unsigned thisOrder;
! 	    int nNgrams;
  
  	    if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
  		/*
--- 487,493 ----
  
  	case 0:		/* ngram header */
  	    unsigned thisOrder;
! 	    long long nNgrams;
  
  	    if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
  		/*
***************
*** 505,511 ****
  		}
  
  		continue;
! 	    } else if (sscanf(line, "ngram %d=%d", &thisOrder, &nNgrams) == 2) {
  		/*
  		 * scanned a line of the form
  		 *	ngram <N>=<howmany>
--- 505,511 ----
  		}
  
  		continue;
! 	    } else if (sscanf(line, "ngram %d=%lld", &thisOrder, &nNgrams) == 2) {
  		/*
  		 * scanned a line of the form
  		 *	ngram <N>=<howmany>
***************
*** 775,781 ****
  Ngram::writeWithOrder(File &file, unsigned order)
  {
      unsigned i;
!     unsigned howmanyNgrams[maxNgramOrder + 1];
      VocabIndex context[maxNgramOrder + 2];
      VocabString scontext[maxNgramOrder + 1];
  
--- 775,781 ----
  Ngram::writeWithOrder(File &file, unsigned order)
  {
      unsigned i;
!     Count howmanyNgrams[maxNgramOrder + 1];
      VocabIndex context[maxNgramOrder + 2];
      VocabString scontext[maxNgramOrder + 1];
  
***************
*** 787,793 ****
  
      for (i = 1; i <= order; i++ ) {
  	howmanyNgrams[i] = numNgrams(i);
! 	file.fprintf("ngram %d=%d\n", i, howmanyNgrams[i]);
      }
  
      for (i = 1; i <= order; i++ ) {
--- 787,793 ----
  
      for (i = 1; i <= order; i++ ) {
  	howmanyNgrams[i] = numNgrams(i);
! 	file.fprintf("ngram %d=%lld\n", i, (long long)howmanyNgrams[i]);
      }
  
      for (i = 1; i <= order; i++ ) {
***************
*** 1461,1473 ****
      return false;
  }
  
! unsigned int
  Ngram::numNgrams(unsigned int order) const
  {
      if (order < 1) {
  	return 0;
      } else {
! 	unsigned int howmany = 0;
  
  	makeArray(VocabIndex, context, order + 1);
  
--- 1461,1473 ----
      return false;
  }
  
! Count
  Ngram::numNgrams(unsigned int order) const
  {
      if (order < 1) {
  	return 0;
      } else {
! 	Count howmany = 0;
  
  	makeArray(VocabIndex, context, order + 1);
  


More information about the SRILM-User mailing list