[SRILM User List] arpa header number of 4g too big for int
Andreas Stolcke
stolcke at icsi.berkeley.edu
Thu Sep 19 13:27:35 PDT 2013
The attached patch should fix it. Note that this still doesn't support
vocabularies larger than 2^32, but the number of higher-order n-grams can
now be as large as 2^64.
Thanks for reporting this problem!
Andreas
On 9/19/2013 4:13 AM, Juan Pino wrote:
> Hello,
>
> I am running this command with version 1.7.0 (the purpose is to fix
> the format of my input lm):
>
> srilm1.7.0/bin/i686-m64/ngram -debug 1 -order 4 -lm
> MY_LM_IN_ARPA_FORMAT -write-lm MY_OUTPUT_LM
>
> I get this error:
>
> line 6: ngram number -1840328771 out of range
>
> This is because I have this header in my input lm:
> ngram 4=2454638525
>
> So the number of 4grams is bigger than the maximum 32-bit int.
>
> I've fixed it by replacing
> int nNgrams;
> by
> long nNgrams;
> at line 497 in lm/src/NgramLM.cc and by replacing
> } else if (sscanf(line, "ngram %d=%d", &thisOrder, &nNgrams) == 2) {
> by
> } else if (sscanf(line, "ngram %d=%ld", &thisOrder, &nNgrams) == 2) {
> at line 515 in lm/src/NgramLM.cc
>
> Are there other places in the code that I should change? Is there a
> better solution for my problem?
>
> Thanks very much,
>
> Juan
>
>
> _______________________________________________
> SRILM-User site list
> SRILM-User at speech.sri.com
> http://www.speech.sri.com/mailman/listinfo/srilm-user
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.speech.sri.com/pipermail/srilm-user/attachments/20130919/8262e686/attachment.html>
-------------- next part --------------
*** lm/src/Ngram.h.dist 2013-07-02 20:23:07.385694200 -0700
--- lm/src/Ngram.h 2013-09-19 12:13:48.378147500 -0700
***************
*** 99,105 ****
/*
* Statistics
*/
! virtual unsigned int numNgrams(unsigned int n) const;
virtual void memStats(MemStats &stats);
/*
--- 99,105 ----
/*
* Statistics
*/
! virtual Count numNgrams(unsigned int n) const;
virtual void memStats(MemStats &stats);
/*
*** lm/src/NgramLM.cc.dist 2013-09-19 12:15:13.124134400 -0700
--- lm/src/NgramLM.cc 2013-09-19 12:16:15.577406400 -0700
***************
*** 407,417 ****
{
char *line;
unsigned maxOrder = 0; /* maximal n-gram order in this model */
! unsigned numNgrams[maxNgramOrder + 1];
/* the number of n-grams for each order */
! unsigned numRead[maxNgramOrder + 1];
/* Number of n-grams actually read */
! unsigned numOOVs = 0; /* Numer of n-gram skipped due to OOVs */
int state = -1 ; /* section of file being read:
* -1 - pre-header, 0 - header,
* 1 - unigrams, 2 - bigrams, ... */
--- 407,417 ----
{
char *line;
unsigned maxOrder = 0; /* maximal n-gram order in this model */
! Count numNgrams[maxNgramOrder + 1];
/* the number of n-grams for each order */
! Count numRead[maxNgramOrder + 1];
/* Number of n-grams actually read */
! Count numOOVs = 0; /* Numer of n-gram skipped due to OOVs */
int state = -1 ; /* section of file being read:
* -1 - pre-header, 0 - header,
* 1 - unigrams, 2 - bigrams, ... */
***************
*** 487,493 ****
case 0: /* ngram header */
unsigned thisOrder;
! int nNgrams;
if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
/*
--- 487,493 ----
case 0: /* ngram header */
unsigned thisOrder;
! long long nNgrams;
if (backslash && sscanf(line, "\\%d-grams", &state) == 1) {
/*
***************
*** 505,511 ****
}
continue;
! } else if (sscanf(line, "ngram %d=%d", &thisOrder, &nNgrams) == 2) {
/*
* scanned a line of the form
* ngram <N>=<howmany>
--- 505,511 ----
}
continue;
! } else if (sscanf(line, "ngram %d=%lld", &thisOrder, &nNgrams) == 2) {
/*
* scanned a line of the form
* ngram <N>=<howmany>
***************
*** 775,781 ****
Ngram::writeWithOrder(File &file, unsigned order)
{
unsigned i;
! unsigned howmanyNgrams[maxNgramOrder + 1];
VocabIndex context[maxNgramOrder + 2];
VocabString scontext[maxNgramOrder + 1];
--- 775,781 ----
Ngram::writeWithOrder(File &file, unsigned order)
{
unsigned i;
! Count howmanyNgrams[maxNgramOrder + 1];
VocabIndex context[maxNgramOrder + 2];
VocabString scontext[maxNgramOrder + 1];
***************
*** 787,793 ****
for (i = 1; i <= order; i++ ) {
howmanyNgrams[i] = numNgrams(i);
! file.fprintf("ngram %d=%d\n", i, howmanyNgrams[i]);
}
for (i = 1; i <= order; i++ ) {
--- 787,793 ----
for (i = 1; i <= order; i++ ) {
howmanyNgrams[i] = numNgrams(i);
! file.fprintf("ngram %d=%lld\n", i, (long long)howmanyNgrams[i]);
}
for (i = 1; i <= order; i++ ) {
***************
*** 1461,1473 ****
return false;
}
! unsigned int
Ngram::numNgrams(unsigned int order) const
{
if (order < 1) {
return 0;
} else {
! unsigned int howmany = 0;
makeArray(VocabIndex, context, order + 1);
--- 1461,1473 ----
return false;
}
! Count
Ngram::numNgrams(unsigned int order) const
{
if (order < 1) {
return 0;
} else {
! Count howmany = 0;
makeArray(VocabIndex, context, order + 1);
More information about the SRILM-User
mailing list