Modifying ngram.cc

lambert mathias lambert at jhu.edu
Tue Jul 19 14:04:19 PDT 2005


Hi,

I tried using the copy constructors in srilm-1.4.5.
I initialized my LM as
Ngram useLM(ngramLM);
However, when I do a useLM->write(), the output shows some of the entried as
(null).

\data\
ngram 1=6
ngram 2=24
ngram 3=96

\1-grams:
0       (null)  -99
0       (null)  -99
-0.6083246      3       0
-0.6093718      4       0
-1.78533        </s>
-99     <s>     -99

\2-grams:
0       3 (null)        -99
0       3 (null)        -99
-0.6108742      3 3     0
-0.6078538      3 4     3.818703
-1.778024       3 </s>
 ............

I am trying to figure out if there is a bug in the copy constructor
initialization.

Lambert



On 7/17/05 8:47 PM, "Andreas Stolcke" <stolcke at speech.sri.com> wrote:

> The problem was that copy constructors (and assignment operators)
> for the container data structures (Array, SArray, LHash) weren't implemented.
> I fixed that in the current development version.
> No changes to any of the higher-level classes (like Ngram) is needed,
> as C++ automatically (and recursively) defines copy constructors based on the
> member copy constructors.
> 
> You can either replace the attached source files (in dstruct/src), or
> download the "1.4.5 (Beta)" version from the web page.
> 
> --Andreas
> 
> In message <BEF415D2.332F%lambert at jhu.edu>you wrote:
>> Hi,
>> 
>> This is kind of a C++ question
>> I wrote the following copy constructor in Ngram.h
>> Ngram(const Ngram
>> &ng):LM(ng.vocab),contexts(ng.contexts),order(ng.order),_skipOOVs(ng._skipOO
>> Vs),_trustTotals(ng._trustTotals){};
>> 
>> I have the following declarations
>> 
>> Ngram ngramLM(vocab*,order);
>> 
>> While(//<some condition>)
>>     Ngram useLM(ngramLM);
>> 
>>     // Do some stuff with useLM
>>     .........
>>     ........
>> }
>> 
>> The problem is that the assignment useLM=ngramLM doesn't assign the original
>> ngramLM. Any changes I make to useLM shows up in ngramLM too.
>> I just want to make a copy (useLM) of the original ngramLM, work on that
>> copy and then reinitialize another useLM with the original ngramLM.
>> 
>> Any thoughts?
>> 
>> 
>> Lambert
>> 
> 
> /*
>  * Array.h --
>  * Extensible array class
>  *
>  * Copyright (c) 1995-2005 SRI International.  All Rights Reserved.
>  *
>  * @(#)$Header: /home/srilm/devel/dstruct/src/RCS/Array.h,v 1.11 2005/07/17
> 22:12:21 stolcke Exp $
>  *
>  */
> 
> #ifndef _Array_h_
> #define _Array_h_
> 
> #include <assert.h>
> 
> #include "MemStats.h"
> 
> template <class DataT>
> class Array
> {
> public:
>     Array(int base = 0, unsigned int size = 0)
> : _base(base), _size(size), _data(0), alloc_size(0)
> { if (size > 0) { alloc(size-1); } };
>     Array(Array<DataT> &source)
> : _base(source._base), _size(0), _data(0), alloc_size(0)
> { *this = source; }
> 
>     ~Array() { delete [] _data; } ;
> 
>     DataT &operator[](int index)
> { int offset = index - _base; assert(offset >= 0);
>  if (offset >= _size) {
>    _size = offset + 1;
>    if (offset >= alloc_size) { alloc(offset); }
>  }
>  return _data[offset];
> };
> 
>     Array<DataT> & operator= (const Array<DataT> &other);
> 
>     DataT *data() const { return _data - _base; };
>     int base() const { return _base; }
>     unsigned int size() const { return _size; }
> 
>     void memStats(MemStats &stats) const;
> 
> private:
>     unsigned int _size;  /* used size */
>     int _base;
>     DataT *_data;
>     unsigned int alloc_size; /* allocated size */
>     void alloc(unsigned int size);
> };
> 
> #endif /* _Array_h_ */
> /*
>  * LHash.cc --
>  * Linear search hash table implementation
>  *
>  */
> 
> #ifndef _LHash_cc_
> #define _LHash_cc_
> 
> #ifndef lint
> static char LHash_Copyright[] = "Copyright (c) 1995-2005 SRI International.
> All Rights Reserved.";
> static char LHash_RcsId[] = "@(#)$Header:
> /home/srilm/devel/dstruct/src/RCS/LHash.cc,v 1.46 2005/07/17 22:01:31 stolcke
> Exp $";
> #endif
> 
> #include <iostream.h>
> #include <stdlib.h>
> #include <string.h>
> #include <assert.h>
> #include <new.h>
> 
> #include "LHash.h"
> 
> #undef INSTANTIATE_LHASH
> #define INSTANTIATE_LHASH(KeyT, DataT) \
> template <> DataT *LHash< KeyT, DataT >::removedData = 0; \
> template class LHash< KeyT, DataT >; \
> template class LHashIter< KeyT, DataT >
> 
> #ifndef __GNUG__
> template <class KeyT, class DataT>
> DataT *LHash<KeyT,DataT>::removedData = 0;
> #endif /* __GNUG__ */
> 
> const unsigned minHashBits = 3;  /* minimum no. bits for hashing
> * tables smaller than this use linear
> * search to save space */
> const float fillRatio = 0.8;  /* fill ration at which the table is
> * expanded and rehashed */
> 
> #define BODY(b) ((LHashBody<KeyT,DataT> *)b)
> 
> /*
>  * Dump the entire hash array to cerr.  Unused slots are printed as "FREE".
>  */
> template <class KeyT, class DataT>
> void
> LHash<KeyT,DataT>::dump() const
> {
>     if (body) {
> unsigned maxEntries = hashSize(BODY(body)->maxBits);
> unsigned i;
> 
> for (i = 0; i < maxEntries; i++) {
>    cerr << " " << i << ": ";
>    if (Map_noKeyP(BODY(body)->data[i].key)) {
> cerr << "FREE";
>    } else {
> cerr << BODY(body)->data[i].key
> #ifdef DUMP_VALUES
> /* 
> * Only certain types can be printed.
> */
>     << "->" << BODY(body)->data[i].value
> #endif /* DUMP_VALUES */
>     ;
>    }
> }
>     } else {
> cerr << "EMPTY";
>     }
>     cerr << endl;
> }
> 
> template <class KeyT, class DataT>
> void
> LHash<KeyT,DataT>::memStats(MemStats &stats) const
> {
>     stats.total += sizeof(*this);
>     if (body) {
>         unsigned maxEntries = hashSize(BODY(body)->maxBits);
> 
> stats.total += sizeof(*BODY(body)) +
> sizeof(BODY(body)->data[0]) *
> (maxEntries - 1);
> stats.wasted += sizeof(BODY(body)->data[0]) *
> (maxEntries - BODY(body)->nEntries);
>     }
> }
> 
> /*
>  * Compute the actual minimum size required for a given number of entries
>  */
> static inline unsigned
> roundSize(unsigned size)
> {
>     if (size < hashSize(minHashBits)) {
> return size;
>     } else {
> return (unsigned)((size + 1)/ fillRatio);
>     }
> }
> 
> template <class KeyT, class DataT>
> void
> LHash<KeyT,DataT>::alloc(unsigned size)
> {
>     unsigned maxBits, maxEntries;
>     unsigned i;
> 
>     /*
>      * round up to power of two
>      */
>     maxBits = 0;
>     while (hashSize(maxBits) < size) {
> assert(maxBits < LHash_maxBitLimit);
> maxBits++;
>     }
> 
>     maxEntries = hashSize(maxBits);
> 
>     //cerr << "LHash::alloc: current " << (body ? BODY(body)->nEntries : 0)
>     //  << ", requested " << size
>     //  << ", allocating " << maxEntries << " (" << maxBits << ")\n";
> 
>     body = (LHashBody<KeyT,DataT> *)malloc(sizeof(*BODY(body)) +
>       (maxEntries - 1) * sizeof(BODY(body)->data[0]));
>     assert(body != 0);
> 
>     BODY(body)->maxBits = maxBits;
>     BODY(body)->nEntries = 0;
> 
>     for (i = 0; i < maxEntries; i++) {
> Map_noKey(BODY(body)->data[i].key);
>     }
>     //cerr << "memory for header = " <<
>     //  ((void *)&(BODY(body)->data[0]) - (void *)BODY(body)) << endl;
>     //cerr << "memory per entry = " <<
>     //  ((void *)&(BODY(body)->data[3]) - (void *)&(BODY(body)->data[2])) <<
> endl;
> }
> 
> template <class KeyT, class DataT>
> LHash<KeyT,DataT>::LHash(unsigned size)
>     : body(0)
> {
>     if (size != 0) {
> /*
> * determine actual initial size
> */
> alloc(roundSize(size));
>     }
> }
> 
> template <class KeyT, class DataT>
> void
> LHash<KeyT,DataT>::clear(unsigned size)
> {
>     if (body) {
> unsigned maxEntries = hashSize(BODY(body)->maxBits);
> unsigned i;
> 
> for (i = 0; i < maxEntries; i++) {
>    if (! Map_noKeyP(BODY(body)->data[i].key)) {
> Map_freeKey(BODY(body)->data[i].key);
>    }
> }
> free(body);
> body = 0;
>     }
>     if (size != 0) {
> alloc(roundSize(size));
>     }
> }
> 
> template <class KeyT, class DataT>
> LHash<KeyT,DataT>::~LHash()
> {
>     clear(0);
> }
> 
> template <class KeyT, class DataT>
> LHash<KeyT,DataT>::LHash(const LHash<KeyT,DataT> &source)
>     : body(0)
> {
> #ifdef DEBUG
>     cerr << "warning: LHash copy constructor called\n";
> #endif
>     *this = source;
> }
> 
> template <class KeyT, class DataT>
> LHash<KeyT,DataT> &
> LHash<KeyT,DataT>::operator= (const LHash<KeyT,DataT> &other)
> {
> #ifdef DEBUG
>     cerr << "warning: LHash::operator= called\n";
> #endif
> 
>     if (&other == this) {
> return *this;
>     }
> 
>     /*
>      * copy hash entries from old to new
>      */
>     if (other.body) {
> unsigned maxEntries = hashSize(BODY(other.body)->maxBits);
> clear(maxEntries);
> 
> for (unsigned i = 0; i < maxEntries; i++) {
>    KeyT thisKey = BODY(other.body)->data[i].key;
> 
>    if (!Map_noKeyP(thisKey)) {
> /*
> * Copy key
> */
> BODY(body)->data[i].key = Map_copyKey(thisKey);
> 
> /*
> * Initialize data, required for = operator
> */
> new (&(BODY(body)->data[i].value)) DataT;
> 
> /*
> * Copy data
> */
> BODY(body)->data[i].value = BODY(other.body)->data[i].value;
>    }
> }
> BODY(body)->nEntries = BODY(other.body)->nEntries;
>     } else {
> clear(0);
>     }
> 
>     return *this;
> }
> 
> /*
>  * Returns index into data array that has the key which is either
>  * equal to key, or indicates the place where key would be placed if it
>  * occurred in the array.
>  */
> template <class KeyT, class DataT>
> Boolean
> LHash<KeyT,DataT>::locate(KeyT key, unsigned &index) const
> {
>     assert(!Map_noKeyP(key));
> 
>     if (body) {
> unsigned maxBits = BODY(body)->maxBits;
> register MapEntry<KeyT,DataT> *data = BODY(body)->data;
> 
> if (maxBits < minHashBits) {
>    /*
>     * Do a linear search
>     */
>    unsigned nEntries = BODY(body)->nEntries;
>    register unsigned i;
> 
>    for (i = 0; i < nEntries; i++) {
> if (LHash_equalKey(data[i].key, key)) {
>    index = i;
>    return true;
> }
>    }
>    index = i;
>    return false;
> } else {
>    /*
>     * Do a hashed search
>     */
>    unsigned hash = LHash_hashKey(key, maxBits);
>    unsigned i; 
> 
>    for (i = hash; ; i = (i + 1) & hashMask(maxBits))
>    {
> if (Map_noKeyP(data[i].key)) {
>    index = i;
>    return false;
> } else if (LHash_equalKey(data[i].key, key)) {
>    index = i;
>    return true;
> }
>    }
> }
>     } else {
> return false;
>     }
> }
> 
> template <class KeyT, class DataT>
> DataT *
> LHash<KeyT,DataT>::find(KeyT key, Boolean &foundP) const
> {
>     unsigned index;
> 
>     if (foundP = locate(key, index)) {
> return &(BODY(body)->data[index].value);
>     } else {
> return 0;
>     }
> }
> 
> template <class KeyT, class DataT>
> KeyT
> LHash<KeyT,DataT>::getInternalKey(KeyT key, Boolean &foundP) const
> {
>     unsigned index;
>     static KeyT zeroKey;
> 
>     if (foundP = locate(key, index)) {
> return BODY(body)->data[index].key;
>     } else {
> return zeroKey;
>     }
> }
> 
> template <class KeyT, class DataT>
> DataT *
> LHash<KeyT,DataT>::insert(KeyT key, Boolean &foundP)
> {
>     unsigned index;
> 
>     assert(!(Map_noKeyP(key)));
> 
>     /*
>      * Make sure there is room for at least one entry
>      */
>     if (body == 0) {
> alloc(1);
>     }
> 
>     if (foundP = locate(key, index)) {
> return &(BODY(body)->data[index].value);
>     } else {
> unsigned maxEntries = hashSize(BODY(body)->maxBits);
> unsigned nEntries = BODY(body)->nEntries;
> 
> /*
> * Rehash table if necessary
> */
> unsigned minSize = roundSize(nEntries + 1);
> 
> if (minSize > maxEntries) {
>    LHashBody<KeyT,DataT> *oldBody = BODY(body);
>    unsigned i;
> 
>    /*
>     * Since LHash_maxEntriesLimit is a power of two minus 1
>     * we need to check this only when the array is enlarged
>     */
>    assert(nEntries < LHash_maxEntriesLimit);
> 
>    alloc(minSize);
>    BODY(body)->nEntries = nEntries;
> 
>    if (BODY(body)->maxBits < minHashBits) {
> /*
> * Just copy old entries to new storage, no reindexing
> * required.
> */
> memcpy(BODY(body)->data, oldBody->data,
> sizeof(oldBody->data[0]) * nEntries);
>    } else {
> /*
> * Rehash
> */
> for (i = 0; i < maxEntries; i++) {
>    KeyT key = oldBody->data[i].key;
> 
>    if (! Map_noKeyP(key)) {
> (void)locate(key, index);
> memcpy(&(BODY(body)->data[index]), &(oldBody->data[i]),
> sizeof(oldBody->data[0]));
>    }
> }
>    }
>    free(oldBody);
> 
>    /*
>     * Entries have been moved, so have to re-locate key
>     */
>    (void)locate(key, index);
> }
> 
> BODY(body)->data[index].key = Map_copyKey(key);
> 
> /*
> * Initialize data to zero, but also call constructors, if any
> */
> memset(&(BODY(body)->data[index].value), 0,
> sizeof(BODY(body)->data[index].value));
> new (&(BODY(body)->data[index].value)) DataT;
> 
> BODY(body)->nEntries++;
> 
> return &(BODY(body)->data[index].value);
>     }
> }
> 
> template <class KeyT, class DataT>
> DataT *
> LHash<KeyT,DataT>::remove(KeyT key, Boolean &foundP)
> {
>     unsigned index;
> 
>     /*
>      * Allocate pseudo-static memory for removed objects (returned by
>      * remove()). We cannot make this static because otherwise
>      * the destructor for DataT would be called on it at program exit.
>      */
>     if (removedData == 0) {
> removedData = (DataT *)malloc(sizeof(DataT));
> assert(removedData != 0);
>     }
> 
>     if (foundP = locate(key, index)) {
> Map_freeKey(BODY(body)->data[index].key);
> Map_noKey(BODY(body)->data[index].key);
> memcpy(removedData, &BODY(body)->data[index].value, sizeof(DataT));
> 
> if (BODY(body)->maxBits < minHashBits) {
>    /*
>     * Linear search mode -- Just move all entries above the
>     * the one removed to fill the gap.
>     */
>    unsigned nEntries = BODY(body)->nEntries;
> 
>    memmove(&BODY(body)->data[index],
>    &BODY(body)->data[index+1],
>    (nEntries - index - 1) * sizeof(BODY(body)->data[0]));
>    Map_noKey(BODY(body)->data[nEntries - 1].key);
> } else {
>    /*
>     * The entry after the one being deleted could actually
>     * belong to a prior spot in the table, but was bounced forward due
>     * to a collision.   The invariant used in lookup is that
>     * all locations between the hash index and the actual index are
>     * filled.  Hence we examine all entries between the deleted
>     * position and the next free position as whether they need to
>     * be moved backward.
>     */
>    while (1) {
> unsigned newIndex;
> 
> index = (index + 1) & hashMask(BODY(body)->maxBits);
> 
> if (Map_noKeyP(BODY(body)->data[index].key)) {
>    break;
> }
> 
> /* 
> * If find returns false that means the deletion has
> * introduced a hole in the table that would prevent
> * us from finding the next entry. Luckily, find
> * also tells us where the hole is.  We move the
> * entry to its rightful place and continue filling
> * the hole created by this move.
> */
> if (!locate(BODY(body)->data[index].key, newIndex)) {
>    memcpy(&(BODY(body)->data[newIndex]),
>   &(BODY(body)->data[index]),
>   sizeof(BODY(body)->data[0]));
>    Map_noKey(BODY(body)->data[index].key);
> }
>    }
> }
> BODY(body)->nEntries--;
> return removedData;
>     } else {
> return 0;
>     }
> }
> 
> #ifdef __GNUG__
> static
> #endif
> int (*LHash_thisKeyCompare)();  /* used by LHashIter() to communicate
> * with compareIndex() */
> #ifdef __GNUG__
> static
> #endif
> void *LHash_thisBody;   /* ditto */
> 
> template <class KeyT, class DataT>
> void
> LHashIter<KeyT,DataT>::sortKeys()
> {
>     /*
>      * Store keys away and sort them to the user's orders.
>      */
>     unsigned maxEntries = hashSize(myLHashBody->maxBits);
> 
>     unsigned *sortedIndex = new unsigned[numEntries];
>     assert(sortedIndex != 0);
> 
>     unsigned i;
> 
>     unsigned j = 0;
>     for (i = 0; i < maxEntries; i++) {
> if (!Map_noKeyP(myLHashBody->data[i].key)) {
>    sortedIndex[j++] = i;
> }
>     }
>     assert(j == numEntries);
> 
>     /*
>      * Due to the limitations of the qsort interface we have to
>      * pass extra information to compareIndex in these global
>      * variables - yuck.
>      */
>     LHash_thisKeyCompare = (int(*)())sortFunction;
>     LHash_thisBody = myLHashBody;
>     qsort(sortedIndex, numEntries, sizeof(*sortedIndex), compareIndex);
> 
>     /*
>      * Save the keys for enumeration.  The reason we save the keys,
>      * not the indices, is that indices may change during enumeration
>      * due to deletions.
>      */
>     sortedKeys = new KeyT[numEntries];
>     assert(sortedKeys != 0);
> 
>     for (i = 0; i < numEntries; i++) {
> sortedKeys[i] = myLHashBody->data[sortedIndex[i]].key;
>     }
> 
>     delete [] sortedIndex;
> }
> 
> template <class KeyT, class DataT>
> LHashIter<KeyT,DataT>::LHashIter(const LHash<KeyT,DataT> &lhash,
>    int (*keyCompare)(KeyT, KeyT))
>     : myLHashBody(BODY(lhash.body)), current(0),
>       numEntries(lhash.numEntries()), sortFunction(keyCompare)
> {
>     /*
>      * Note: we access the underlying LHash through the body pointer,
>      * not the top-level LHash itself.  This allows the top-level object
>      * to be moved without affecting an ongoing iteration.
>      * XXX: This only works because
>      * - it is illegal to insert while iterating
>      * - deletions don't cause reallocation of the data
>      */
>     if (sortFunction && myLHashBody) {
> sortKeys();
>     } else {
> sortedKeys = 0;
>     }
> }
> 
> /*
>  * This is the callback function passed to qsort for comparing array
>  * entries. It is passed the indices into the data array, which are
>  * compared according to the user-supplied function applied to the
>  * keys found at those locations.
>  */
> template <class KeyT, class DataT>
> int
> LHashIter<KeyT,DataT>::compareIndex(const void *idx1, const void *idx2)
> {
>     return (*(compFnType)LHash_thisKeyCompare)
>    (BODY(LHash_thisBody)->data[*(unsigned *)idx1].key,
>     BODY(LHash_thisBody)->data[*(unsigned *)idx2].key);
> }
> 
> template <class KeyT, class DataT>
> LHashIter<KeyT,DataT>::~LHashIter()
> {
>     delete [] sortedKeys;
> }
> 
> 
> template <class KeyT, class DataT>
> void 
> LHashIter<KeyT,DataT>::init()
> {
>     delete [] sortedKeys;
> 
>     current = 0;
> 
>     {
> /*
> * XXX: fake LHash object to access numEntries()
> */
> LHash<KeyT,DataT> myLHash(0);
> 
> myLHash.body = myLHashBody;
> numEntries = myLHash.numEntries();
> myLHash.body = 0;
>     }
> 
>     if (sortFunction && myLHashBody) {
> sortKeys();
>     } else {
> sortedKeys = 0;
>     }
> }
> 
> template <class KeyT, class DataT>
> DataT *
> LHashIter<KeyT,DataT>::next(KeyT &key)
> {
> 
>     if (myLHashBody == 0) {
> return 0;
>     } else {
> unsigned index;
> 
> if (sortedKeys) {
>    /*
>     * Sorted enumeration -- advance to next key in sorted list
>     */
>    if (current == numEntries) {
> return 0;
>    }
> 
>    /*
>     * XXX: fake LHash object to access locate()
>     */
>    LHash<KeyT,DataT> myLHash(0);
> 
>    myLHash.body = myLHashBody;
>    myLHash.locate(sortedKeys[current++], index);
>    myLHash.body = 0;;
> } else {
>    /*
>     * Detect deletions by comparing old and current number of
>     * entries.
>     * A legal deletion can only affect the current entry, so
>     * adjust the current position to reflect the next entry was
>     * moved.
>     */
>    if (myLHashBody->nEntries != numEntries) {
> assert(myLHashBody->nEntries == numEntries - 1);
> 
> numEntries --;
> current --;
>    }
> 
>    /*
>     * Random enumeration mode
>     */
>    unsigned maxBits = myLHashBody->maxBits;
> 
>    if (maxBits < minHashBits) {
> /*
> * Linear search mode - advance one to the next entry
> */
> if (current == numEntries) {
>    return 0;
> }
>    } else {
> unsigned maxEntries = hashSize(maxBits);
> 
> while (current < maxEntries &&
>       Map_noKeyP(myLHashBody->data[current].key))
> {
>    current++;
> }
> 
> if (current == maxEntries) {
>    return 0;
> }
>    }
> 
>    index = current ++;
> }
> 
> key = myLHashBody->data[index].key;
> return &(myLHashBody->data[index].value);
>     }
> }
> 
> #undef BODY
> 
> #endif /* _LHash_cc_ */
> /*
>  * SArray.cc --
>  * Sorted array implementation
>  *
>  */
> 
> #ifndef _SArray_cc_
> #define _SArray_cc_
> 
> #ifndef lint
> static char SArray_Copyright[] = "Copyright (c) 1995-2005 SRI International.
> All Rights Reserved.";
> static char SArray_RcsId[] = "@(#)$Header:
> /home/srilm/devel/dstruct/src/RCS/SArray.cc,v 1.37 2005/07/17 22:01:31 stolcke
> Exp $";
> #endif
> 
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <assert.h>
> #include <new.h>
> 
> #include "SArray.h"
> 
> #undef INSTANTIATE_SARRAY
> #define INSTANTIATE_SARRAY(KeyT, DataT) \
> template <> DataT *SArray< KeyT, DataT >::removedData = 0; \
> template class SArray< KeyT, DataT >; \
> template class SArrayIter< KeyT, DataT >
> 
> #ifndef __GNUG__
> template <class KeyT, class DataT>
> DataT *SArray<KeyT,DataT>::removedData = 0;
> #endif /* __GNUG__ */
> 
> #define BODY(b) ((SArrayBody<KeyT,DataT> *)b)
> 
> const double growSize = 1.1;
> 
> template <class KeyT, class DataT>
> void
> SArray<KeyT,DataT>::dump() const
> {
>     if (body == 0) {
> cerr << "EMPTY" << endl;
>     } else {
> unsigned nEntries = numEntries();
> 
> cerr << "maxEntries = " << BODY(body)->maxEntries;
> cerr << " nEntries = " << nEntries;
> 
> for (unsigned i = 0; i < nEntries; i++) {
>    cerr << " " << i << ": ";
>    cerr << BODY(body)->data[i].key
> #ifdef DUMP_VALUES
> /* 
> * Only certain types can be printed.
> */
>     << "->" << BODY(body)->data[i].value
> #endif /* DUMP_VALUES */
>     ;
> }
>     }
> }
> 
> template <class KeyT, class DataT>
> void
> SArray<KeyT,DataT>::memStats(MemStats &stats) const
> {
>     stats.total += sizeof(*this);
> 
>     if (body) {
> unsigned maxEntries = BODY(body)->maxEntries;
> 
> stats.total += sizeof(*BODY(body)) +
> sizeof(BODY(body)->data[0]) *
> (maxEntries - 1);
> stats.wasted += sizeof(BODY(body)->data[0]) *
> (maxEntries - numEntries());
>     }
> }
> 
> template <class KeyT, class DataT>
> void
> SArray<KeyT,DataT>::alloc(unsigned size)
> {
>     body = (SArrayBody<KeyT,DataT> *)malloc(sizeof(*BODY(body)) +
>   (size - 1) * sizeof(BODY(body)->data[0]));
>     assert(body != 0);
> 
>     BODY(body)->deleted = 0;
>     BODY(body)->maxEntries = size;
> 
>     /*
>      * Fill the array with dummy keys, marking the unused space
>      */
>     for (unsigned i = 0; i < size; i ++) {
> Map_noKey(BODY(body)->data[i].key);
>     }
> }
> 
> template <class KeyT, class DataT>
> SArray<KeyT,DataT>::SArray(unsigned size)
>     : body(0)
> {
>     if (size != 0) {
> alloc(size);
>     }
> }
> 
> template <class KeyT, class DataT>
> void
> SArray<KeyT,DataT>::clear(unsigned size)
> {
>     if (body) {
> unsigned maxEntries = BODY(body)->maxEntries;
> 
> for (unsigned i = 0; i < maxEntries; i++) {
>    KeyT thisKey = BODY(body)->data[i].key;
> 
>    if (Map_noKeyP(thisKey)) {
> break;
>    }
>    Map_freeKey(thisKey);
> }
> free(body);
> body = 0;
>     }
>     if (size != 0) {
> alloc(size);
>     }
> }
> 
> template <class KeyT, class DataT>
> SArray<KeyT,DataT>::~SArray()
> {
>     clear(0);
> }
> 
> template <class KeyT, class DataT>
> unsigned
> SArray<KeyT,DataT>::numEntries() const
> {
>     if (body == 0) {
> return 0;
>     } else if (Map_noKeyP(BODY(body)->data[0].key)) {
> return 0;
>     } else {
> /*
> * Do a binary search for the end of the used memory
> * lower always points to a filled entry
> * upper always points to a free entry beyond the end of used entries
> */
> unsigned lower, upper;
> 
> lower = 0;   /* minimum index (inclusive) */
> upper = BODY(body)->maxEntries; /* maximum index (exclusive) */
> 
> while (lower + 1 < upper) {
>    unsigned middle = (lower + upper)/2;
> 
>    if (Map_noKeyP(BODY(body)->data[middle].key)) {
>    upper = middle;
>    } else {
>    lower = middle;
>    }
> }
> 
> /* lower + 1 == upper */
> return upper;
>     }
> }
> 
> template <class KeyT, class DataT>
> SArray<KeyT,DataT>::SArray(const SArray<KeyT,DataT> &source)
>     : body(0)
> {
> #ifdef DEBUG
>     cerr << "warning: SArray copy constructor called\n";
> #endif
>     *this = source;
> }
> 
> template <class KeyT, class DataT>
> SArray<KeyT,DataT> &
> SArray<KeyT,DataT>::operator= (const SArray<KeyT,DataT> &other)
> {
> #ifdef DEBUG
>     cerr << "warning: SArray::operator= called\n";
> #endif
> 
>     if (&other == this) {
> return *this;
>     }
> 
>     /*
>      * copy array entries from old to new
>      */
>     if (other.body) {
> unsigned maxEntries = BODY(other.body)->maxEntries;
> clear(maxEntries);
> 
> for (unsigned i = 0; i < maxEntries; i++) {
>    KeyT thisKey = BODY(other.body)->data[i].key;
> 
>    if (Map_noKeyP(thisKey)) {
> break;
>    }
> 
>    /*
>     * Copy key
>     */
>    BODY(body)->data[i].key = Map_copyKey(thisKey);
> 
>    /*
>     * Initialize data, required for = operator
>     */
>    new (&(BODY(body)->data[i].value)) DataT;
> 
>    /*
>     * Copy data
>     */
>    BODY(body)->data[i].value = BODY(other.body)->data[i].value;
> 
> }
>     } else {
> clear(0);
>     }
> 
>     return *this;
> }
> 
> /*
>  * Returns index into data array that has the key which is either
>  * equal to key, or indicates the place where key would be placed if it
>  * occurred in the array.
>  */
> template <class KeyT, class DataT>
> Boolean
> SArray<KeyT,DataT>::locate(KeyT key, unsigned &index) const
> {
>     assert(!Map_noKeyP(key));
> 
>     if (body) {
> unsigned lower, upper;
> 
> lower = 0;   /* minimum index (inclusive) */
> upper = BODY(body)->maxEntries; /* maximum index (exclusive) */
> 
> while (lower < upper) {
>    unsigned middle = (lower + upper)/2;
> 
>    KeyT thisKey = BODY(body)->data[middle].key;
> 
>    if (Map_noKeyP(thisKey)) {
> upper = middle;
> continue;
>    }
>    
>    int compare = SArray_compareKey(key, thisKey);
> 
>    if (compare < 0) {
>    upper = middle;
>    } else if (compare > 0) {
>    lower = middle + 1;
>    } else {
>    index = middle;
>    return true;
>    }
> }
> 
> /* we have lower == upper */
> if (lower == BODY(body)->maxEntries ||
>    Map_noKeyP(BODY(body)->data[lower].key) ||
>    SArray_compareKey(key, BODY(body)->data[lower].key) < 0)
> {
>    index = lower;
> } else  {
>    index = lower + 1;
> }
> return false;
>     } else {
> return false;
>     }
> }
> 
> template <class KeyT, class DataT>
> DataT *
> SArray<KeyT,DataT>::find(KeyT key, Boolean &foundP) const
> {
>     unsigned index;
> 
>     if (foundP = locate(key, index)) {
> return &(BODY(body)->data[index].value);
>     } else {
> return 0;
>     }
> }
> 
> template <class KeyT, class DataT>
> KeyT
> SArray<KeyT,DataT>::getInternalKey(KeyT key, Boolean &foundP) const
> {
>     unsigned index;
>     static KeyT zeroKey;
> 
>     if (foundP = locate(key, index)) {
> return BODY(body)->data[index].key;
>     } else {
> return zeroKey;
>     }
> }
> 
> template <class KeyT, class DataT>
> DataT *
> SArray<KeyT,DataT>::insert(KeyT key, Boolean &foundP)
> {
>     unsigned index;
> 
>     /*
>      * Make sure there is room for at least one entry
>      */
>     if (body == 0) {
> alloc(1);
>     }
> 
>     if (foundP = locate(key, index)) {
> return &(BODY(body)->data[index].value);
>     } else {
> unsigned nEntries = numEntries();
> unsigned maxEntries = BODY(body)->maxEntries;
> 
> /*
> * Need to add an element.
> * First, enlarge array if necessary
> */
> if (nEntries == maxEntries) {
>    maxEntries = (int)(maxEntries * growSize);
>    if (maxEntries == nEntries) {
> maxEntries ++;
>    }
>    body = (SArrayBody<KeyT,DataT> *)realloc(body,
> sizeof(*BODY(body)) +
>   (maxEntries - 1) * sizeof(BODY(body)->data[0]));
>    assert(body != 0);
> 
>    /*
>     * Fill new space with dummy keys
>     */
>    for (unsigned i = BODY(body)->maxEntries; i < maxEntries; i ++) {
> Map_noKey(BODY(body)->data[i].key);
>    }
> 
>    BODY(body)->maxEntries = maxEntries;
> }
> 
> memmove(&(BODY(body)->data[index + 1]),
> &(BODY(body)->data[index]),
> (nEntries - index) * sizeof(BODY(body)->data[0]));
> 
> BODY(body)->data[index].key = Map_copyKey(key);
> 
> /*
> * Initialize data to zero, but also call constructors, if any
> */
> memset(&(BODY(body)->data[index].value), 0,
>   sizeof(BODY(body)->data[index].value));
> new (&(BODY(body)->data[index].value)) DataT;
> 
> return &(BODY(body)->data[index].value);
>     }
> }
>   
> template <class KeyT, class DataT>
> DataT *
> SArray<KeyT,DataT>::remove(KeyT key, Boolean &foundP)
> {
>     unsigned index;
> 
>     /*
>      * Allocate pseudo-static memory for removed objects (returned by
>      * remove()). We cannot make this static because otherwise
>      * the destructor for DataT would be called on it at program exit.
>      */
>     if (removedData == 0) {
> removedData = (DataT *)malloc(sizeof(DataT));
> assert(removedData != 0);
>     }
> 
>     if (foundP = locate(key, index)) {
> unsigned nEntries = numEntries();
> 
> Map_freeKey(BODY(body)->data[index].key);
> memcpy(removedData, &BODY(body)->data[index].value, sizeof(DataT));
> 
> memmove(&(BODY(body)->data[index]),
> &(BODY(body)->data[index + 1]),
> (nEntries - index - 1) * sizeof(BODY(body)->data[0]));
> 
> /*
> * mark previous last slot as free
> */
> Map_noKey(BODY(body)->data[nEntries-1].key);
> 
> /*
> * Indicate that deletion occurred
> */
> BODY(body)->deleted = 1;
> 
> return removedData;
>     } else {
> return 0;
>     }
> }
>   
> #ifdef __GNUG__
> static
> #endif
> int (*SArray_thisKeyCompare)();  /* used by SArrayIter() to communicate
> * with compareIndex() */
> #ifdef __GNUG__
> static
> #endif
> void *SArray_thisBody;   /* ditto */
> 
> 
> template <class KeyT, class DataT>
> void
> SArrayIter<KeyT,DataT>::sortKeys()
> {
>     /*
>      * Store keys away and sort them to the user's orders.
>      */
>     unsigned *sortedIndex = new unsigned[numEntries];
>     assert(sortedIndex != 0);
> 
>     unsigned i;
>     for (i = 0; i < numEntries; i++) {
> sortedIndex[i] = i;
>     }
> 
>     /*
>      * Due to the limitations of the qsort interface we have to
>      * pass extra information to compareIndex in these global
>      * variables - yuck.
>      */
>     SArray_thisKeyCompare = (int (*)())sortFunction;
>     SArray_thisBody = mySArrayBody;
>     qsort(sortedIndex, numEntries, sizeof(*sortedIndex), compareIndex);
> 
>     /*
>      * Save the keys for enumeration.  The reason we save the keys,
>      * not the indices, is that indices may change during enumeration
>      * due to deletions.
>      */
>     sortedKeys = new KeyT[numEntries];
>     assert(sortedKeys != 0);
> 
>     for (i = 0; i < numEntries; i++) {
> sortedKeys[i] = mySArrayBody->data[sortedIndex[i]].key;
>     }
> 
>     delete [] sortedIndex;
> }
> 
> template <class KeyT, class DataT>
> SArrayIter<KeyT,DataT>::SArrayIter(const SArray<KeyT,DataT> &sarray,
> int (*keyCompare)(KeyT, KeyT))
>     : mySArrayBody(BODY(sarray.body)), current(0),
>       numEntries(sarray.numEntries()), sortFunction(keyCompare)
> {
>     /*
>      * Note: we access the underlying SArray through the body pointer,
>      * not the top-level SArray itself.  This allows the top-level object
>      * to be moved without affecting an ongoing iteration.
>      * XXX: This only works because
>      * - it is illegal to insert while iterating
>      * - deletions don't cause reallocation of the data
>      */
>     if (sortFunction && mySArrayBody) {
> sortKeys();
>     } else {
> sortedKeys = 0;
>     }
> 
>     if (mySArrayBody) {
> mySArrayBody->deleted = 0;
>     }
> }
> 
> /*
>  * This is the callback function passed to qsort for comparing array
>  * entries. It is passed the indices into the data array, which are
>  * compared according to the user-supplied function applied to the
>  * keys found at those locations.
>  */
> template <class KeyT, class DataT>
> int
> SArrayIter<KeyT,DataT>::compareIndex(const void *idx1, const void *idx2)
> {
>   return (*(compFnType)SArray_thisKeyCompare)
> (BODY(SArray_thisBody)->data[*(unsigned *)idx1].key,
> BODY(SArray_thisBody)->data[*(unsigned *)idx2].key);
> }
> 
> template <class KeyT, class DataT>
> SArrayIter<KeyT,DataT>::~SArrayIter()
> {
>     delete [] sortedKeys;
> }
> 
> template <class KeyT, class DataT>
> void
> SArrayIter<KeyT,DataT>::init()
> {
>     delete [] sortedKeys;
>     sortedKeys = 0;
> 
>     current = 0;
> 
>     {
> /*
> * XXX: fake SArray object to access numEntries()
> */
> SArray<KeyT,DataT> mySArray(0);
> 
> mySArray.body = mySArrayBody;
> numEntries = mySArray.numEntries();
> mySArray.body = 0;
>     }
> 
>     if (mySArrayBody) {
> if (sortFunction) {
>    sortKeys();
> }
> mySArrayBody->deleted = 0;
>     }
> }
> 
> template <class KeyT, class DataT>
> DataT *
> SArrayIter<KeyT,DataT>::next(KeyT &key)
> {
>     if (mySArrayBody == 0) {
> return 0;
>     } else {
> unsigned index;
> 
> if (sortedKeys == 0) {
>    /*
>     * Detect deletion while iterating.
>     * A legal deletion can only affect the current entry, so
>     * adjust the current position to reflect the next entry was
>     * moved.
>     */
>    if (mySArrayBody->deleted) {
> numEntries --;
> current --;
>    }
> 
>    if (current == numEntries) {
> return 0;
>    }
> 
>    index = current++;
> } else {
>    if (current == numEntries) {
> return 0;
>    }
> 
>    /*
>     * XXX: fake an SArray object to access locate()
>     */
>    SArray<KeyT,DataT> mySArray(0);
> 
>    mySArray.body = mySArrayBody;
>    (void)mySArray.locate(sortedKeys[current++], index);
>    mySArray.body = 0;
> }
> mySArrayBody->deleted = 0;
> 
> key = mySArrayBody->data[index].key;
> return &(mySArrayBody->data[index].value);
>     }
> }
> 
> #undef BODY
> 
> #endif /* _SArray_cc_ */




More information about the SRILM-User mailing list