Modifying ngram.cc

Andreas Stolcke stolcke at speech.sri.com
Sun Jul 17 17:47:57 PDT 2005


The problem was that copy constructors (and assignment operators)
for the container data structures (Array, SArray, LHash) weren't implemented.
I fixed that in the current development version.   
No changes to any of the higher-level classes (like Ngram) is needed,
as C++ automatically (and recursively) defines copy constructors based on the 
member copy constructors.

You can either replace the attached source files (in dstruct/src), or
download the "1.4.5 (Beta)" version from the web page.

--Andreas

In message <BEF415D2.332F%lambert at jhu.edu>you wrote:
> Hi,
> 
> This is kind of a C++ question
> I wrote the following copy constructor in Ngram.h
> Ngram(const Ngram 
> &ng):LM(ng.vocab),contexts(ng.contexts),order(ng.order),_skipOOVs(ng._skipOO
> Vs),_trustTotals(ng._trustTotals){};
> 
> I have the following declarations
> 
> Ngram ngramLM(vocab*,order);
> 
> While(//<some condition>)
>     Ngram useLM(ngramLM);
> 
>     // Do some stuff with useLM
>     .........
>     ........
> }
> 
> The problem is that the assignment useLM=ngramLM doesn't assign the original
> ngramLM. Any changes I make to useLM shows up in ngramLM too.
> I just want to make a copy (useLM) of the original ngramLM, work on that
> copy and then reinitialize another useLM with the original ngramLM.
> 
> Any thoughts?
> 
> 
> Lambert
> 

-------------- next part --------------
/*
 * Array.h --
 *	Extensible array class
 *
 * Copyright (c) 1995-2005 SRI International.  All Rights Reserved.
 *
 * @(#)$Header: /home/srilm/devel/dstruct/src/RCS/Array.h,v 1.11 2005/07/17 22:12:21 stolcke Exp $
 *
 */

#ifndef _Array_h_
#define _Array_h_

#include <assert.h>

#include "MemStats.h"

template <class DataT>
class Array
{
public:
    Array(int base = 0, unsigned int size = 0)
	: _base(base), _size(size), _data(0), alloc_size(0)
	{ if (size > 0) { alloc(size-1); } };
    Array(Array<DataT> &source)
	: _base(source._base), _size(0), _data(0), alloc_size(0)
    	{ *this = source; }

    ~Array() { delete [] _data; } ;

    DataT &operator[](int index)
    	{ int offset = index - _base; assert(offset >= 0);
	  if (offset >= _size) {
	    _size = offset + 1;
	    if (offset >= alloc_size) { alloc(offset); }
	  }
	  return _data[offset];
	};

    Array<DataT> & operator= (const Array<DataT> &other);

    DataT *data() const { return _data - _base; };
    int base() const { return _base; }
    unsigned int size() const { return _size; }

    void memStats(MemStats &stats) const;

private:
    unsigned int _size;		/* used size */
    int _base;
    DataT *_data;
    unsigned int alloc_size;	/* allocated size */
    void alloc(unsigned int size);
};

#endif /* _Array_h_ */
-------------- next part --------------
/*
 * LHash.cc --
 *	Linear search hash table implementation
 *
 */

#ifndef _LHash_cc_
#define _LHash_cc_

#ifndef lint
static char LHash_Copyright[] = "Copyright (c) 1995-2005 SRI International.  All Rights Reserved.";
static char LHash_RcsId[] = "@(#)$Header: /home/srilm/devel/dstruct/src/RCS/LHash.cc,v 1.46 2005/07/17 22:01:31 stolcke Exp $";
#endif

#include <iostream.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <new.h>

#include "LHash.h"

#undef INSTANTIATE_LHASH
#define INSTANTIATE_LHASH(KeyT, DataT) \
	template <> DataT *LHash< KeyT, DataT >::removedData = 0; \
	template class LHash< KeyT, DataT >; \
	template class LHashIter< KeyT, DataT >

#ifndef __GNUG__
template <class KeyT, class DataT>
DataT *LHash<KeyT,DataT>::removedData = 0;
#endif /* __GNUG__ */

const unsigned minHashBits = 3;		/* minimum no. bits for hashing
					 * tables smaller than this use linear
					 * search to save space */
const float fillRatio = 0.8;		/* fill ration at which the table is
					 * expanded and rehashed */

#define BODY(b)	((LHashBody<KeyT,DataT> *)b)

/*
 * Dump the entire hash array to cerr.  Unused slots are printed as "FREE".
 */
template <class KeyT, class DataT>
void
LHash<KeyT,DataT>::dump() const
{
    if (body) {
	unsigned maxEntries = hashSize(BODY(body)->maxBits);
	unsigned i;

	for (i = 0; i < maxEntries; i++) {
	    cerr << " " << i << ": ";
	    if (Map_noKeyP(BODY(body)->data[i].key)) {
		cerr << "FREE";
	    } else {
		cerr << BODY(body)->data[i].key
#ifdef DUMP_VALUES
			/* 
			 * Only certain types can be printed.
			 */
		     << "->" << BODY(body)->data[i].value
#endif /* DUMP_VALUES */
		     ;
	    }
	}
    } else {
	cerr << "EMPTY";
    }
    cerr << endl;
}

template <class KeyT, class DataT>
void
LHash<KeyT,DataT>::memStats(MemStats &stats) const
{
    stats.total += sizeof(*this);
    if (body) {
        unsigned maxEntries = hashSize(BODY(body)->maxBits);

	stats.total += sizeof(*BODY(body)) +
			sizeof(BODY(body)->data[0]) *
				(maxEntries - 1);
	stats.wasted += sizeof(BODY(body)->data[0]) *
				(maxEntries - BODY(body)->nEntries);
    }
}

/*
 * Compute the actual minimum size required for a given number of entries
 */
static inline unsigned
roundSize(unsigned size)
{
    if (size < hashSize(minHashBits)) {
	return size;
    } else {
	return (unsigned)((size + 1)/ fillRatio);
    }
}

template <class KeyT, class DataT>
void
LHash<KeyT,DataT>::alloc(unsigned size)
{
    unsigned maxBits, maxEntries;
    unsigned i;

    /*
     * round up to power of two
     */
    maxBits = 0;
    while (hashSize(maxBits) < size) {
	assert(maxBits < LHash_maxBitLimit);
	maxBits++;
    }

    maxEntries = hashSize(maxBits);

    //cerr << "LHash::alloc: current " << (body ? BODY(body)->nEntries : 0)
    //	 << ", requested " << size 
    //	 << ", allocating " << maxEntries << " (" << maxBits << ")\n";

    body = (LHashBody<KeyT,DataT> *)malloc(sizeof(*BODY(body)) +
			       (maxEntries - 1) * sizeof(BODY(body)->data[0]));
    assert(body != 0);

    BODY(body)->maxBits = maxBits;
    BODY(body)->nEntries = 0;

    for (i = 0; i < maxEntries; i++) {
	Map_noKey(BODY(body)->data[i].key);
    }
    //cerr << "memory for header = " <<
    //		((void *)&(BODY(body)->data[0]) - (void *)BODY(body)) << endl;
    //cerr << "memory per entry = " <<
    //		((void *)&(BODY(body)->data[3]) - (void *)&(BODY(body)->data[2])) << endl;
}

template <class KeyT, class DataT>
LHash<KeyT,DataT>::LHash(unsigned size)
    : body(0)
{
    if (size != 0) {
	/*
	 * determine actual initial size
	 */
	alloc(roundSize(size));
    }
}

template <class KeyT, class DataT>
void
LHash<KeyT,DataT>::clear(unsigned size)
{
    if (body) {
	unsigned maxEntries = hashSize(BODY(body)->maxBits);
	unsigned i;

	for (i = 0; i < maxEntries; i++) {
	    if (! Map_noKeyP(BODY(body)->data[i].key)) {
		Map_freeKey(BODY(body)->data[i].key);
	    }
	}
	free(body);
	body = 0;
    }
    if (size != 0) {
	alloc(roundSize(size));
    }
}

template <class KeyT, class DataT>
LHash<KeyT,DataT>::~LHash()
{
    clear(0);
}

template <class KeyT, class DataT>
LHash<KeyT,DataT>::LHash(const LHash<KeyT,DataT> &source)
    : body(0)
{
#ifdef DEBUG
    cerr << "warning: LHash copy constructor called\n";
#endif
    *this = source;
}

template <class KeyT, class DataT>
LHash<KeyT,DataT> &
LHash<KeyT,DataT>::operator= (const LHash<KeyT,DataT> &other)
{
#ifdef DEBUG
    cerr << "warning: LHash::operator= called\n";
#endif

    if (&other == this) {
	return *this;
    }

    /*
     * copy hash entries from old to new 
     */
    if (other.body) {
	unsigned maxEntries = hashSize(BODY(other.body)->maxBits);
	clear(maxEntries);

	for (unsigned i = 0; i < maxEntries; i++) {
	    KeyT thisKey = BODY(other.body)->data[i].key;

	    if (!Map_noKeyP(thisKey)) {
		/*
		 * Copy key
		 */
		BODY(body)->data[i].key = Map_copyKey(thisKey);

		/*
		 * Initialize data, required for = operator
		 */
		new (&(BODY(body)->data[i].value)) DataT;

		/*
		 * Copy data
		 */
		BODY(body)->data[i].value = BODY(other.body)->data[i].value;
	    }
	}
	BODY(body)->nEntries = BODY(other.body)->nEntries;
    } else {
	clear(0);
    }

    return *this;
}

/*
 * Returns index into data array that has the key which is either
 * equal to key, or indicates the place where key would be placed if it
 * occurred in the array.
 */
template <class KeyT, class DataT>
Boolean
LHash<KeyT,DataT>::locate(KeyT key, unsigned &index) const
{
    assert(!Map_noKeyP(key));

    if (body) {
	unsigned maxBits = BODY(body)->maxBits;
    	register MapEntry<KeyT,DataT> *data = BODY(body)->data;

	if (maxBits < minHashBits) {
	    /*
	     * Do a linear search
	     */
	    unsigned nEntries = BODY(body)->nEntries;
	    register unsigned i;

	    for (i = 0; i < nEntries; i++) {
		if (LHash_equalKey(data[i].key, key)) {
		    index = i;
		    return true;
		}
	    }
	    index = i;
	    return false;
	} else {
	    /*
	     * Do a hashed search
	     */
	    unsigned hash = LHash_hashKey(key, maxBits);
	    unsigned i; 

	    for (i = hash; ; i = (i + 1) & hashMask(maxBits))
	    {
		if (Map_noKeyP(data[i].key)) {
		    index = i;
		    return false;
		} else if (LHash_equalKey(data[i].key, key)) {
		    index = i;
		    return true;
		}
	    }
	}
    } else {
	return false;
    }
}

template <class KeyT, class DataT>
DataT *
LHash<KeyT,DataT>::find(KeyT key, Boolean &foundP) const
{
    unsigned index;

    if (foundP = locate(key, index)) {
	return &(BODY(body)->data[index].value);
    } else {
	return 0;
    }
}

template <class KeyT, class DataT>
KeyT
LHash<KeyT,DataT>::getInternalKey(KeyT key, Boolean &foundP) const
{
    unsigned index;
    static KeyT zeroKey;

    if (foundP = locate(key, index)) {
	return BODY(body)->data[index].key;
    } else {
	return zeroKey;
    }
}

template <class KeyT, class DataT>
DataT *
LHash<KeyT,DataT>::insert(KeyT key, Boolean &foundP)
{
    unsigned index;

    assert(!(Map_noKeyP(key)));

    /*
     * Make sure there is room for at least one entry
     */
    if (body == 0) {
	alloc(1);
    }

    if (foundP = locate(key, index)) {
	return &(BODY(body)->data[index].value);
    } else {
	unsigned maxEntries = hashSize(BODY(body)->maxBits);
	unsigned nEntries = BODY(body)->nEntries;

	/*
	 * Rehash table if necessary
	 */
	unsigned minSize = roundSize(nEntries + 1);

	if (minSize > maxEntries) {
	    LHashBody<KeyT,DataT> *oldBody = BODY(body);
	    unsigned i;

	    /*
	     * Since LHash_maxEntriesLimit is a power of two minus 1
	     * we need to check this only when the array is enlarged
	     */
	    assert(nEntries < LHash_maxEntriesLimit);

	    alloc(minSize);
	    BODY(body)->nEntries = nEntries;

	    if (BODY(body)->maxBits < minHashBits) {
		/*
		 * Just copy old entries to new storage, no reindexing
		 * required.
		 */
		memcpy(BODY(body)->data, oldBody->data,
			sizeof(oldBody->data[0]) * nEntries);
	    } else {
		/*
		 * Rehash
		 */
		for (i = 0; i < maxEntries; i++) {
		    KeyT key = oldBody->data[i].key;

		    if (! Map_noKeyP(key)) {
			(void)locate(key, index);
			memcpy(&(BODY(body)->data[index]), &(oldBody->data[i]),
							sizeof(oldBody->data[0]));
		    }
		}
	    }
	    free(oldBody);

	    /*
	     * Entries have been moved, so have to re-locate key
	     */
	    (void)locate(key, index);
	}

	BODY(body)->data[index].key = Map_copyKey(key);

	/*
	 * Initialize data to zero, but also call constructors, if any
	 */
	memset(&(BODY(body)->data[index].value), 0,
			sizeof(BODY(body)->data[index].value));
	new (&(BODY(body)->data[index].value)) DataT;

	BODY(body)->nEntries++;

	return &(BODY(body)->data[index].value);
    }
}

template <class KeyT, class DataT>
DataT *
LHash<KeyT,DataT>::remove(KeyT key, Boolean &foundP)
{
    unsigned index;

    /*
     * Allocate pseudo-static memory for removed objects (returned by
     * remove()). We cannot make this static because otherwise 
     * the destructor for DataT would be called on it at program exit.
     */
    if (removedData == 0) {
	removedData = (DataT *)malloc(sizeof(DataT));
	assert(removedData != 0);
    }

    if (foundP = locate(key, index)) {
	Map_freeKey(BODY(body)->data[index].key);
	Map_noKey(BODY(body)->data[index].key);
	memcpy(removedData, &BODY(body)->data[index].value, sizeof(DataT));

	if (BODY(body)->maxBits < minHashBits) {
	    /*
	     * Linear search mode -- Just move all entries above the
	     * the one removed to fill the gap.
	     */
	    unsigned nEntries = BODY(body)->nEntries;

	    memmove(&BODY(body)->data[index],
		    &BODY(body)->data[index+1],
		    (nEntries - index - 1) * sizeof(BODY(body)->data[0]));
	    Map_noKey(BODY(body)->data[nEntries - 1].key);
	} else {
	    /*
	     * The entry after the one being deleted could actually
	     * belong to a prior spot in the table, but was bounced forward due
	     * to a collision.   The invariant used in lookup is that
	     * all locations between the hash index and the actual index are
	     * filled.  Hence we examine all entries between the deleted
	     * position and the next free position as whether they need to
	     * be moved backward.
	     */
	    while (1) {
		unsigned newIndex;

		index = (index + 1) & hashMask(BODY(body)->maxBits);

		if (Map_noKeyP(BODY(body)->data[index].key)) {
		    break;
		}

		/* 
		 * If find returns false that means the deletion has
		 * introduced a hole in the table that would prevent
		 * us from finding the next entry. Luckily, find 
		 * also tells us where the hole is.  We move the 
		 * entry to its rightful place and continue filling
		 * the hole created by this move.
		 */
		if (!locate(BODY(body)->data[index].key, newIndex)) {
		    memcpy(&(BODY(body)->data[newIndex]),
			   &(BODY(body)->data[index]),
			   sizeof(BODY(body)->data[0]));
		    Map_noKey(BODY(body)->data[index].key);
		}
	    }
	}
	BODY(body)->nEntries--;
	return removedData;
    } else {
	return 0;
    }
}

#ifdef __GNUG__
static
#endif
int (*LHash_thisKeyCompare)();		/* used by LHashIter() to communicate
					 * with compareIndex() */
#ifdef __GNUG__
static
#endif
void *LHash_thisBody;			/* ditto */

template <class KeyT, class DataT>
void
LHashIter<KeyT,DataT>::sortKeys()
{
    /*
     * Store keys away and sort them to the user's orders.
     */
    unsigned maxEntries = hashSize(myLHashBody->maxBits);

    unsigned *sortedIndex = new unsigned[numEntries];
    assert(sortedIndex != 0);

    unsigned i;

    unsigned j = 0;
    for (i = 0; i < maxEntries; i++) {
	if (!Map_noKeyP(myLHashBody->data[i].key)) {
	    sortedIndex[j++] = i;
	}
    }
    assert(j == numEntries);

    /*
     * Due to the limitations of the qsort interface we have to 
     * pass extra information to compareIndex in these global
     * variables - yuck. 
     */
    LHash_thisKeyCompare = (int(*)())sortFunction;
    LHash_thisBody = myLHashBody;
    qsort(sortedIndex, numEntries, sizeof(*sortedIndex), compareIndex);

    /*
     * Save the keys for enumeration.  The reason we save the keys,
     * not the indices, is that indices may change during enumeration
     * due to deletions.
     */
    sortedKeys = new KeyT[numEntries];
    assert(sortedKeys != 0);

    for (i = 0; i < numEntries; i++) {
	sortedKeys[i] = myLHashBody->data[sortedIndex[i]].key;
    }

    delete [] sortedIndex;
}

template <class KeyT, class DataT>
LHashIter<KeyT,DataT>::LHashIter(const LHash<KeyT,DataT> &lhash,
				    int (*keyCompare)(KeyT, KeyT))
    : myLHashBody(BODY(lhash.body)), current(0),
      numEntries(lhash.numEntries()), sortFunction(keyCompare)
{
    /*
     * Note: we access the underlying LHash through the body pointer,
     * not the top-level LHash itself.  This allows the top-level object
     * to be moved without affecting an ongoing iteration.
     * XXX: This only works because
     * - it is illegal to insert while iterating
     * - deletions don't cause reallocation of the data
     */
    if (sortFunction && myLHashBody) {
	sortKeys();
    } else {
	sortedKeys = 0;
    }
}

/*
 * This is the callback function passed to qsort for comparing array
 * entries. It is passed the indices into the data array, which are
 * compared according to the user-supplied function applied to the 
 * keys found at those locations.
 */
template <class KeyT, class DataT>
int
LHashIter<KeyT,DataT>::compareIndex(const void *idx1, const void *idx2)
{
    return (*(compFnType)LHash_thisKeyCompare)
		    (BODY(LHash_thisBody)->data[*(unsigned *)idx1].key,
		     BODY(LHash_thisBody)->data[*(unsigned *)idx2].key);
}

template <class KeyT, class DataT>
LHashIter<KeyT,DataT>::~LHashIter()
{
    delete [] sortedKeys;
}


template <class KeyT, class DataT>
void 
LHashIter<KeyT,DataT>::init()
{
    delete [] sortedKeys;

    current = 0;

    {
	/*
	 * XXX: fake LHash object to access numEntries()
	 */
	LHash<KeyT,DataT> myLHash(0);

	myLHash.body = myLHashBody;
	numEntries = myLHash.numEntries();
	myLHash.body = 0;
    }

    if (sortFunction && myLHashBody) {
	sortKeys();
    } else {
	sortedKeys = 0;
    }
}

template <class KeyT, class DataT>
DataT *
LHashIter<KeyT,DataT>::next(KeyT &key)
{

    if (myLHashBody == 0) {
	return 0;
    } else {
	unsigned index;

	if (sortedKeys) {
	    /*
	     * Sorted enumeration -- advance to next key in sorted list
	     */
	    if (current == numEntries) {
		return 0;
	    }

	    /*
	     * XXX: fake LHash object to access locate()
	     */
	    LHash<KeyT,DataT> myLHash(0);

	    myLHash.body = myLHashBody;
	    myLHash.locate(sortedKeys[current++], index);
	    myLHash.body = 0;;
	} else {
	    /*
	     * Detect deletions by comparing old and current number of 
	     * entries.
	     * A legal deletion can only affect the current entry, so
	     * adjust the current position to reflect the next entry was
	     * moved.
	     */
	    if (myLHashBody->nEntries != numEntries) {
		assert(myLHashBody->nEntries == numEntries - 1);

		numEntries --;
		current --;
	    }

	    /*
	     * Random enumeration mode
	     */
	    unsigned maxBits = myLHashBody->maxBits;

	    if (maxBits < minHashBits) {
		/*
		 * Linear search mode - advance one to the next entry
		 */
		if (current == numEntries) {
		    return 0;
		}
	    } else {
		unsigned maxEntries = hashSize(maxBits);

		while (current < maxEntries &&
		       Map_noKeyP(myLHashBody->data[current].key))
		{
		    current++;
		}

		if (current == maxEntries) {
		    return 0;
		}
	    }

	    index = current ++;
	}

	key = myLHashBody->data[index].key;
	return &(myLHashBody->data[index].value);
    }
}

#undef BODY

#endif /* _LHash_cc_ */
-------------- next part --------------
/*
 * SArray.cc --
 *	Sorted array implementation
 *
 */

#ifndef _SArray_cc_
#define _SArray_cc_

#ifndef lint
static char SArray_Copyright[] = "Copyright (c) 1995-2005 SRI International.  All Rights Reserved.";
static char SArray_RcsId[] = "@(#)$Header: /home/srilm/devel/dstruct/src/RCS/SArray.cc,v 1.37 2005/07/17 22:01:31 stolcke Exp $";
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <new.h>

#include "SArray.h"

#undef INSTANTIATE_SARRAY
#define INSTANTIATE_SARRAY(KeyT, DataT) \
	template <> DataT *SArray< KeyT, DataT >::removedData = 0; \
	template class SArray< KeyT, DataT >; \
	template class SArrayIter< KeyT, DataT >

#ifndef __GNUG__
template <class KeyT, class DataT>
DataT *SArray<KeyT,DataT>::removedData = 0;
#endif /* __GNUG__ */

#define BODY(b)	((SArrayBody<KeyT,DataT> *)b)

const double growSize = 1.1;

template <class KeyT, class DataT>
void
SArray<KeyT,DataT>::dump() const
{
    if (body == 0) {
	cerr << "EMPTY" << endl;
    } else {
	unsigned nEntries = numEntries();

	cerr << "maxEntries = " << BODY(body)->maxEntries;
	cerr << " nEntries = " << nEntries;

	for (unsigned i = 0; i < nEntries; i++) {
	    cerr << " " << i << ": ";
	    cerr << BODY(body)->data[i].key
#ifdef DUMP_VALUES
			/* 
			 * Only certain types can be printed.
			 */
		     << "->" << BODY(body)->data[i].value
#endif /* DUMP_VALUES */
		     ;
	}
    }
}

template <class KeyT, class DataT>
void
SArray<KeyT,DataT>::memStats(MemStats &stats) const
{
    stats.total += sizeof(*this);

    if (body) {
	unsigned maxEntries = BODY(body)->maxEntries;

	stats.total += sizeof(*BODY(body)) +
			sizeof(BODY(body)->data[0]) *
				(maxEntries - 1);
	stats.wasted += sizeof(BODY(body)->data[0]) *
				(maxEntries - numEntries());
    }
}

template <class KeyT, class DataT>
void
SArray<KeyT,DataT>::alloc(unsigned size)
{
    body = (SArrayBody<KeyT,DataT> *)malloc(sizeof(*BODY(body)) +
			   (size - 1) * sizeof(BODY(body)->data[0]));
    assert(body != 0);

    BODY(body)->deleted = 0;
    BODY(body)->maxEntries = size;

    /*
     * Fill the array with dummy keys, marking the unused space
     */
    for (unsigned i = 0; i < size; i ++) {
	Map_noKey(BODY(body)->data[i].key);
    }
}

template <class KeyT, class DataT>
SArray<KeyT,DataT>::SArray(unsigned size)
    : body(0)
{
    if (size != 0) {
	alloc(size);
    }
}

template <class KeyT, class DataT>
void
SArray<KeyT,DataT>::clear(unsigned size)
{
    if (body) {
	unsigned maxEntries = BODY(body)->maxEntries;

	for (unsigned i = 0; i < maxEntries; i++) {
	    KeyT thisKey = BODY(body)->data[i].key;

	    if (Map_noKeyP(thisKey)) {
		break;
	    }
	    Map_freeKey(thisKey);
	}
	free(body);
	body = 0;
    }
    if (size != 0) {
	alloc(size);
    }
}

template <class KeyT, class DataT>
SArray<KeyT,DataT>::~SArray()
{
    clear(0);
}

template <class KeyT, class DataT>
unsigned
SArray<KeyT,DataT>::numEntries() const
{
    if (body == 0) {
	return 0;
    } else if (Map_noKeyP(BODY(body)->data[0].key)) {
	return 0;
    } else {
	/*
	 * Do a binary search for the end of the used memory
	 * lower always points to a filled entry
	 * upper always points to a free entry beyond the end of used entries
	 */
	unsigned lower, upper;

	lower = 0;			/* minimum index (inclusive) */
	upper = BODY(body)->maxEntries;	/* maximum index (exclusive) */

	while (lower + 1 < upper) {
	    unsigned middle = (lower + upper)/2;

	    if (Map_noKeyP(BODY(body)->data[middle].key)) {
		    upper = middle;
	    } else {
		    lower = middle;
	    }
	}

	/* lower + 1 == upper */
	return upper;
    }
}

template <class KeyT, class DataT>
SArray<KeyT,DataT>::SArray(const SArray<KeyT,DataT> &source)
    : body(0)
{
#ifdef DEBUG
    cerr << "warning: SArray copy constructor called\n";
#endif
    *this = source;
}

template <class KeyT, class DataT>
SArray<KeyT,DataT> &
SArray<KeyT,DataT>::operator= (const SArray<KeyT,DataT> &other)
{
#ifdef DEBUG
    cerr << "warning: SArray::operator= called\n";
#endif

    if (&other == this) {
	return *this;
    }

    /*
     * copy array entries from old to new 
     */
    if (other.body) {
	unsigned maxEntries = BODY(other.body)->maxEntries;
	clear(maxEntries);

	for (unsigned i = 0; i < maxEntries; i++) {
	    KeyT thisKey = BODY(other.body)->data[i].key;

	    if (Map_noKeyP(thisKey)) {
		break;
	    }

	    /*
	     * Copy key
	     */
	    BODY(body)->data[i].key = Map_copyKey(thisKey);

	    /*
	     * Initialize data, required for = operator
	     */
	    new (&(BODY(body)->data[i].value)) DataT;

	    /*
	     * Copy data
	     */
	    BODY(body)->data[i].value = BODY(other.body)->data[i].value;

	}
    } else {
	clear(0);
    }

    return *this;
}

/*
 * Returns index into data array that has the key which is either
 * equal to key, or indicates the place where key would be placed if it
 * occurred in the array.
 */
template <class KeyT, class DataT>
Boolean
SArray<KeyT,DataT>::locate(KeyT key, unsigned &index) const
{
    assert(!Map_noKeyP(key));

    if (body) {
	unsigned lower, upper;

	lower = 0;			/* minimum index (inclusive) */
	upper = BODY(body)->maxEntries;	/* maximum index (exclusive) */

	while (lower < upper) {
	    unsigned middle = (lower + upper)/2;

	    KeyT thisKey = BODY(body)->data[middle].key;

	    if (Map_noKeyP(thisKey)) {
		upper = middle;
		continue;
	    }
	    
	    int compare = SArray_compareKey(key, thisKey);

	    if (compare < 0) {
		    upper = middle;
	    } else if (compare > 0) {
		    lower = middle + 1;
	    } else {
		    index = middle;
		    return true;
	    }
	}

	/* we have lower == upper */
	if (lower == BODY(body)->maxEntries ||
	    Map_noKeyP(BODY(body)->data[lower].key) ||
	    SArray_compareKey(key, BODY(body)->data[lower].key) < 0)
	{
	    index = lower;
	} else  {
	    index = lower + 1;
	}
	return false;
    } else {
	return false;
    }
}

template <class KeyT, class DataT>
DataT *
SArray<KeyT,DataT>::find(KeyT key, Boolean &foundP) const
{
    unsigned index;

    if (foundP = locate(key, index)) {
	return &(BODY(body)->data[index].value);
    } else {
	return 0;
    }
}

template <class KeyT, class DataT>
KeyT
SArray<KeyT,DataT>::getInternalKey(KeyT key, Boolean &foundP) const
{
    unsigned index;
    static KeyT zeroKey;

    if (foundP = locate(key, index)) {
	return BODY(body)->data[index].key;
    } else {
	return zeroKey;
    }
}

template <class KeyT, class DataT>
DataT *
SArray<KeyT,DataT>::insert(KeyT key, Boolean &foundP)
{
    unsigned index;

    /*
     * Make sure there is room for at least one entry
     */
    if (body == 0) {
	alloc(1);
    }

    if (foundP = locate(key, index)) {
	return &(BODY(body)->data[index].value);
    } else {
	unsigned nEntries = numEntries();
	unsigned maxEntries = BODY(body)->maxEntries;

	/*
	 * Need to add an element.
	 * First, enlarge array if necessary
	 */
	if (nEntries == maxEntries) {
	    maxEntries = (int)(maxEntries * growSize);
	    if (maxEntries == nEntries) {
		maxEntries ++;
	    }
	    body = (SArrayBody<KeyT,DataT> *)realloc(body,
			sizeof(*BODY(body)) +
			   (maxEntries - 1) * sizeof(BODY(body)->data[0]));
	    assert(body != 0);

	    /*
	     * Fill new space with dummy keys
	     */
	    for (unsigned i = BODY(body)->maxEntries; i < maxEntries; i ++) {
		Map_noKey(BODY(body)->data[i].key);
	    }

	    BODY(body)->maxEntries = maxEntries;
	}

	memmove(&(BODY(body)->data[index + 1]),
		&(BODY(body)->data[index]),
		(nEntries - index) * sizeof(BODY(body)->data[0]));

	BODY(body)->data[index].key = Map_copyKey(key);

	/*
	 * Initialize data to zero, but also call constructors, if any
	 */
	memset(&(BODY(body)->data[index].value), 0,
		   sizeof(BODY(body)->data[index].value));
	new (&(BODY(body)->data[index].value)) DataT;

	return &(BODY(body)->data[index].value);
    }
}
  
template <class KeyT, class DataT>
DataT *
SArray<KeyT,DataT>::remove(KeyT key, Boolean &foundP)
{
    unsigned index;

    /*
     * Allocate pseudo-static memory for removed objects (returned by
     * remove()). We cannot make this static because otherwise 
     * the destructor for DataT would be called on it at program exit.
     */
    if (removedData == 0) {
	removedData = (DataT *)malloc(sizeof(DataT));
	assert(removedData != 0);
    }

    if (foundP = locate(key, index)) {
	unsigned nEntries = numEntries();

	Map_freeKey(BODY(body)->data[index].key);
	memcpy(removedData, &BODY(body)->data[index].value, sizeof(DataT));

	memmove(&(BODY(body)->data[index]),
		&(BODY(body)->data[index + 1]),
		(nEntries - index - 1) * sizeof(BODY(body)->data[0]));

	/*
	 * mark previous last slot as free
	 */
	Map_noKey(BODY(body)->data[nEntries-1].key);

	/*
	 * Indicate that deletion occurred
	 */
	BODY(body)->deleted = 1;

	return removedData;
    } else {
	return 0;
    }
}
  
#ifdef __GNUG__
static
#endif
int (*SArray_thisKeyCompare)();		/* used by SArrayIter() to communicate
					 * with compareIndex() */
#ifdef __GNUG__
static
#endif
void *SArray_thisBody;			/* ditto */


template <class KeyT, class DataT>
void
SArrayIter<KeyT,DataT>::sortKeys()
{
    /*
     * Store keys away and sort them to the user's orders.
     */
    unsigned *sortedIndex = new unsigned[numEntries];
    assert(sortedIndex != 0);

    unsigned i;
    for (i = 0; i < numEntries; i++) {
	sortedIndex[i] = i;
    }

    /*
     * Due to the limitations of the qsort interface we have to 
     * pass extra information to compareIndex in these global
     * variables - yuck. 
     */
    SArray_thisKeyCompare = (int (*)())sortFunction;
    SArray_thisBody = mySArrayBody;
    qsort(sortedIndex, numEntries, sizeof(*sortedIndex), compareIndex);

    /*
     * Save the keys for enumeration.  The reason we save the keys,
     * not the indices, is that indices may change during enumeration
     * due to deletions.
     */
    sortedKeys = new KeyT[numEntries];
    assert(sortedKeys != 0);

    for (i = 0; i < numEntries; i++) {
	sortedKeys[i] = mySArrayBody->data[sortedIndex[i]].key;
    }

    delete [] sortedIndex;
}

template <class KeyT, class DataT>
SArrayIter<KeyT,DataT>::SArrayIter(const SArray<KeyT,DataT> &sarray,
					int (*keyCompare)(KeyT, KeyT))
    : mySArrayBody(BODY(sarray.body)), current(0),
      numEntries(sarray.numEntries()), sortFunction(keyCompare)
{
    /*
     * Note: we access the underlying SArray through the body pointer,
     * not the top-level SArray itself.  This allows the top-level object
     * to be moved without affecting an ongoing iteration.
     * XXX: This only works because
     * - it is illegal to insert while iterating
     * - deletions don't cause reallocation of the data
     */
    if (sortFunction && mySArrayBody) {
	sortKeys();
    } else {
	sortedKeys = 0;
    }

    if (mySArrayBody) {
	mySArrayBody->deleted = 0;
    }
}

/*
 * This is the callback function passed to qsort for comparing array
 * entries. It is passed the indices into the data array, which are
 * compared according to the user-supplied function applied to the 
 * keys found at those locations.
 */
template <class KeyT, class DataT>
int
SArrayIter<KeyT,DataT>::compareIndex(const void *idx1, const void *idx2)
{
  return (*(compFnType)SArray_thisKeyCompare)
		(BODY(SArray_thisBody)->data[*(unsigned *)idx1].key,
		 BODY(SArray_thisBody)->data[*(unsigned *)idx2].key);
}

template <class KeyT, class DataT>
SArrayIter<KeyT,DataT>::~SArrayIter()
{
    delete [] sortedKeys;
}

template <class KeyT, class DataT>
void
SArrayIter<KeyT,DataT>::init()
{
    delete [] sortedKeys;
    sortedKeys = 0;

    current = 0;

    {
	/*
	 * XXX: fake SArray object to access numEntries()
	 */
	SArray<KeyT,DataT> mySArray(0);

	mySArray.body = mySArrayBody;
	numEntries = mySArray.numEntries();
	mySArray.body = 0;
    }

    if (mySArrayBody) {
	if (sortFunction) {
	    sortKeys();
	}
	mySArrayBody->deleted = 0;
    }
}

template <class KeyT, class DataT>
DataT *
SArrayIter<KeyT,DataT>::next(KeyT &key)
{
    if (mySArrayBody == 0) {
	return 0;
    } else {
	unsigned index;

	if (sortedKeys == 0) {
	    /*
	     * Detect deletion while iterating.
	     * A legal deletion can only affect the current entry, so
	     * adjust the current position to reflect the next entry was
	     * moved.
	     */
	    if (mySArrayBody->deleted) {
		numEntries --;
		current --;
	    }

	    if (current == numEntries) {
		return 0;
	    }

	    index = current++;
	} else {
	    if (current == numEntries) {
		return 0;
	    }

	    /*
	     * XXX: fake an SArray object to access locate()
	     */
	    SArray<KeyT,DataT> mySArray(0);

	    mySArray.body = mySArrayBody;
	    (void)mySArray.locate(sortedKeys[current++], index);
	    mySArray.body = 0;
	}
	mySArrayBody->deleted = 0;

	key = mySArrayBody->data[index].key;
	return &(mySArrayBody->data[index].value);
    }
}

#undef BODY

#endif /* _SArray_cc_ */


More information about the SRILM-User mailing list