| /* |
| Copyright 2005-2010 Intel Corporation. All Rights Reserved. |
| |
| This file is part of Threading Building Blocks. |
| |
| Threading Building Blocks is free software; you can redistribute it |
| and/or modify it under the terms of the GNU General Public License |
| version 2 as published by the Free Software Foundation. |
| |
| Threading Building Blocks is distributed in the hope that it will be |
| useful, but WITHOUT ANY WARRANTY; without even the implied warranty |
| of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with Threading Building Blocks; if not, write to the Free Software |
| Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| |
| As a special exception, you may use this file as part of a free software |
| library without restriction. Specifically, if other files instantiate |
| templates or use macros or inline functions from this file, or you compile |
| this file and link it with other files to produce an executable, this |
| file does not by itself cause the resulting executable to be covered by |
| the GNU General Public License. This exception does not however |
| invalidate any other reasons why the executable file might be covered by |
| the GNU General Public License. |
| */ |
| |
// Put tbb/atomic.h first, so if it is missing a prerequisite header, we find out about it.
// The tests here do *not* test for atomicity, just serial correctness.
| |
| #include "tbb/atomic.h" |
| #include "harness_assert.h" |
| #include <string.h> // memcmp |
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| // unary minus operator applied to unsigned type, result still unsigned |
| #pragma warning( push ) |
| #pragma warning( disable: 4310 ) |
| #endif |
| |
| //! Structure that holds an atomic<T> and some guard bytes around it. |
| template<typename T> |
| struct TestStruct { |
| T prefix; |
| tbb::atomic<T> counter; |
| T suffix; |
| TestStruct( T i ) : prefix(T(0x1234)), suffix(T(0x5678)) { |
| counter = i; |
| ASSERT( sizeof(*this)==3*sizeof(T), NULL ); |
| } |
| ~TestStruct() { |
| // Check for writes outside the counter. |
| ASSERT( prefix==T(0x1234), NULL ); |
| ASSERT( suffix==T(0x5678), NULL ); |
| } |
| }; |
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| #pragma warning( pop ) |
| #endif |
| |
| #if defined(__INTEL_COMPILER) |
| // reference to EBX in a function requiring stack alignment |
| #pragma warning( disable: 998 ) |
| #endif |
| |
| //! Test compare_and_swap template members of class atomic<T> for memory_semantics=M |
| template<typename T,tbb::memory_semantics M> |
| void TestCompareAndSwapAcquireRelease( T i, T j, T k ) { |
| ASSERT( i!=k, "values must be distinct" ); |
| // Test compare_and_swap that should fail |
| TestStruct<T> x(i); |
| T old = x.counter.template compare_and_swap<M>( j, k ); |
| ASSERT( old==i, NULL ); |
| ASSERT( x.counter==i, "old value not retained" ); |
| // Test compare and swap that should suceed |
| old = x.counter.template compare_and_swap<M>( j, i ); |
| ASSERT( old==i, NULL ); |
| ASSERT( x.counter==j, "value not updated?" ); |
| } |
| |
//! Test the plain compare_and_swap member; i, j, k must be different values
//! (strictly, only i!=k is required by the failing-CAS case below).
template<typename T>
void TestCompareAndSwap( T i, T j, T k ) {
    ASSERT( i!=k, "values must be distinct" );
    // Test compare_and_swap that should fail, because the comparand k differs from i
    TestStruct<T> x(i);
    T old = x.counter.compare_and_swap( j, k );
    ASSERT( old==i, NULL );
    ASSERT( x.counter==i, "old value not retained" );
    // Test compare and swap that should succeed
    old = x.counter.compare_and_swap( j, i );
    ASSERT( old==i, NULL );
    // Both branches assert the same condition; the split exists only to emit a
    // more specific diagnostic ("not updated" vs. "trashed") on failure.
    if( x.counter==i ) {
        ASSERT( x.counter==j, "value not updated?" );
    } else {
        ASSERT( x.counter==j, "value trashed" );
    }
    // Repeat with explicit acquire and release memory semantics.
    TestCompareAndSwapAcquireRelease<T,tbb::acquire>(i,j,k);
    TestCompareAndSwapAcquireRelease<T,tbb::release>(i,j,k);
}
| |
| //! memory_semantics variation on TestFetchAndStore |
| template<typename T, tbb::memory_semantics M> |
| void TestFetchAndStoreAcquireRelease( T i, T j ) { |
| ASSERT( i!=j, "values must be distinct" ); |
| TestStruct<T> x(i); |
| T old = x.counter.template fetch_and_store<M>( j ); |
| ASSERT( old==i, NULL ); |
| ASSERT( x.counter==j, NULL ); |
| } |
| |
| //! i and j must be different values |
| template<typename T> |
| void TestFetchAndStore( T i, T j ) { |
| ASSERT( i!=j, "values must be distinct" ); |
| TestStruct<T> x(i); |
| T old = x.counter.fetch_and_store( j ); |
| ASSERT( old==i, NULL ); |
| ASSERT( x.counter==j, NULL ); |
| TestFetchAndStoreAcquireRelease<T,tbb::acquire>(i,j); |
| TestFetchAndStoreAcquireRelease<T,tbb::release>(i,j); |
| } |
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| // conversion from <bigger integer> to <smaller integer>, possible loss of data |
| // the warning seems a complete nonsense when issued for e.g. short+=short |
| #pragma warning( push ) |
| #pragma warning( disable: 4244 ) |
| #endif |
| |
| //! Test fetch_and_add members of class atomic<T> for memory_semantics=M |
| template<typename T,tbb::memory_semantics M> |
| void TestFetchAndAddAcquireRelease( T i ) { |
| TestStruct<T> x(i); |
| T actual; |
| T expected = i; |
| |
| // Test fetch_and_add member template |
| for( int j=0; j<10; ++j ) { |
| actual = x.counter.fetch_and_add(j); |
| ASSERT( actual==expected, NULL ); |
| expected += j; |
| } |
| for( int j=0; j<10; ++j ) { |
| actual = x.counter.fetch_and_add(-j); |
| ASSERT( actual==expected, NULL ); |
| expected -= j; |
| } |
| |
| // Test fetch_and_increment member template |
| ASSERT( x.counter==i, NULL ); |
| actual = x.counter.template fetch_and_increment<M>(); |
| ASSERT( actual==i, NULL ); |
| ASSERT( x.counter==T(i+1), NULL ); |
| |
| // Test fetch_and_decrement member template |
| actual = x.counter.template fetch_and_decrement<M>(); |
| ASSERT( actual==T(i+1), NULL ); |
| ASSERT( x.counter==i, NULL ); |
| } |
| |
| //! Test fetch_and_add and related operators |
| template<typename T> |
| void TestFetchAndAdd( T i ) { |
| TestStruct<T> x(i); |
| T value; |
| value = ++x.counter; |
| ASSERT( value==T(i+1), NULL ); |
| value = x.counter++; |
| ASSERT( value==T(i+1), NULL ); |
| value = x.counter--; |
| ASSERT( value==T(i+2), NULL ); |
| value = --x.counter; |
| ASSERT( value==i, NULL ); |
| T actual; |
| T expected = i; |
| for( int j=-100; j<=100; ++j ) { |
| expected += j; |
| actual = x.counter += j; |
| ASSERT( actual==expected, NULL ); |
| } |
| for( int j=-100; j<=100; ++j ) { |
| expected -= j; |
| actual = x.counter -= j; |
| ASSERT( actual==expected, NULL ); |
| } |
| // Test fetch_and_increment |
| ASSERT( x.counter==i, NULL ); |
| actual = x.counter.fetch_and_increment(); |
| ASSERT( actual==i, NULL ); |
| ASSERT( x.counter==T(i+1), NULL ); |
| |
| // Test fetch_and_decrement |
| actual = x.counter.fetch_and_decrement(); |
| ASSERT( actual==T(i+1), NULL ); |
| ASSERT( x.counter==i, NULL ); |
| x.counter = i; |
| ASSERT( x.counter==i, NULL ); |
| |
| TestFetchAndAddAcquireRelease<T,tbb::acquire>(i); |
| TestFetchAndAddAcquireRelease<T,tbb::release>(i); |
| } |
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| #pragma warning( pop ) |
| #endif // warning 4244 is back |
| |
//! A type with unknown size, used to test atomic<IncompleteType*>.
class IncompleteType;

//! Overload selected for IncompleteType*: arithmetic is impossible, so nothing to test.
void TestFetchAndAdd( IncompleteType* ) {
    // There are no fetch-and-add operations on a IncompleteType*.
}
//! Overload selected for void*: arithmetic is impossible, so nothing to test.
void TestFetchAndAdd( void* ) {
    // There are no fetch-and-add operations on a void*.
}

//! Overload selected for bool: addition is not meaningful, so nothing to test.
void TestFetchAndAdd( bool ) {
    // There are no fetch-and-add operations on a bool.
}
| |
| template<typename T> |
| void TestConst( T i ) { |
| // Try const |
| const TestStruct<T> x(i); |
| ASSERT( memcmp( &i, &x.counter, sizeof(T) )==0, "write to atomic<T> broken?" );; |
| ASSERT( x.counter==i, "read of atomic<T> broken?" ); |
| } |
| |
| template<typename T> |
| void TestOperations( T i, T j, T k ) { |
| TestConst(i); |
| TestCompareAndSwap(i,j,k); |
| TestFetchAndStore(i,k); // Pass i,k instead of i,j, because callee requires two distinct values. |
| } |
| |
//! Forward declaration; defined at the end of this file. Runs the
//! multi-threaded fence and assignment tests for type T.
template<typename T>
void TestParallel( const char* name );

//! Set to true by any thread that detects an error in a parallel test.
bool ParallelError;

//! Used to check that atomic<T> forces proper alignment of its payload.
/** If atomic<T> is correctly aligned, padding after c makes this struct
    exactly twice the size of tbb::atomic<T>. */
template<typename T>
struct AlignmentChecker {
    char c;
    tbb::atomic<T> i;
};
| |
| #include "harness.h" |
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| // unary minus operator applied to unsigned type, result still unsigned |
| #pragma warning( push ) |
| #pragma warning( disable: 4146 ) |
| #endif |
| |
/** T is an integral type. Checks alignment of atomic<T>, then runs the serial
    operation tests over zero, +/-1, and every single-bit pattern (plus its
    complement and neighbors), and finally the parallel tests. */
template<typename T>
void TestAtomicInteger( const char* name ) {
    REMARK("testing atomic<%s>\n",name);
#if ( __linux__ && __TBB_x86_32 && __GNUC__==3 && __GNUC_MINOR__==3 ) || defined(__SUNPRO_CC)
    // gcc 3.3 has known problem for 32-bit Linux, so only warn if there is a problem.
    // SUNPRO_CC does have this problem as well
    if( sizeof(T)==8 ) {
        if( sizeof(AlignmentChecker<T>)!=2*sizeof(tbb::atomic<T>) ) {
            REPORT("Known issue: alignment for atomic<%s> is wrong with gcc 3.3 and sunCC 5.9 2008/01/28 for IA32\n",name);
        }
    } else
#endif /* ( __linux__ && __TBB_x86_32 && __GNUC__==3 && __GNUC_MINOR__==3 ) || defined(__SUNPRO_CC) */
    ASSERT( sizeof(AlignmentChecker<T>)==2*sizeof(tbb::atomic<T>), NULL );
    TestOperations<T>(0L,T(-T(1)),T(1));
    for( int k=0; k<int(sizeof(long))*8-1; ++k ) {
        TestOperations<T>(T(1L<<k),T(~(1L<<k)),T(1-(1L<<k)));
        // NOTE(review): -1L<<k left-shifts a negative value, which is formally
        // undefined behavior; kept as-is since the supported compilers of this
        // era behave arithmetically — confirm before enabling UB sanitizers.
        TestOperations<T>(T(-1L<<k),T(~(-1L<<k)),T(1-(-1L<<k)));
        TestFetchAndAdd<T>(T(-1L<<k));
    }
    TestParallel<T>( name );
}
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| #pragma warning( pop ) |
| #endif |
| |
| |
//! Simple aggregate used to test member access through atomic<Foo<T>*>.
template<typename T>
struct Foo {
    T x, y, z; // y and z have non-zero offsets, which TestIndirection relies on
};
| |
| |
| template<typename T> |
| void TestIndirection() { |
| Foo<T> item; |
| tbb::atomic<Foo<T>*> pointer; |
| pointer = &item; |
| for( int k=-10; k<=10; ++k ) { |
| // Test various syntaxes for indirection to fields with non-zero offset. |
| T value1=T(), value2=T(); |
| for( size_t j=0; j<sizeof(T); ++j ) { |
| *(char*)&value1 = char(k^j); |
| *(char*)&value2 = char(k^j*j); |
| } |
| pointer->y = value1; |
| (*pointer).z = value2; |
| T result1 = (*pointer).y; |
| T result2 = pointer->z; |
| ASSERT( memcmp(&value1,&result1,sizeof(T))==0, NULL ); |
| ASSERT( memcmp(&value2,&result2,sizeof(T))==0, NULL ); |
| } |
| } |
| |
| //! Test atomic<T*> |
| template<typename T> |
| void TestAtomicPointer() { |
| REMARK("testing atomic pointer (%d)\n",int(sizeof(T))); |
| T array[1000]; |
| TestOperations<T*>(&array[500],&array[250],&array[750]); |
| TestFetchAndAdd<T*>(&array[500]); |
| TestIndirection<T>(); |
| TestParallel<T*>( "pointer" ); |
| } |
| |
| //! Test atomic<Ptr> where Ptr is a pointer to a type of unknown size |
| template<typename Ptr> |
| void TestAtomicPointerToTypeOfUnknownSize( const char* name ) { |
| REMARK("testing atomic<%s>\n",name); |
| char array[1000]; |
| TestOperations<Ptr>((Ptr)(void*)&array[500],(Ptr)(void*)&array[250],(Ptr)(void*)&array[750]); |
| TestParallel<Ptr>( name ); |
| } |
| |
| void TestAtomicBool() { |
| REMARK("testing atomic<bool>\n"); |
| TestOperations<bool>(true,true,false); |
| TestOperations<bool>(false,false,true); |
| TestParallel<bool>( "bool" ); |
| } |
| |
//! Enumeration with zero, positive, and negative values, to stress sign handling.
enum Color {Red=0,Green=1,Blue=-1};

//! Test atomic<Color>, i.e. atomics over an enumeration type.
void TestAtomicEnum() {
    REMARK("testing atomic<Color>\n");
    TestOperations<Color>(Red,Green,Blue);
    TestParallel<Color>( "Color" );
}
| |
| #if !__TBB_FLOATING_POINT_BROKEN |
//! Test an atomic floating-point type T.
/** 0.5, 3.25 and 10.75 are all exactly representable in binary floating point,
    so the equality comparisons inside the tests are reliable. */
template<typename T>
void TestAtomicFloat( const char* name ) {
    REMARK("testing atomic<%s>\n", name );
    TestOperations<T>(0.5,3.25,10.75);
    TestParallel<T>( name );
}
| #endif /* !__TBB_FLOATING_POINT_BROKEN */ |
| |
//! Number of masked-CAS operations performed per thread per slot.
const int numMaskedOperations = 100000;
//! Number of threads, and of T-slots in each test space.
const int testSpaceSize = 8;
//! Distinct primes: thread k always adds prime[k], so each thread's contribution is distinguishable.
int prime[testSpaceSize] = {3,5,7,11,13,17,19,23};
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| // "possible loss of data" warning suppressed again |
| #pragma warning( push ) |
| #pragma warning( disable: 4244 ) |
| #endif |
| |
//! Per-thread body of the masked-CAS test.
/** Thread my_idx repeatedly applies __TBB_MaskedCompareAndSwap to its private
    uncontended slot — checking the failing, succeeding, and failing-again
    cases — and then to every element of the shared contended space, retrying
    each contended update until it succeeds. */
template<typename T>
class TestMaskedCAS_Body: NoAssign {
    T* test_space_uncontended; // one private T per thread, indexed by my_idx
    T* test_space_contended;   // shared by all threads
public:
    TestMaskedCAS_Body( T* _space1, T* _space2 ) : test_space_uncontended(_space1), test_space_contended(_space2) {}
    void operator()( int my_idx ) const {
        using tbb::internal::__TBB_MaskedCompareAndSwap;
        const T my_prime = T(prime[my_idx]); // per-thread increment
        T* const my_ptr = test_space_uncontended+my_idx;
        T old_value=0; // model of the private slot's value
        for( int i=0; i<numMaskedOperations; ++i, old_value+=my_prime ){
            T result;
            // Test uncontended case
            T new_value = old_value + my_prime;
            // The following CAS should always fail, because the comparand is off by one
            result = __TBB_MaskedCompareAndSwap<sizeof(T),T>(my_ptr,new_value,old_value-1);
            ASSERT(result!=old_value-1, "masked CAS succeeded while it should fail");
            ASSERT(result==*my_ptr, "masked CAS result mismatch with real value");
            // The following one should succeed
            result = __TBB_MaskedCompareAndSwap<sizeof(T),T>(my_ptr,new_value,old_value);
            ASSERT(result==old_value && *my_ptr==new_value, "masked CAS failed while it should succeed");
            // The following one should fail again, since the slot now holds new_value
            result = __TBB_MaskedCompareAndSwap<sizeof(T),T>(my_ptr,new_value,old_value);
            ASSERT(result!=old_value, "masked CAS succeeded while it should fail");
            ASSERT(result==*my_ptr, "masked CAS result mismatch with real value");
            // Test contended case
            for( int j=0; j<testSpaceSize; ++j ){
                // try adding my_prime until success
                T value;
                do {
                    value = test_space_contended[j];
                    result = __TBB_MaskedCompareAndSwap<sizeof(T),T>(test_space_contended+j,value+my_prime,value);
                } while( result!=value );
            }
        }
    }
};
| |
//! View a single intptr_t as an array of smaller T's (union type punning).
/** Lets the expected per-T results be assembled element-wise and then compared
    against the test space one machine word at a time. */
template<typename T>
struct intptr_as_array_of
{
    static const int how_many_Ts = sizeof(intptr_t)/sizeof(T);
    union {
        intptr_t result;        // word-sized view, compared against the test arrays
        T space[ how_many_Ts ]; // element-wise view used to build the expectation
    };
};
| |
| template<typename T> |
| intptr_t getCorrectUncontendedValue(int slot_idx) { |
| intptr_as_array_of<T> slot; |
| slot.result = 0; |
| for( int i=0; i<slot.how_many_Ts; ++i ) { |
| const T my_prime = T(prime[slot_idx*slot.how_many_Ts + i]); |
| for( int j=0; j<numMaskedOperations; ++j ) |
| slot.space[i] += my_prime; |
| } |
| return slot.result; |
| } |
| |
| template<typename T> |
| intptr_t getCorrectContendedValue() { |
| intptr_as_array_of<T> slot; |
| slot.result = 0; |
| for( int i=0; i<slot.how_many_Ts; ++i ) |
| for( int primes=0; primes<testSpaceSize; ++primes ) |
| for( int j=0; j<numMaskedOperations; ++j ) |
| slot.space[i] += prime[primes]; |
| return slot.result; |
| } |
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| #pragma warning( pop ) |
| #endif // warning 4244 is back again |
| |
//! Run the masked-CAS torture test for a type T smaller than a machine word.
/** Lays out two T-arrays inside intptr_t storage surrounded by zeroed canary
    words, hammers them from testSpaceSize threads, then verifies both the
    computed values and that the canaries stayed untouched. */
template<typename T>
void TestMaskedCAS() {
    REMARK("testing masked CAS<%d>\n",int(sizeof(T)));

    const int num_slots = sizeof(T)*testSpaceSize/sizeof(intptr_t);
    intptr_t arr1[num_slots+2]; // two more "canary" slots at boundaries
    intptr_t arr2[num_slots+2];
    for(int i=0; i<num_slots+2; ++i)
        arr2[i] = arr1[i] = 0;
    // Skip the leading canary word of each array.
    T* test_space_uncontended = (T*)(arr1+1);
    T* test_space_contended = (T*)(arr2+1);

    NativeParallelFor( testSpaceSize, TestMaskedCAS_Body<T>(test_space_uncontended, test_space_contended) );

    ASSERT( arr1[0]==0 && arr1[num_slots+1]==0 && arr2[0]==0 && arr2[num_slots+1]==0 , "adjacent memory was overwritten" );
    const intptr_t correctContendedValue = getCorrectContendedValue<T>();
    for(int i=0; i<num_slots; ++i) {
        // Compare whole words: uncontended slots differ per index; contended slots are uniform.
        ASSERT( arr1[i+1]==getCorrectUncontendedValue<T>(i), "unexpected value in an uncontended slot" );
        ASSERT( arr2[i+1]==correctContendedValue, "unexpected value in a contended slot" );
    }
}
| |
//! Dummy type whose size is exactly N bytes, for testing atomic<ArrayElement<N>*>.
template<unsigned N>
class ArrayElement {
    char item[N]; // never accessed; exists only to give the type size N
};
| |
//! Harness entry point: runs every serial and parallel atomic<T> test.
int TestMain () {
#if defined(__INTEL_COMPILER)||!defined(_MSC_VER)||_MSC_VER>=1400
    // 64-bit integral atomics; skipped on VC 7.1, which cannot compile them.
    TestAtomicInteger<unsigned long long>("unsigned long long");
    TestAtomicInteger<long long>("long long");
#else
    REPORT("Known issue: atomic<64-bits> does not compile with VC 7.1\n");
#endif /*defined(__INTEL_COMPILER)||!defined(_MSC_VER)||_MSC_VER>=1400 */
    TestAtomicInteger<unsigned long>("unsigned long");
    TestAtomicInteger<long>("long");
    TestAtomicInteger<unsigned int>("unsigned int");
    TestAtomicInteger<int>("int");
    TestAtomicInteger<unsigned short>("unsigned short");
    TestAtomicInteger<short>("short");
    TestAtomicInteger<signed char>("signed char");
    TestAtomicInteger<unsigned char>("unsigned char");
    TestAtomicInteger<char>("char");
    TestAtomicInteger<wchar_t>("wchar_t");
    TestAtomicInteger<size_t>("size_t");
    TestAtomicInteger<ptrdiff_t>("ptrdiff_t");
    // Pointer tests cover pointee sizes 1..8 to exercise scaled pointer arithmetic.
    TestAtomicPointer<ArrayElement<1> >();
    TestAtomicPointer<ArrayElement<2> >();
    TestAtomicPointer<ArrayElement<3> >();
    TestAtomicPointer<ArrayElement<4> >();
    TestAtomicPointer<ArrayElement<5> >();
    TestAtomicPointer<ArrayElement<6> >();
    TestAtomicPointer<ArrayElement<7> >();
    TestAtomicPointer<ArrayElement<8> >();
    TestAtomicPointerToTypeOfUnknownSize<IncompleteType*>( "IncompleteType*" );
    TestAtomicPointerToTypeOfUnknownSize<void*>( "void*" );
    TestAtomicBool();
    TestAtomicEnum();
#if !__TBB_FLOATING_POINT_BROKEN
    TestAtomicFloat<float>("float");
    TestAtomicFloat<double>("double");
#endif /* !__TBB_FLOATING_POINT_BROKEN */
    ASSERT( !ParallelError, NULL );
    // Masked CAS emulation is only relevant for types smaller than a machine word.
    TestMaskedCAS<unsigned char>();
    TestMaskedCAS<unsigned short>();
    return Harness::Done;
}
| |
//! A flag/message pair used by the fence test; one instance per thread.
template<typename T>
struct FlagAndMessage {
    //! 0 if message not set yet, 1 if message is set.
    tbb::atomic<T> flag;
    /** Force flag and message to be on distinct cache lines for machines with cache line size <= 4096 bytes */
    // NOTE(review): the pad is 4096/sizeof(T) chars, i.e. only 4096 *bytes* when
    // sizeof(T)==1 — confirm the intended separation for larger T.
    char pad[4096/sizeof(T)];
    //! Non-zero if message is ready
    T message;
};
| |
// A special template function used for summation.
// Actually it is only necessary because of its specialization for void*
// (and the other pointer/bool specializations below).
template<typename T>
T special_sum(intptr_t arg1, intptr_t arg2) {
    // Convert the first operand to T before adding, then convert the sum back.
    T partial = (T)arg1;
    return (T)(partial + arg2);
}
| |
// The specialization for IncompleteType* is required
// because pointer arithmetic (+) is impossible with IncompleteType*;
// the sum is instead formed on the integral arguments and cast to the pointer type.
template<>
IncompleteType* special_sum<IncompleteType*>(intptr_t arg1, intptr_t arg2) {
    return (IncompleteType*)(arg1 + arg2);
}
| |
// The specialization for void* is required
// because pointer arithmetic (+) is impossible with void*;
// the sum is instead formed on the integral arguments and cast to void*.
template<>
void* special_sum<void*>(intptr_t arg1, intptr_t arg2) {
    return (void*)(arg1 + arg2);
}
| |
// The specialization for bool is required to shut up gratuitous compiler warnings,
// because some compilers warn about casting int to bool.
// Result is true iff the "sum" of the two arguments is non-zero.
template<>
bool special_sum<bool>(intptr_t arg1, intptr_t arg2) {
    return ((arg1!=0) + arg2)!=0;
}
| |
//! Volatile so the compiler cannot constant-fold (one-1) to 0 inside COMPLICATED_ZERO.
volatile int One = 1;
| |
//! Per-thread body of the load/store fence test.
/** p threads pass a "message" round-robin: each thread spins until its flag
    becomes non-zero, validates flag and message, clears them, and hands the
    message to the next thread. A missing acquire on the flag load or release
    on the message store shows up as an inconsistent observation. */
template<typename T>
class HammerLoadAndStoreFence: NoAssign {
    FlagAndMessage<T>* fam;  // array of p flag/message pairs
    const int n;             // number of round-robin passes per thread
    const int p;             // number of threads
    const int trial;         // selects the expression pattern; see COMPLICATED_ZERO
    const char* name;        // type name, for error reporting
    mutable T accum;         // sink forcing the compiler to keep speculative reads of message
public:
    HammerLoadAndStoreFence( FlagAndMessage<T>* fam_, int n_, int p_, const char* name_, int trial_ ) : fam(fam_), n(n_), p(p_), trial(trial_), name(name_) {}
    void operator()( int k ) const {
        int one = One;
        FlagAndMessage<T>* s = fam+k;             // this thread's slot
        FlagAndMessage<T>* s_next = fam + (k+1)%p; // slot of the next thread in the ring
        for( int i=0; i<n; ++i ) {
            // The inner for loop is a spin-wait loop, which is normally considered very bad style.
            // But we must use it here because we are interested in examining subtle hardware effects.
            for(unsigned short cnt=1; ; ++cnt) {
                if( !cnt ) // to help 1-core systems complete the test, yield every 2^16 iterations
                    __TBB_Yield();
                // Compilers typically generate non-trivial sequence for division by a constant.
                // The expression here is dependent on the loop index i, so it cannot be hoisted.
#define COMPLICATED_ZERO (i*(one-1)/100)
                // Read flag and then the message
                T flag, message;
                if( trial&1 ) {
                    // COMPLICATED_ZERO here tempts compiler to hoist load of message above reading of flag.
                    flag = (s+COMPLICATED_ZERO)->flag;
                    message = s->message;
                } else {
                    flag = s->flag;
                    message = s->message;
                }
                if( flag ) {
                    if( flag!=(T)-1 ) {
                        REPORT("ERROR: flag!=(T)-1 k=%d i=%d trial=%x type=%s (atomicity problem?)\n", k, i, trial, name );
                        ParallelError = true;
                    }
                    if( message!=(T)-1 ) {
                        REPORT("ERROR: message!=(T)-1 k=%d i=%d trial=%x type=%s (memory fence problem?)\n", k, i, trial, name );
                        ParallelError = true;
                    }
                    s->message = T(0);
                    s->flag = T(0);
                    // Set message and then the flag
                    if( trial&2 ) {
                        // COMPLICATED_ZERO here tempts compiler to sink store below setting of flag
                        s_next->message = special_sum<T>(-1, COMPLICATED_ZERO);
                        s_next->flag = (T)-1;
                    } else {
                        s_next->message = (T)-1;
                        s_next->flag = (T)-1;
                    }
                    break;
                } else {
                    // Force compiler to use message anyway, so it cannot sink read of s->message below the if.
                    accum = message;
                }
            }
        }
    }
};
| |
//! Test that atomic<T> has acquire semantics for loads and release semantics for stores.
/** Test performs round-robin passing of message among p processors,
    where p goes from MinThread to MaxThread. */
template<typename T>
void TestLoadAndStoreFences( const char* name ) {
    for( int p=MinThread<2 ? 2 : MinThread; p<=MaxThread; ++p ) {
        FlagAndMessage<T>* fam = new FlagAndMessage<T>[p];
        // Each of four trials exercises a slightly different expression pattern within the test.
        // See occurrences of COMPLICATED_ZERO for details.
        for( int trial=0; trial<4; ++trial ) {
            // Zero all slots, then seed slot 0 to start the relay.
            memset( fam, 0, p*sizeof(FlagAndMessage<T>) );
            fam->message = (T)-1;
            fam->flag = (T)-1;
            NativeParallelFor( p, HammerLoadAndStoreFence<T>( fam, 100, p, name, trial ) );
            // After the run the message must be back in slot 0 and all other slots clear.
            for( int k=0; k<p; ++k ) {
                ASSERT( fam[k].message==(k==0 ? (T)-1 : 0), "incomplete round-robin?" );
                ASSERT( fam[k].flag==(k==0 ? (T)-1 : 0), "incomplete round-robin?" );
            }
        }
        delete[] fam;
    }
}
| |
//! Sparse set of values of integral type T.
/** Set is designed so that if a value is read or written non-atomically,
    the resulting intermediate value is likely to not be a member of the set. */
template<typename T>
class SparseValueSet {
    T factor; // multiplier whose multiples form the set
public:
    SparseValueSet() {
        // Compute factor such that:
        // 1. It has at least one 1 in most of its bytes.
        // 2. The bytes are typically different.
        // 3. When multiplied by any value <=127, the product does not overflow.
        factor = T(0);
        for( unsigned i=0; i<sizeof(T)*8-7; i+=7 )
            factor = T(factor | T(1)<<i);
    }
    //! Get ith member of set
    T get( int i ) const {
        // Create multiple of factor. The & prevents overflow of the product.
        return T((i&0x7F)*factor);
    }
    //! True if set contains x
    bool contains( T x ) const {
        // True if x is an exact multiple of factor, as every value produced by get() is.
        return (x%factor)==0;
    }
};
| |
//! Specialization for pointer types. The pointers are random and should not be dereferenced.
/** Delegates to the integral set through ptrdiff_t reinterpretation. */
template<typename T>
class SparseValueSet<T*> {
    SparseValueSet<ptrdiff_t> my_set; // underlying integral value set
public:
    //! Get ith member, reinterpreted as a (non-dereferenceable) pointer.
    T* get( int i ) const {return reinterpret_cast<T*>(my_set.get(i));}
    //! True if the integral image of x belongs to the underlying set.
    bool contains( T* x ) const {return my_set.contains(reinterpret_cast<ptrdiff_t>(x));}
};
| |
//! Specialization for bool.
/** Checking bool for atomic read/write is pointless in practice, because
    there is no way to *not* atomically read or write a bool value. */
template<>
class SparseValueSet<bool> {
public:
    //! Members alternate between false and true.
    bool get( int i ) const {return i&1;}
    //! Every bool value is trivially a member.
    bool contains( bool ) const {return true;}
};
| |
| #if !__TBB_FLOATING_POINT_BROKEN |
| |
| #if _MSC_VER==1500 && !defined(__INTEL_COMPILER) |
| // VS2008/VC9 seems to have an issue; limits pull in math.h |
| #pragma warning( push ) |
| #pragma warning( disable: 4985 ) |
| #endif |
| #include <limits> /* Need std::numeric_limits */ |
| #if _MSC_VER==1500 && !defined(__INTEL_COMPILER) |
| #pragma warning( pop ) |
| #endif |
| |
//! Commonality inherited by specializations for floating-point types.
/** Members are 0 and the reciprocals 1/n for n in [2,128]; contains() recognizes
    a member by inverting it and bounding the round-trip error by epsilon. */
template<typename T>
class SparseFloatSet: NoAssign {
    const T epsilon; // machine epsilon for T; bound on acceptable round-off
public:
    SparseFloatSet() : epsilon(std::numeric_limits<T>::epsilon()) {}
    //! Get ith member: 0 for i==0, otherwise 1/((i&0x7F)+1).
    T get( int i ) const {
        return i==0 ? T(0) : 1/T((i&0x7F)+1);
    }
    //! True if x could have been produced by get().
    bool contains( T x ) const {
        if( x==T(0) ) {
            return true;
        } else {
            // Recover the candidate denominator by rounding 1/x to the nearest integer.
            int j = int(1/x+T(0.5));
            if( 0<j && j<=128 ) {
                T error = x*T(j)-T(1);
                // In the calculation above, if x was indeed generated by method get, the error should be
                // at most epsilon, because x is off by at most 1/2 ulp from its infinitely precise value,
                // j is exact, and the multiplication incurs at most another 1/2 ulp of round-off error.
                if( -epsilon<=error && error<=epsilon ) {
                    return true;
                } else {
                    REPORT("Warning: excessive floating-point error encountered j=%d x=%.15g error=%.15g\n",j,x,error);
                }
            }
            return false;
        }
    };
};
| |
//! atomic<float> validation uses the floating-point sparse set.
template<>
class SparseValueSet<float>: public SparseFloatSet<float> {};

//! atomic<double> validation uses the floating-point sparse set.
template<>
class SparseValueSet<double>: public SparseFloatSet<double> {};
| |
| #endif /* !__TBB_FLOATING_POINT_BROKEN */ |
| |
//! Two-thread body: thread 0 writes sparse-set members into x; thread 1 reads and validates.
/** If loads or stores of atomic<T> were torn, the reader would observe a value
    outside the sparse set and report an error. */
template<typename T>
class HammerAssignment: NoAssign {
    tbb::atomic<T>& x;     // shared variable under test
    const char* name;      // type name, for diagnostics
    SparseValueSet<T> set; // the only values that are legal to observe
public:
    HammerAssignment( tbb::atomic<T>& x_, const char* name_ ) : x(x_), name(name_) {}
    void operator()( int k ) const {
        const int n = 1000000;
        if( k ) {
            tbb::atomic<T> z;
            AssertSameType( z=x, z ); // Check that return type from assignment is correct
            for( int i=0; i<n; ++i ) {
                // Read x atomically into z.
                z = x;
                if( !set.contains(z) ) {
                    REPORT("ERROR: assignment of atomic<%s> is not atomic\n", name);
                    ParallelError = true;
                    return;
                }
            }
        } else {
            tbb::atomic<T> y;
            for( int i=0; i<n; ++i ) {
                // Get pseudo-random value.
                y = set.get(i);
                // Write y atomically into x.
                x = y;
            }
        }
    }
};
| |
// Compile-time check that a class method has the required signature.
// Intended to check the assignment operator of tbb::atomic: passing
// &atomic<T>::operator= fails to compile unless the operator has the
// canonical T& (T::*)(const T&) form.
template<typename T> void TestAssignmentSignature( T& (T::*)(const T&) ) {}
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| // Suppress "conditional expression is constant" warning. |
| #pragma warning( push ) |
| #pragma warning( disable: 4127 ) |
| #endif |
| |
//! Test operator= of atomic<T> for atomicity, including misaligned placement on IA-32.
template<typename T>
void TestAssignment( const char* name ) {
    // Compile-time check of the assignment operator's signature.
    TestAssignmentSignature( &tbb::atomic<T>::operator= );
    tbb::atomic<T> x;
    x = T(0);
    // One writer thread and one reader thread hammer x concurrently.
    NativeParallelFor( 2, HammerAssignment<T>( x, name ) );
#if __TBB_x86_32 && (__linux__ || __FreeBSD__ || _WIN32)
    if( sizeof(T)==8 ) {
        // Some compilers for IA-32 fail to provide 8-byte alignment of objects on the stack,
        // even if the object specifies 8-byte alignment.  On such platforms, the IA-32 implementation
        // of atomic<long long> and atomic<unsigned long long> use different tactics depending upon
        // whether the object is properly aligned or not.  The following abusive test ensures that we
        // cover both the proper and improper alignment cases, one with the x above and the other with
        // the y below, perhaps not respectively.

        // Allocate space big enough to always contain 8-byte locations that are aligned and misaligned.
        char raw_space[15];
        // Set delta to 0 if x is aligned, 4 otherwise.
        uintptr_t delta = ((reinterpret_cast<uintptr_t>(&x)&7) ? 0 : 4);
        // y crosses 8-byte boundary if and only if x does not cross.
        tbb::atomic<T>& y = *reinterpret_cast<tbb::atomic<T>*>((reinterpret_cast<uintptr_t>(&raw_space[7+delta])&~7u) - delta);
        // Assertion checks that y really did end up somewhere inside "raw_space".
        ASSERT( raw_space<=reinterpret_cast<char*>(&y), "y starts before raw_space" );
        ASSERT( reinterpret_cast<char*>(&y+1) <= raw_space+sizeof(raw_space), "y starts after raw_space" );
        y = T(0);
        NativeParallelFor( 2, HammerAssignment<T>( y, name ) );
    }
#endif /* __TBB_x86_32 && (__linux__ || __FreeBSD__ || _WIN32) */
}
| |
| #if _MSC_VER && !defined(__INTEL_COMPILER) |
| #pragma warning( pop ) |
| #endif |
| |
//! Entry point for the multi-threaded atomic<T> tests.
/** Declared near the top of this file; defined last so it can use the helpers above.
    Runs the fence (message-passing) test and the assignment-atomicity test. */
template<typename T>
void TestParallel( const char* name ) {
    TestLoadAndStoreFences<T>(name);
    TestAssignment<T>(name);
}