#ifndef __HIPLOCKSBARRIERATOMIC_H__
#define __HIPLOCKSBARRIERATOMIC_H__

#include "hip/hip_runtime.h"
#include "hipLocks.h"

inline __device__ void hipBarrierAtomicSub(unsigned int * globalBarr,
                                           int * done,
                                           // numBarr represents the number of
                                           // WGs going to the barrier
                                           const unsigned int numBarr,
                                           // initial backoff delay; passed by
                                           // value, so increases do not
                                           // persist across calls
                                           int backoff,
                                           const bool isMasterThread)
{
  __syncthreads();
  if (isMasterThread)
  {
    *done = 0;

    // the atomic increment acts as a store release, need TF to enforce ordering
    __threadfence();
    // atomicInc effectively adds 1 to atomic for each WG that's part of the
    // global barrier.
    /*
      HIP currently doesn't generate the correct code for atomicInc here,
      so replace it with an atomicAdd of 1 and assume no wraparound occurs.
    */
    //atomicInc(globalBarr, 0x7FFFFFFF);
    atomicAdd(globalBarr, 1);
  }
  __syncthreads();

  while (!*done)
  {
    if (isMasterThread)
    {
      /*
        For the tree barrier we expect only 1 WG from each CU to enter the
        global barrier. Since we are assuming an equal amount of work for all
        CUs, we can use the # of WGs reaching the barrier for the compare value
        here. Once the atomic's value == numBarr, reset the value to 0 and
        proceed because all of the WGs have reached the global barrier. The WG
        whose CAS performs the reset reads back numBarr; on its next iteration
        it, like every other waiting WG, reads back 0, which is the signal
        that the barrier has been passed.
      */
      if (atomicCAS(globalBarr, numBarr, 0) == 0) {
        // atomicCAS acts as a load acquire, need TF to enforce ordering
        __threadfence();
        *done = 1;
      }
      else { // increase backoff to avoid repeatedly hammering global barrier
        // (capped) exponential backoff; the mask assumes MAX_BACKOFF is a
        // power of 2
        backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
      }
    }
    __syncthreads();

    // do exponential backoff to reduce the number of times we pound the global
    // barrier
    if (!*done) {
      //sleepFunc(backoff);
      // busy-wait delay; j is volatile so the compiler cannot optimize the
      // loop away entirely
      for (volatile int j = 0; j < backoff; ++j) { ; }
      __syncthreads();
    }
  }
}

inline __device__ void hipBarrierAtomic(unsigned int * barrierBuffers,
                                        // numBarr represents the number of
                                        // WGs going to the barrier
                                        const unsigned int numBarr,
                                        const bool isMasterThread)
{
  unsigned int * atomic1 = barrierBuffers;
  unsigned int * atomic2 = atomic1 + 1;
  __shared__ int done1, done2;
  __shared__ int backoff;

  if (isMasterThread) {
    backoff = 1;
  }
  __syncthreads();

  hipBarrierAtomicSub(atomic1, &done1, numBarr, backoff, isMasterThread);
  // second barrier is necessary to provide a facsimile of a sense-reversing
  // barrier
  hipBarrierAtomicSub(atomic2, &done2, numBarr, backoff, isMasterThread);
}
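
/*
  Host-side setup sketch (an illustrative assumption, not part of the original
  API): hipBarrierAtomic expects barrierBuffers to hold two counters (atomic1
  and atomic2) that start at 0 in device memory. The helper name below is
  hypothetical.
*/
static inline hipError_t exampleAllocBarrierBuffers(unsigned int ** barrierBuffers)
{
  // one counter per sub-barrier (atomic1 and atomic2)
  hipError_t err = hipMalloc((void **)barrierBuffers, 2 * sizeof(unsigned int));
  if (err != hipSuccess) { return err; }
  // zero both counters so the first arriving WG increments from 0
  return hipMemset(*barrierBuffers, 0, 2 * sizeof(unsigned int));
}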

// does a local barrier amongst all of the WGs on a CU
inline __device__ void hipBarrierAtomicSubLocal(unsigned int * perCUBarr,
                                                int * done,
                                                const unsigned int numWGs_thisCU,
                                                const bool isMasterThread)
{
  __syncthreads();
  if (isMasterThread)
  {
    *done = 0;
    // the atomic increment acts as a store release, need TF to enforce
    // ordering locally
    __threadfence_block();
    /*
      atomicInc effectively adds 1 to atomic for each WG that's part of the
      barrier. For the local barrier, this requires using the per-CU
      locations.
    */
    /*
      HIP currently doesn't generate the correct code for atomicInc here,
      so replace it with an atomicAdd of 1 and assume no wraparound occurs.
    */
    //atomicInc(perCUBarr, 0x7FFFFFFF);
    atomicAdd(perCUBarr, 1);
  }
  __syncthreads();

  while (!*done)
  {
    if (isMasterThread)
    {
      /*
        Once all of the WGs on this CU have incremented the value at atomic,
        then the value (for the local barrier) should be equal to the # of WGs
        on this CU. Once that is true, reset the atomic to 0 and proceed
        because all of the WGs on this CU have reached the local barrier. As
        in the global barrier, every WG eventually reads back 0 from the CAS,
        which is the signal to proceed.
      */
      if (atomicCAS(perCUBarr, numWGs_thisCU, 0) == 0) {
        // atomicCAS acts as a load acquire, need TF to enforce ordering
        // locally
        __threadfence_block();
        *done = 1;
      }
    }
    __syncthreads();
  }
}

// does a local barrier amongst all of the WGs on a CU
inline __device__ void hipBarrierAtomicLocal(unsigned int * perCUBarrierBuffers,
                                             const unsigned int cuID,
                                             const unsigned int numWGs_thisCU,
                                             const bool isMasterThread,
                                             const int MAX_BLOCKS)
{
  // each CU has MAX_BLOCKS locations in perCUBarrierBuffers, so this CU's
  // locations start at perCUBarrierBuffers[cuID * MAX_BLOCKS]
  unsigned int * atomic1 = perCUBarrierBuffers + (cuID * MAX_BLOCKS);
  unsigned int * atomic2 = atomic1 + 1;
  __shared__ int done1, done2;

  hipBarrierAtomicSubLocal(atomic1, &done1, numWGs_thisCU, isMasterThread);
  // second barrier is necessary to approximate a sense-reversing barrier
  hipBarrierAtomicSubLocal(atomic2, &done2, numWGs_thisCU, isMasterThread);
}
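
/*
  Host-side setup sketch (an illustrative assumption, not part of the original
  API): the per-CU buffers are laid out as numCU contiguous groups of
  MAX_BLOCKS counters, matching the cuID * MAX_BLOCKS indexing above. The
  helper name and the numCU parameter are hypothetical.
*/
static inline hipError_t exampleAllocPerCUBarrierBuffers(unsigned int ** perCUBarrierBuffers,
                                                         const unsigned int numCU,
                                                         const int MAX_BLOCKS)
{
  const size_t numBytes = (size_t)numCU * MAX_BLOCKS * sizeof(unsigned int);
  hipError_t err = hipMalloc((void **)perCUBarrierBuffers, numBytes);
  if (err != hipSuccess) { return err; }
  // zero all counters so each CU's first arriving WG increments from 0
  return hipMemset(*perCUBarrierBuffers, 0, numBytes);
}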

/*
  Helper function for joining the atomic tree barrier: WGs on the same CU
  synchronize locally first, then one WG per CU joins the global barrier.
*/
__attribute__((always_inline)) __device__ void joinBarrier_helper(unsigned int * barrierBuffers,
                                                                  unsigned int * perCUBarrierBuffers,
                                                                  const unsigned int numBlocksAtBarr,
                                                                  const int cuID,
                                                                  const int perCU_blockID,
                                                                  const int numWGs_perCU,
                                                                  const bool isMasterThread,
                                                                  const int MAX_BLOCKS) {
  if (numWGs_perCU > 1) {
    hipBarrierAtomicLocal(perCUBarrierBuffers, cuID, numWGs_perCU,
                          isMasterThread, MAX_BLOCKS);

    // only 1 WG per CU needs to do the global barrier since we synchronized
    // the WGs locally first
    if (perCU_blockID == 0) {
      hipBarrierAtomic(barrierBuffers, numBlocksAtBarr, isMasterThread);
    }

    // all WGs on this CU do a local barrier to ensure the global barrier has
    // been reached before any of them proceeds
    hipBarrierAtomicLocal(perCUBarrierBuffers, cuID, numWGs_perCU,
                          isMasterThread, MAX_BLOCKS);
  } else { // if only 1 WG on the CU, no need for the local barriers
    hipBarrierAtomic(barrierBuffers, numBlocksAtBarr, isMasterThread);
  }
}
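
/*
  Usage sketch (a minimal example under stated assumptions, not part of the
  original API): a kernel that joins the tree barrier once. It assumes WGs
  are distributed round-robin across CUs by block ID and that gridDim.x is a
  multiple of numCU, so every CU holds the same number of WGs; the kernel
  name and the numCU parameter are hypothetical.
*/
__global__ void exampleTreeBarrierKernel(unsigned int * barrierBuffers,
                                         unsigned int * perCUBarrierBuffers,
                                         const unsigned int numCU,
                                         const int MAX_BLOCKS)
{
  const bool isMasterThread = ((threadIdx.x == 0) && (threadIdx.y == 0) &&
                               (threadIdx.z == 0));
  // assumed round-robin mapping of WGs to CUs
  const int cuID = blockIdx.x % numCU;
  // this WG's index among the WGs co-located on its CU
  const int perCU_blockID = blockIdx.x / numCU;
  // assumes gridDim.x % numCU == 0, so the per-CU WG count is uniform
  const int numWGs_perCU = gridDim.x / numCU;
  // one WG per CU participates in the global phase of the tree barrier
  const unsigned int numBlocksAtBarr = numCU;

  // ... phase 1 work ...

  joinBarrier_helper(barrierBuffers, perCUBarrierBuffers, numBlocksAtBarr,
                     cuID, perCU_blockID, numWGs_perCU, isMasterThread,
                     MAX_BLOCKS);

  // ... phase 2 work, globally ordered after phase 1 ...
}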

#endif