#ifndef __HIPLOCKSMUTEXSPIN_H__
#define __HIPLOCKSMUTEXSPIN_H__

#include "hip/hip_runtime.h"

// Creating a spin mutex just records the mutex's index in the handle.
inline __host__ hipError_t hipMutexCreateSpin(hipMutex_t * const handle,
                                              const int mutexNumber)
{
  *handle = mutexNumber;
  return hipSuccess;
}
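
// Illustrative host-side setup (a sketch, not part of the original API): how
// the lock buffer and a handle might be initialized. The layout of one
// unsigned int slot per mutex per CU, indexed as mutex * NUM_CU + cuID, is
// inferred from the address math in the lock routines below; the function
// name setupSpinMutexExample is hypothetical.
inline __host__ hipError_t setupSpinMutexExample(hipMutex_t * const handle,
                                                 unsigned int ** mutexBufferHeads,
                                                 const int numMutexes,
                                                 const int NUM_CU)
{
  const size_t bufSize = (size_t)numMutexes * NUM_CU * sizeof(unsigned int);
  hipError_t err = hipMalloc((void **)mutexBufferHeads, bufSize);
  if (err != hipSuccess) { return err; }
  // Zero the buffer so every lock location starts out free (0 == unlocked).
  err = hipMemset(*mutexBufferHeads, 0, bufSize);
  if (err != hipSuccess) { return err; }
  return hipMutexCreateSpin(handle, 0); // handle for mutex number 0
}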

// The brain-dead algorithm: the block's master thread just spins on a global
// atomic until it acquires the lock.
__device__ void hipMutexSpinLock(const hipMutex_t mutex,
                                 unsigned int * mutexBufferHeads,
                                 const int NUM_CU)
{
  __shared__ int done;
  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
                               (hipThreadIdx_z == 0));
  if (isMasterThread) { done = 0; }
  __syncthreads();

  while (!done)
  {
    __syncthreads();
    if (isMasterThread)
    {
      // Try to grab the lock: 0 means free, 1 means held.
      if (atomicCAS(mutexBufferHeads + (mutex * NUM_CU), 0, 1) == 0) {
        // The atomicCAS acts as the load-acquire; HIP atomics are relaxed, so
        // a threadfence is needed to enforce the acquire ordering.
        __threadfence();
        done = 1;
      }
    }
    // Ensure every thread sees the master's update to done before re-checking.
    __syncthreads();
  }
}

__device__ void hipMutexSpinUnlock(const hipMutex_t mutex,
                                   unsigned int * mutexBufferHeads,
                                   const int NUM_CU)
{
  __syncthreads();
  if ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0))
  {
    // The atomicExch acts as the store-release; HIP atomics are relaxed, so a
    // threadfence is needed first to make the critical section's writes
    // visible before the lock is freed.
    __threadfence();
    atomicExch(mutexBufferHeads + (mutex * NUM_CU), 0);
  }
  __syncthreads();
}
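
// Illustrative device-side usage (a sketch, not from the original file): every
// thread in the block must call the lock and unlock routines, since both
// contain __syncthreads(). The kernel name and the sharedCounter parameter
// are hypothetical.
static __global__ void spinMutexExampleKernel(const hipMutex_t mutex,
                                              unsigned int * mutexBufferHeads,
                                              const int NUM_CU,
                                              int * sharedCounter)
{
  hipMutexSpinLock(mutex, mutexBufferHeads, NUM_CU);
  // Critical section: one thread per block updates the shared value
  // (assumes a 1-D thread block).
  if (hipThreadIdx_x == 0) { ++(*sharedCounter); }
  hipMutexSpinUnlock(mutex, mutexBufferHeads, NUM_CU);
}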

// Same algorithm, but uses a local (block-scope) threadfence instead because
// the data each CU spins on is local to that CU.
__device__ void hipMutexSpinLockLocal(const hipMutex_t mutex,
                                      const unsigned int cuID,
                                      unsigned int * mutexBufferHeads,
                                      const int NUM_CU)
{
  __shared__ int done;
  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
                               (hipThreadIdx_z == 0));
  if (isMasterThread) { done = 0; }
  __syncthreads();

  while (!done)
  {
    __syncthreads();
    if (isMasterThread)
    {
      // Index into this CU's private copy of the lock: 0 is free, 1 is held.
      if (atomicCAS(mutexBufferHeads + ((mutex * NUM_CU) + cuID), 0, 1) == 0)
      {
        // The atomicCAS acts as the load-acquire; only a local (block-scope)
        // threadfence is needed to enforce the ordering here.
        __threadfence_block();
        done = 1;
      }
    }
    // Ensure every thread sees the master's update to done before re-checking.
    __syncthreads();
  }
}

// Same algorithm, but uses a local (block-scope) threadfence instead because
// the data each CU spins on is local to that CU.
__device__ void hipMutexSpinUnlockLocal(const hipMutex_t mutex,
                                        const unsigned int cuID,
                                        unsigned int * mutexBufferHeads,
                                        const int NUM_CU)
{
  __syncthreads();
  if ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0))
  {
    // The atomicExch acts as the store-release; only a local (block-scope)
    // threadfence is needed to enforce the ordering here.
    __threadfence_block();
    // The mutex math selects this CU's private spin mutex location.
    atomicExch(mutexBufferHeads + ((mutex * NUM_CU) + cuID), 0);
  }
  __syncthreads();
}
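
// Illustrative usage of the local variants (a sketch, not from the original
// file): the caller must supply the CU the block runs on. The block-to-CU
// mapping hipBlockIdx_x % NUM_CU is an assumption made for illustration; the
// kernel name and perCUCounters parameter are hypothetical.
static __global__ void spinMutexLocalExampleKernel(const hipMutex_t mutex,
                                                   unsigned int * mutexBufferHeads,
                                                   const int NUM_CU,
                                                   int * perCUCounters)
{
  const unsigned int cuID = hipBlockIdx_x % NUM_CU; // assumed mapping
  hipMutexSpinLockLocal(mutex, cuID, mutexBufferHeads, NUM_CU);
  // Critical section: guarded per CU, so only blocks mapped to the same CU
  // are mutually excluded (assumes a 1-D thread block).
  if (hipThreadIdx_x == 0) { ++perCUCounters[cuID]; }
  hipMutexSpinUnlockLocal(mutex, cuID, mutexBufferHeads, NUM_CU);
}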

#endif // __HIPLOCKSMUTEXSPIN_H__