#ifndef __HIPLOCKSMUTEXSPIN_H__
#define __HIPLOCKSMUTEXSPIN_H__
#include "hip/hip_runtime.h"

inline __host__ hipError_t hipMutexCreateSpin(hipMutex_t * const handle,
                                              const int mutexNumber)
{
  *handle = mutexNumber;
  return hipSuccess;
}

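// A minimal host-side setup sketch (hypothetical names, not part of this
// header): the lock routines below index the buffer as mutex * NUM_CU
// (+ cuID for the local variants), so the backing buffer is assumed to hold
// NUM_CU unsigned ints per mutex, zero-initialized so every mutex starts
// unlocked.
//
//   unsigned int * mutexBufferHeads = NULL;
//   hipMalloc((void **)&mutexBufferHeads,
//             sizeof(unsigned int) * numMutexes * NUM_CU);
//   hipMemset(mutexBufferHeads, 0, sizeof(unsigned int) * numMutexes * NUM_CU);
//   hipMutex_t mutex;
//   hipMutexCreateSpin(&mutex, 0); // mutex 0 uses slots [0, NUM_CU)
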
// This is the brain-dead algorithm: just spin on an atomic until you get the
// lock.
__device__ void hipMutexSpinLock(const hipMutex_t mutex,
                                 unsigned int * mutexBufferHeads,
                                 const int NUM_CU)
{
  __shared__ int done;
  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
                               (hipThreadIdx_z == 0));

  if (isMasterThread) { done = 0; }
  __syncthreads();

  while (!done)
  {
    __syncthreads();
    if (isMasterThread)
    {
      // Only the master thread attempts the CAS; the rest of the work-group
      // waits at the barriers until the lock is acquired.
      if (atomicCAS(mutexBufferHeads + (mutex * NUM_CU), 0, 1) == 0) {
        // atomicCAS acts as a load acquire, need TF to enforce ordering
        __threadfence();
        done = 1;
      }
    }
    __syncthreads();
  }
}

__device__ void hipMutexSpinUnlock(const hipMutex_t mutex,
                                   unsigned int * mutexBufferHeads,
                                   const int NUM_CU)
{
  __syncthreads();
  if ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0))
  {
    // atomicExch acts as a store release, need TF to enforce ordering
    __threadfence();
    atomicExch(mutexBufferHeads + (mutex * NUM_CU), 0);
  }
  __syncthreads();
}

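// A minimal device-side usage sketch (hypothetical kernel, not part of this
// header): each work-group serializes on the same global spin mutex around
// its critical section.
//
//   __global__ void criticalSectionKernel(hipMutex_t mutex,
//                                         unsigned int * mutexBufferHeads,
//                                         int * sharedCounter,
//                                         const int NUM_CU)
//   {
//     hipMutexSpinLock(mutex, mutexBufferHeads, NUM_CU);
//     if ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
//         (hipThreadIdx_z == 0)) {
//       ++(*sharedCounter); // critical section
//     }
//     hipMutexSpinUnlock(mutex, mutexBufferHeads, NUM_CU);
//   }
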
// Same algorithm, but uses a local TF instead because the data is local
// (each mutex has a separate spin location per CU).
__device__ void hipMutexSpinLockLocal(const hipMutex_t mutex,
                                      const unsigned int cuID,
                                      unsigned int * mutexBufferHeads,
                                      const int NUM_CU)
{
  __shared__ int done;
  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
                               (hipThreadIdx_z == 0));

  if (isMasterThread) { done = 0; }
  __syncthreads();

  while (!done)
  {
    __syncthreads();
    if (isMasterThread)
    {
      // mutex math accesses this CU's per-CU spin mutex location
      if (atomicCAS(mutexBufferHeads + ((mutex * NUM_CU) + cuID), 0, 1) == 0)
      {
        // atomicCAS acts as a load acquire, need TF to enforce ordering locally
        __threadfence_block();
        done = 1;
      }
    }
    __syncthreads();
  }
}

// Same algorithm, but uses a local TF instead because the data is local
__device__ void hipMutexSpinUnlockLocal(const hipMutex_t mutex,
                                        const unsigned int cuID,
                                        unsigned int * mutexBufferHeads,
                                        const int NUM_CU)
{
  __syncthreads();
  if ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0))
  {
    // atomicExch acts as a store release, need TF to enforce ordering locally
    __threadfence_block();
    // mutex math allows us to access the appropriate per-CU spin mutex location
    atomicExch(mutexBufferHeads + ((mutex * NUM_CU) + cuID), 0);
  }
  __syncthreads();
}

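// A minimal sketch of the local variants (hypothetical kernel, not part of
// this header): cuID selects this CU's word in mutexBufferHeads, so only
// work-groups that pass the same cuID contend, and the lock/unlock use
// __threadfence_block() because, per the comments above, the protected data
// is local. How cuID is obtained is outside the scope of this header.
//
//   __global__ void localCriticalSectionKernel(hipMutex_t mutex,
//                                              const unsigned int cuID,
//                                              unsigned int * mutexBufferHeads,
//                                              int * perCUCounters,
//                                              const int NUM_CU)
//   {
//     hipMutexSpinLockLocal(mutex, cuID, mutexBufferHeads, NUM_CU);
//     if ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
//         (hipThreadIdx_z == 0)) {
//       ++perCUCounters[cuID]; // CU-local critical section
//     }
//     hipMutexSpinUnlockLocal(mutex, cuID, mutexBufferHeads, NUM_CU);
//   }
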
#endif