#ifndef __HIPLOCKMUTEXSLEEP_H__
#define __HIPLOCKMUTEXSLEEP_H__
#include "hip/hip_runtime.h"
#include "hipLocks.h"
inline __host__ hipError_t hipMutexCreateSleep(hipMutex_t * const handle,
const int mutexNumber)
{
*handle = mutexNumber;
return hipSuccess;
}
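/*
Usage sketch (illustrative, not part of the API): the "handle" is just the
mutex's index into the global lock buffers, so creation only records that
number on the host.

  hipMutex_t mtx;
  hipMutexCreateSleep(&mtx, 0); // mtx now refers to mutex slot 0
*/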
/*
Instead of constantly pounding an atomic to try to lock the mutex, each WG
simply claims a slot in a ring buffer, then polls that slot until it has been
set to 1 -- at that point it is the WG's turn.  When the WG is done, it clears
its own slot and sets the next slot to 1 to pass the lock on.

Locks the mutex.  Must be called by the entire WG.
*/
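/*
Illustrative handoff (assumes the ring buffer is initialized elsewhere, e.g.
by the lock-initialization code declared in hipLocks.h, with slot 0 set to 1
and the remaining slots set to -1):

  WG A claims slot 0 (tail -> 1), sees 1, and holds the lock.
  WG B claims slot 1 (tail -> 2) and spins on slot 1.
  WG A unlocks: slot 0 <- -1, slot 1 <- 1, so WG B now holds the lock.
*/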
__device__ unsigned int hipMutexSleepLock(const hipMutex_t mutex,
int * mutexBuffers,
unsigned int * mutexBufferTails,
const int maxRingBufferSize,
const int arrayStride,
const int NUM_CU)
{
__syncthreads();
// local variables
const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
hipThreadIdx_z == 0);
unsigned int * const ringBufferTailPtr = mutexBufferTails + (mutex * NUM_CU);
int * const ringBuffer = (int *)mutexBuffers + (mutex * NUM_CU) * arrayStride;
__shared__ unsigned int myRingBufferLoc;
__shared__ bool haveLock;
__shared__ int backoff;
// only the master thread claims a ring buffer slot for the whole WG.
if (isMasterThread)
{
/*
Don't need store release semantics -- the atomicAdd below determines
the happens-before ordering here.
*/
/*
HIP currently doesn't generate the correct code for atomicInc, so replace it
with an atomicAdd of 1 and assume no wraparound.
*/
//myRingBufferLoc = atomicInc(ringBufferTailPtr, maxRingBufferSize);
myRingBufferLoc = atomicAdd(ringBufferTailPtr, 1);
haveLock = false; // initially we don't have the lock
backoff = 1;
}
__syncthreads();
// Two possibilities:
//   1. the mutex is unlocked and our ring buffer slot is already set to 1
//   2. the mutex is locked, so spin until our slot is set to 1
while (!haveLock)
{
__syncthreads();
if (isMasterThread)
{
// spin until our location in the ring buffer is set to 1.
if (atomicAdd(&ringBuffer[myRingBufferLoc], 0) == 1)
{
// atomicAdd (load) acts as a load acquire, need TF to enforce ordering
__threadfence();
// When our location in the ring buffer == 1, we have the lock
haveLock = true;
}
else
{
// if we failed to acquire the lock, wait for a little while before
// trying again
//sleepFunc(backoff);
// crude delay loop standing in for sleepFunc; j is volatile so the
// compiler doesn't optimize the empty loop away
for (volatile int j = 0; j < backoff; ++j) { ; }
// (capped) exponential backoff
backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
}
}
__syncthreads();
}
return myRingBufferLoc;
}
// to unlock, clear our ring buffer slot and mark the next slot so the next
// WG in line can take the lock.
__device__ void hipMutexSleepUnlock(const hipMutex_t mutex,
int * mutexBuffers,
unsigned int myBufferLoc,
const int maxRingBufferSize,
const int arrayStride,
const int NUM_CU)
{
__syncthreads();
const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
hipThreadIdx_z == 0);
int * ringBuffer = (int * )mutexBuffers + (mutex * NUM_CU) * arrayStride;
// next location is 0 if we're the last location in the buffer (wraparound)
const unsigned int nextBufferLoc = ((myBufferLoc >= maxRingBufferSize) ? 0 :
myBufferLoc + 1);
if (isMasterThread)
{
// set my ring buffer location to -1
atomicExch((int *)(ringBuffer + myBufferLoc), -1);
// the handoff atomicExch below acts as a store release, need the TF before
// it to enforce ordering
__threadfence();
// set the next location in the ring buffer to 1 so that the next WG in line
// can get the lock now
atomicExch((int *)ringBuffer + nextBufferLoc, 1);
}
__syncthreads();
}
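/*
Usage sketch (illustrative, not part of the API): a kernel that guards a
critical section with the sleep mutex.  The buffer pointers and sizing
parameters are assumed to come from whatever lock-initialization code the
application uses (see hipLocks.h); the names below are placeholders.

  __global__ void exampleCriticalSection(hipMutex_t mutex,
                                         int * mutexBuffers,
                                         unsigned int * mutexBufferTails,
                                         int maxRingBufferSize,
                                         int arrayStride,
                                         int NUM_CU,
                                         int * globalCounter)
  {
    // every thread in the WG must call the lock and unlock routines
    const unsigned int myLoc = hipMutexSleepLock(mutex, mutexBuffers,
                                                 mutexBufferTails,
                                                 maxRingBufferSize,
                                                 arrayStride, NUM_CU);
    // critical section: one thread per WG updates the protected counter
    if (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && hipThreadIdx_z == 0) {
      *globalCounter += 1;
    }
    hipMutexSleepUnlock(mutex, mutexBuffers, myLoc, maxRingBufferSize,
                        arrayStride, NUM_CU);
  }
*/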
// same algorithm, but uses a per-CU lock (one ring buffer per CU)
__device__ unsigned int hipMutexSleepLockLocal(const hipMutex_t mutex,
const unsigned int cuID,
int * mutexBuffers,
unsigned int * mutexBufferTails,
const int maxRingBufferSize,
const int arrayStride,
const int NUM_CU)
{
__syncthreads();
// local variables
const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
hipThreadIdx_z == 0);
unsigned int * const ringBufferTailPtr = mutexBufferTails + ((mutex * NUM_CU) +
cuID);
int * const ringBuffer = (int * )mutexBuffers +
((mutex * NUM_CU) + cuID) * arrayStride;
__shared__ unsigned int myRingBufferLoc;
__shared__ bool haveLock;
// only the master thread claims a ring buffer slot for the whole WG.
if (isMasterThread)
{
/*
HIP currently doesn't generate the correct code for atomicInc here, so
replace it with an atomicAdd of 1 and assume no wraparound.
*/
//myRingBufferLoc = atomicInc(ringBufferTailPtr, maxRingBufferSize);
myRingBufferLoc = atomicAdd(ringBufferTailPtr, 1);
haveLock = false; // initially we don't have the lock
}
__syncthreads();
// Two possibilities:
//   1. the mutex is unlocked and our ring buffer slot is already set to 1
//   2. the mutex is locked, so spin until our slot is set to 1
while (!haveLock)
{
__syncthreads();
if (isMasterThread)
{
// spin until our location in the ring buffer is set to 1.
if (atomicAdd(&ringBuffer[myRingBufferLoc], 0) == 1)
{
// atomicAdd (load) acts as a load acquire, need TF to enforce ordering locally
__threadfence_block();
// When our location in the ring buffer == 1, we have the lock
haveLock = true;
}
}
__syncthreads();
}
return myRingBufferLoc;
}
// to unlock, clear our ring buffer slot and mark the next slot -- same
// algorithm as above, but uses the per-CU lock.
__device__ void hipMutexSleepUnlockLocal(const hipMutex_t mutex,
const unsigned int cuID,
int * mutexBuffers,
unsigned int myBufferLoc,
const int maxRingBufferSize,
const int arrayStride,
const int NUM_CU)
{
__syncthreads();
const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
hipThreadIdx_z == 0);
int * ringBuffer = (int * )mutexBuffers + ((mutex * NUM_CU) + cuID) *
arrayStride;
// next location is 0 if we're the last location in the buffer (wraparound)
const unsigned int nextBufferLoc = ((myBufferLoc >= maxRingBufferSize) ? 0 :
myBufferLoc + 1);
if (isMasterThread)
{
// set my ring buffer location to -1
atomicExch((int *)(ringBuffer + myBufferLoc), -1);
// the handoff atomicExch below acts as a store release, need the TF before
// it to enforce ordering locally
__threadfence_block();
// set the next location in the ring buffer to 1 so that the next WG in line
// can get the lock now
atomicExch((int *)ringBuffer + nextBufferLoc, 1);
}
__syncthreads();
}
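/*
Usage sketch for the per-CU ("local") variants (illustrative): identical to
the global case, except the caller also supplies a CU id so that WGs on the
same CU share one ring buffer.  How cuID is obtained is up to the caller
(e.g., a WG-to-CU mapping maintained by the application) and is assumed here.

  const unsigned int myLoc = hipMutexSleepLockLocal(mutex, cuID, mutexBuffers,
                                                    mutexBufferTails,
                                                    maxRingBufferSize,
                                                    arrayStride, NUM_CU);
  // ... critical section (protected only among WGs on this CU) ...
  hipMutexSleepUnlockLocal(mutex, cuID, mutexBuffers, myLoc,
                           maxRingBufferSize, arrayStride, NUM_CU);
*/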
#endif