#ifndef __HIPLOCKSMUTEXFA_H__
#define __HIPLOCKSMUTEXFA_H__
#include "hip/hip_runtime.h"
#include "hipLocks.h"
inline __host__ hipError_t hipMutexCreateFA(hipMutex_t * const handle,
                                            const int mutexNumber)
{
  *handle = mutexNumber;
  return hipSuccess;
}
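
/*
  Global FA lock.  All threads of the work-group must call this: the leader
  takes a ticket with atomicInc on the head counter and then polls the tail
  counter (atomicAdd of 0 acts as an atomic read) until it equals the ticket,
  while the other threads wait at the __syncthreads barriers.
*/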
inline __device__ void hipMutexFALock(const hipMutex_t mutex,
                                      unsigned int * mutexBufferHeads,
                                      unsigned int * mutexBufferTails,
                                      const int NUM_CU)
{
  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                               hipThreadIdx_z == 0);
  __shared__ unsigned int myTicketNum;
  __shared__ bool haveLock;
  const unsigned int maxTurnNum = 1000000000;
  unsigned int * ticketNumber = mutexBufferHeads + (mutex * NUM_CU);
  unsigned int * turnNumber = mutexBufferTails + (mutex * NUM_CU);
  __syncthreads();
  if (isMasterThread)
  {
    // the atomicInc below provides the needed ordering, no threadfence needed
    myTicketNum = atomicInc(ticketNumber, maxTurnNum);
    haveLock = false;
  }
  __syncthreads();
  while (!haveLock)
  {
    if (isMasterThread)
    {
      unsigned int currTicketNum = atomicAdd(turnNumber, 0);
      // it's my turn, I get the lock now
      if (currTicketNum == myTicketNum) {
        // the read above acts as a load acquire, so a threadfence is needed
        // to enforce ordering
        __threadfence();
        haveLock = true;
      }
    }
    __syncthreads();
  }
}
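
/*
  Global FA unlock.  The leader makes the critical section's writes visible
  with a threadfence and then advances the tail counter, passing the lock to
  the next ticket holder.
*/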
inline __device__ void hipMutexFAUnlock(const hipMutex_t mutex,
                                        unsigned int * mutexBufferTails,
                                        const int NUM_CU)
{
  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                               hipThreadIdx_z == 0);
  const unsigned int maxTurnNum = 1000000000;
  unsigned int * turnNumber = mutexBufferTails + (mutex * NUM_CU);
  __syncthreads();
  if (isMasterThread) {
    // the increment below must act as a store release, so a threadfence is
    // needed to enforce ordering
    __threadfence();
    /*
      HIP currently doesn't generate the correct code for atomicInc here,
      so replace it with an atomicAdd of 1 and assume no wraparound
    */
    //atomicInc(turnNumber, maxTurnNum);
    atomicAdd(turnNumber, 1);
  }
  __syncthreads();
}
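
/*
  Usage sketch (illustrative only -- the kernel below is hypothetical and not
  part of this header).  Because the lock/unlock functions synchronize with
  __syncthreads, every thread in the work-group must call them:

    __global__ void criticalSectionKernel(hipMutex_t mutex,
                                          unsigned int * mutexBufferHeads,
                                          unsigned int * mutexBufferTails,
                                          int NUM_CU)
    {
      hipMutexFALock(mutex, mutexBufferHeads, mutexBufferTails, NUM_CU);
      // ... critical section: one work-group holds the mutex at a time ...
      hipMutexFAUnlock(mutex, mutexBufferTails, NUM_CU);
    }
*/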
// Same ticket-lock algorithm, but uses a per-CU lock: each CU (compute unit)
// has its own head/tail entry, so only work-groups on the same CU contend.
inline __device__ void hipMutexFALockLocal(const hipMutex_t mutex,
                                           const unsigned int cuID,
                                           unsigned int * mutexBufferHeads,
                                           unsigned int * mutexBufferTails,
                                           const int NUM_CU)
{
  // local variables
  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                               hipThreadIdx_z == 0);
  __shared__ unsigned int myTicketNum;
  __shared__ bool haveLock;
  const unsigned int maxTurnNum = 100000000;
  unsigned int * ticketNumber = mutexBufferHeads + ((mutex * NUM_CU) + cuID);
  unsigned int * turnNumber = mutexBufferTails + ((mutex * NUM_CU) + cuID);
  __syncthreads();
  if (isMasterThread)
  {
    myTicketNum = atomicInc(ticketNumber, maxTurnNum);
    haveLock = false;
  }
  __syncthreads();
  while (!haveLock)
  {
    if (isMasterThread)
    {
      unsigned int currTicketNum = atomicAdd(turnNumber, 0);
      // it's my turn, I get the lock now
      if (currTicketNum == myTicketNum) {
        // the read above acts as a load acquire, so a threadfence is needed
        // to enforce ordering locally
        __threadfence_block();
        haveLock = true;
      }
    }
    __syncthreads();
  }
}
// Same algorithm, but releases this CU's per-CU lock entry.
inline __device__ void hipMutexFAUnlockLocal(const hipMutex_t mutex,
                                             const unsigned int cuID,
                                             unsigned int * mutexBufferTails,
                                             const int NUM_CU)
{
  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                               hipThreadIdx_z == 0);
  const unsigned int maxTurnNum = 100000000;
  unsigned int * turnNumber = mutexBufferTails + ((mutex * NUM_CU) + cuID);
  __syncthreads();
  if (isMasterThread) {
    // the increment below must act as a store release, so a threadfence is
    // needed to enforce ordering locally
    __threadfence_block();
    /*
      HIP currently doesn't generate the correct code for atomicInc here,
      so replace it with an atomicAdd of 1 and assume no wraparound
    */
    //atomicInc(turnNumber, maxTurnNum);
    atomicAdd(turnNumber, 1);
  }
  __syncthreads();
}
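
/*
  Local-variant usage sketch (illustrative only; how cuID is obtained is
  assumed to be handled by the caller -- it is not provided by this header).
  Only work-groups that pass the same cuID are serialized with respect to
  each other:

    hipMutexFALockLocal(mutex, cuID, mutexBufferHeads, mutexBufferTails, NUM_CU);
    // ... per-CU critical section ...
    hipMutexFAUnlockLocal(mutex, cuID, mutexBufferTails, NUM_CU);
*/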
#endif