#include <cstdio>
#include <string>
#include <assert.h>
#include <math.h>
#include "hip/hip_runtime.h"
#include "hip_error.h"
#define MAX_BACKOFF 1024
#define NUM_WIS_PER_WG 64
#define MAD_MUL 1.1f
#define MAD_ADD 0.25f
#define NUM_WORDS_PER_CACHELINE 16
#define NUM_WIS_PER_QUARTERWAVE 16
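// Note: NUM_WORDS_PER_CACHELINE is 16 floats (64 bytes), and
// NUM_WIS_PER_QUARTERWAVE (16 work-items) corresponds to a CUDA half-warp,
// which is the term some of the comments below still use.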
// separate .h files
#include "hipLocksBarrier.h"
#include "hipLocksImpl.h"
#include "hipLocksMutex.h"
#include "hipLocksSemaphore.h"
// program globals
const int NUM_REPEATS = 10;
int NUM_LDST = 0;
int numWGs = 0;
// number of CUs our GPU has
int NUM_CU = 0;
int MAX_WGS = 0;
bool pageAlign = false;
/*
Helper function to do data accesses for golden checking code.
*/
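/*
  For example, with NUM_LDST = 2 each quarter-wavefront owns a block of
  16 * (2 + 1) = 48 consecutive words: words 0-15 of the block (the first
  cache line) are never written, while words 16-47 are each overwritten with
  the value 16 words earlier, matching the dependence chain created by the
  device-side accessData().
*/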
void accessData_golden(float * storageGolden, int currLoc, int numStorageLocs)
{
/*
If this location isn't in the first cache line of its quarter-wavefront's
block, update it -- each quarter-wavefront (a CUDA half-warp) accesses
(NUM_LDST + 1) cache lines.
*/
if (currLoc % (NUM_WIS_PER_QUARTERWAVE * (NUM_LDST + 1)) >=
NUM_WORDS_PER_CACHELINE)
{
assert((currLoc - NUM_WORDS_PER_CACHELINE) >= 0);
assert(currLoc < numStorageLocs);
// each location is dependent on the location accessed at the
// same word on the previous cache line
storageGolden[currLoc] =
((storageGolden[currLoc -
NUM_WORDS_PER_CACHELINE]/* * MAD_MUL*/) /*+ MAD_ADD*/);
}
}
/*
Shared function that does the critical section data accesses for the barrier
and mutex kernels.
NOTE: kernels that access data differently (e.g., semaphores) should not call
this function.
*/
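/*
  For example, with NUM_LDST = 2, threadBaseLoc = 3, and threadOffset = 1
  (e.g., hipThreadIdx_x = 17 in WG 0), the loop below writes word (5*16)+1
  from word (4*16)+1, then word (4*16)+1 from word (3*16)+1, so each store
  depends on the load from the same word offset on the previous cache line.
*/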
inline __device__ void accessData(float * storage, int threadBaseLoc,
int threadOffset, int NUM_LDST)
{
// local variables
int readLoc = 0, writeLoc = 0;
for (int n = NUM_LDST-1; n >= 0; --n) {
writeLoc = ((threadBaseLoc + n + 1) * NUM_WORDS_PER_CACHELINE) +
threadOffset;
readLoc = ((threadBaseLoc + n) * NUM_WORDS_PER_CACHELINE) + threadOffset;
storage[writeLoc] = ((storage[readLoc]/* * MAD_MUL*/) /*+ MAD_ADD*/);
}
}
/*
Helper function for semaphore writers. Although semaphore writers iterate
over all locations accessed on a given CU, the logic for this varies and is
handled by the callers, so this helper can simply call accessData().
*/
inline __device__ void accessData_semWr(float * storage, int threadBaseLoc,
int threadOffset, int NUM_LDST)
{
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
}
/*
Helper function for semaphore readers.
*/
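// Note: dummyArray is volatile LDS, which keeps the compiler from eliding
// the loads, and the __syncthreads() after each cache line keeps the WG's
// work-items in step.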
inline __device__ void accessData_semRd(float * storage,
volatile float * dummyArray,
int threadBaseLoc,
int threadOffset, int NUM_LDST)
{
for (int n = NUM_LDST-1; n >= 0; --n) {
dummyArray[hipThreadIdx_x] +=
storage[((threadBaseLoc + n) * NUM_WORDS_PER_CACHELINE) +
threadOffset];
__syncthreads();
}
}
// performs a tree barrier. Each WG on a CU accesses unique data then joins a
// local barrier. One of the WGs from each CU then joins the global barrier
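// For example, with 8 CUs and 16 WGs (so numWGsAtBarr = 8), WG 10 gets
// cuID = 10 % 8 = 2 and perCU_WGID = 10 / 8 = 1, i.e., it is presumably the
// second WG resident on CU 2 under the round-robin WG-to-CU assignment the
// rest of this file assumes.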
__global__ void kernelAtomicTreeBarrierUniq(float * storage,
hipLockData_t * gpuLockData,
unsigned int * perCUBarrierBuffers,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU,
const int MAX_WGS)
{
// local variables
// thread 0 is master thread
const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
(hipThreadIdx_z == 0));
// represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
// fewer WGs than CUs).
const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
NUM_CU);
const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
// all work groups on the same CU access unique locations because the
// barrier can't ensure DRF between WGs
int currWGID = hipBlockIdx_x;
int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// determine if I'm WG 0 on my CU
const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x / numWGsAtBarr);
for (int i = 0; i < ITERATIONS; ++i)
{
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
joinBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
isMasterThread, MAX_WGS);
// get new thread ID by trading amongst WGs -- +1 WG ID to shift to the next
// CU's data
currWGID = ((currWGID + 1) % hipGridDim_x);
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
}
// like the tree barrier but also has WGs exchange work locally before joining
// the global barrier
__global__ void kernelAtomicTreeBarrierUniqLocalExch(float * storage,
hipLockData_t * gpuLockData,
unsigned int * perCUBarrierBuffers,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU,
const int MAX_WGS)
{
// local variables
// thread 0 is master thread
const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
(hipThreadIdx_z == 0));
// represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
// fewer WGs than CUs).
const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
NUM_CU);
const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
// all work groups on the same CU access unique locations because the
// barrier can't ensure DRF between WGs
int currWGID = hipBlockIdx_x;
int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// determine if I'm WG 0 on my CU
const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x / numWGsAtBarr);
for (int i = 0; i < ITERATIONS; ++i)
{
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
// all WGs on this CU do a local barrier (if > 1 WG)
if (numWGs_perCU > 1) {
hipBarrierAtomicLocal(perCUBarrierBuffers, cuID, numWGs_perCU,
isMasterThread, MAX_WGS);
// exchange data within the WGs on this CU, then do some more computations
currWGID = ((currWGID + numWGsAtBarr) % hipGridDim_x);
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
}
joinBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
isMasterThread, MAX_WGS);
// get new thread ID by trading amongst WGs -- +1 WG ID to shift to the next
// CU's data
currWGID = ((currWGID + 1) % hipGridDim_x);
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
}
// performs a tree barrier like above but with a lock-free barrier
__global__ void kernelFBSTreeBarrierUniq(float * storage,
hipLockData_t * gpuLockData,
unsigned int * perCUBarrierBuffers,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU,
const int MAX_WGS)
{
// local variables
// represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
// fewer WGs than CUs).
const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
NUM_CU);
const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
// all work groups on the same CU access unique locations because the
// barrier can't ensure DRF between WGs
int currWGID = hipBlockIdx_x;
int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// determine if I'm WG 0 on my CU
const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x/numWGsAtBarr);
for (int i = 0; i < ITERATIONS; ++i)
{
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
joinLFBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
gpuLockData->arrayStride, MAX_WGS);
// get new thread ID by trading amongst WGs -- +1 WG ID to shift to the next
// CU's data
currWGID = ((currWGID + 1) % hipGridDim_x);
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
}
// performs a tree barrier like above but with a lock-free barrier and has WGs
// exchange work locally before joining the global barrier
__global__ void kernelFBSTreeBarrierUniqLocalExch(float * storage,
hipLockData_t * gpuLockData,
unsigned int * perCUBarrierBuffers,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU,
const int MAX_WGS)
{
// local variables
// represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
// fewer WGs than CUs).
const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
// all work groups on the same CU access unique locations because the
// barrier can't ensure DRF between WGs
int currWGID = hipBlockIdx_x;
int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// determine if I'm WG 0 on my CU
const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x/numWGsAtBarr);
for (int i = 0; i < ITERATIONS; ++i)
{
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
// all WGs on this CU do a local barrier (if > 1 WG per CU)
if (numWGs_perCU > 1) {
hipBarrierLocal(gpuLockData->barrierBuffers, numWGsAtBarr,
gpuLockData->arrayStride, perCUBarrierBuffers, cuID, numWGs_perCU,
perCU_WGID, false, MAX_WGS);
// exchange data within the WGs on this CU and do some more computations
currWGID = ((currWGID + numWGsAtBarr) % hipGridDim_x);
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
}
joinLFBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
gpuLockData->arrayStride, MAX_WGS);
// get new thread ID by trading amongst WGs -- +1 WG ID to shift to the next
// CU's data
currWGID = ((currWGID + 1) % hipGridDim_x);
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
}
__global__ void kernelSleepingMutex(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS, const int NUM_LDST,
const int NUM_CU)
{
// local variables
// all work groups access the same locations (rely on release to get
// ownership in time)
const int tid = hipThreadIdx_x;
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
__shared__ int myRingBufferLoc; // tracks my WG's location in the ring buffer
if (hipThreadIdx_x == 0) {
myRingBufferLoc = -1; // initially I have no location
}
__syncthreads();
for (int i = 0; i < ITERATIONS; ++i)
{
myRingBufferLoc = hipMutexSleepLock(mutex, gpuLockData->mutexBuffers,
gpuLockData->mutexBufferTails, gpuLockData->maxBufferSize,
gpuLockData->arrayStride, NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexSleepUnlock(mutex, gpuLockData->mutexBuffers, myRingBufferLoc,
gpuLockData->maxBufferSize, gpuLockData->arrayStride, NUM_CU);
}
}
__global__ void kernelSleepingMutexUniq(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS,
const int NUM_LDST, const int NUM_CU)
{
// local variables
const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
// all work groups on the same CU access the same locations
const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
__shared__ int myRingBufferLoc; // tracks my WG's location in the ring buffer
if (hipThreadIdx_x == 0) {
myRingBufferLoc = -1; // initially I have no location
}
__syncthreads();
for (int i = 0; i < ITERATIONS; ++i)
{
myRingBufferLoc = hipMutexSleepLockLocal(mutex, cuID, gpuLockData->mutexBuffers,
gpuLockData->mutexBufferTails, gpuLockData->maxBufferSize,
gpuLockData->arrayStride, NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexSleepUnlockLocal(mutex, cuID, gpuLockData->mutexBuffers, myRingBufferLoc,
gpuLockData->maxBufferSize, gpuLockData->arrayStride,
NUM_CU);
}
}
__global__ void kernelFetchAndAddMutex(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS,
const int NUM_LDST, const int NUM_CU)
{
// local variables
// all work groups access the same locations (rely on release to get
// ownership in time)
const int tid = hipThreadIdx_x;
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
for (int i = 0; i < ITERATIONS; ++i)
{
hipMutexFALock(mutex, gpuLockData->mutexBufferHeads, gpuLockData->mutexBufferTails,
NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexFAUnlock(mutex, gpuLockData->mutexBufferTails, NUM_CU);
}
}
__global__ void kernelFetchAndAddMutexUniq(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU)
{
// local variables
const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
// all work groups on the same CU access the same locations
const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
for (int i = 0; i < ITERATIONS; ++i)
{
hipMutexFALockLocal(mutex, cuID, gpuLockData->mutexBufferHeads,
gpuLockData->mutexBufferTails, NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexFAUnlockLocal(mutex, cuID, gpuLockData->mutexBufferTails, NUM_CU);
}
}
__global__ void kernelSpinLockMutex(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS, const int NUM_LDST,
const int NUM_CU)
{
// local variables
// all work groups access the same locations (rely on release to get
// ownership in time)
const int tid = hipThreadIdx_x;
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
for (int i = 0; i < ITERATIONS; ++i)
{
hipMutexSpinLock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexSpinUnlock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
}
}
__global__ void kernelSpinLockMutexUniq(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS,
const int NUM_LDST, const int NUM_CU)
{
// local variables
const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
// all work groups on the same CU access the same locations
const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
for (int i = 0; i < ITERATIONS; ++i)
{
hipMutexSpinLockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexSpinUnlockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
}
}
__global__ void kernelEBOMutex(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS, const int NUM_LDST,
const int NUM_CU)
{
// local variables
// all work groups access the same locations (rely on release to get
// ownership in time)
const int tid = hipThreadIdx_x;
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
for (int i = 0; i < ITERATIONS; ++i)
{
hipMutexEBOLock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexEBOUnlock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
}
}
__global__ void kernelEBOMutexUniq(hipMutex_t mutex, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS, const int NUM_LDST,
const int NUM_CU)
{
// local variables
const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
// all work groups on the same CU access the same locations
const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
for (int i = 0; i < ITERATIONS; ++i)
{
hipMutexEBOLockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
hipMutexEBOUnlockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
}
}
// All WGs on all CUs access the same data, with 1 writer and N-1 readers
// per CU.
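// For example, with 4 CUs and 8 WGs (so numWGs_perCU = 2), the writer on
// CU 1 is the WG with perCU_WGID == (1 % 2) == 1, i.e., presumably the
// second WG resident on that CU under round-robin assignment; the other WG
// on CU 1 is a reader.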
__global__ void kernelSpinLockSemaphore(hipSemaphore_t sem,
float * storage,
hipLockData_t * gpuLockData,
const unsigned int numStorageLocs,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU)
{
// local variables
const unsigned int maxSemCount =
gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
// If there are fewer WGs than CUs, that must be accounted for in the math
// below. If # WGs >= NUM_CU, use NUM_CU.
const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
// number of threads on each WG
//const int numThrs_perCU = (hipBlockDim_x * numWGs_perCU);
const int perCU_WGID = (hipBlockIdx_x / numCU);
// rotate which WG is the writer
const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
// all work groups on the same CU access unique locations except the writer,
// which writes all of the locations that all of the WGs access
//int currWGID = hipBlockIdx_x;
// the (reader) WGs on each CU access unique locations but those same
// locations are accessed by the reader WGs on all CUs
int tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// dummy array to hold the loads done in the readers
__shared__ volatile float dummyArray[NUM_WIS_PER_WG];
for (int i = 0; i < ITERATIONS; ++i)
{
/*
NOTE: There is a race here for entering the critical section. Most
importantly, it means that at least one of the readers could win and thus
read before the writer has had a chance to write the data.
*/
hipSemaphoreSpinWait(sem, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
// writer writes all the data that the WGs on this CU access
if (isWriter) {
for (int j = 0; j < numWGs_perCU; ++j) {
/*
Update the writer's "location" so it writes to the locations that the
readers will access (due to RR scheduling the next WG on this CU is
numCU WGs away). Use loop counter because the non-unique version
writes the same locations on all CUs.
*/
tid = ((j * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
}
// reset locations
tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
// rest of WGs on this CU read the data written by each CU's writer WG
else {
accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
NUM_LDST);
}
hipSemaphoreSpinPost(sem, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
}
}
__global__ void kernelSpinLockSemaphoreUniq(hipSemaphore_t sem,
float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU)
{
// local variables
const unsigned int maxSemCount =
gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
// If there are fewer WGs than CUs, that must be accounted for in the math
// below. If # WGs >= NUM_CU, use NUM_CU.
const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
const int cuID = (hipBlockIdx_x % numCU); // mod by # CUs to get CU ID
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
const int perCU_WGID = (hipBlockIdx_x / numCU);
// rotate which WG is the writer
const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
// all work groups on the same CU access unique locations except the writer,
// which writes all of the locations that all of the WGs access
int currWGID = hipBlockIdx_x;
int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// dummy array to hold the loads done in the readers
__shared__ volatile float dummyArray[NUM_WIS_PER_WG];
for (int i = 0; i < ITERATIONS; ++i)
{
/*
NOTE: There is a race here for entering the critical section. Most
importantly, it means that at least one of the readers could win and thus
read before the writer has had a chance to write the data.
*/
hipSemaphoreSpinWaitLocal(sem, cuID, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
// writer WG writes all the data that the WGs on this CU access
if (isWriter) {
for (int j = 0; j < numWGs_perCU; ++j) {
accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
/*
Update the writer's "location" so it writes to the locations that the
readers will access (due to RR scheduling the next WG on this CU is
numCU WGs away and < hipGridDim_x).
NOTE: The first location the writer writes to is its own location(s). If
the writer is not WG 0 on this CU, this may require wrapping around to
WGs with smaller IDs.
*/
currWGID = (currWGID + numCU) % hipGridDim_x;
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
// reset locations
currWGID = hipBlockIdx_x;
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
// rest of WGs on this CU read the data written by each CU's writer WG
else {
accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
NUM_LDST);
}
hipSemaphoreSpinPostLocal(sem, cuID, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
}
}
// All WGs on all CUs access the same data, with 1 writer and N-1 readers
// per CU.
__global__ void kernelEBOSemaphore(hipSemaphore_t sem, float * storage,
hipLockData_t * gpuLockData,
const unsigned int numStorageLocs,
const int ITERATIONS, const int NUM_LDST,
const int NUM_CU)
{
// local variables
const unsigned int maxSemCount =
gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
// If there are fewer WGs than CUs, that must be accounted for in the math
// below. If # WGs >= NUM_CU, use NUM_CU.
const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
// number of threads on each WG
//const int numThrs_perCU = (hipBlockDim_x * numWGs_perCU);
const int perCU_WGID = (hipBlockIdx_x / numCU);
// rotate which WG is the writer
const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
// all work groups on the same CU access unique locations except the writer,
// which writes all of the locations that all of the WGs access
//int currWGID = hipBlockIdx_x;
// the (reader) WGs on each CU access unique locations but those same
// locations are accessed by the reader WGs on all CUs
int tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// dummy array to hold the loads done in the readers
__shared__ volatile float dummyArray[NUM_WIS_PER_WG];
for (int i = 0; i < ITERATIONS; ++i)
{
/*
NOTE: There is a race here for entering the critical section. Most
importantly, it means that at least one of the readers could win and thus
read before the writer has had a chance to write the data.
*/
hipSemaphoreEBOWait(sem, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
// writer WG writes all the data that the WGs on this CU access
if (isWriter) {
for (int j = 0; j < numWGs_perCU; ++j) {
/*
Update the writer's "location" so it writes to the locations that the
readers will access (due to RR scheduling the next WG on this CU is
numCU WGs away). Use loop counter because the non-unique version
writes the same locations on all CUs.
*/
tid = ((j * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
}
// reset locations
tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
// rest of WGs on this CU read the data written by each CU's writer WG
else {
accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
NUM_LDST);
}
hipSemaphoreEBOPost(sem, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
}
}
__global__ void kernelEBOSemaphoreUniq(hipSemaphore_t sem, float * storage,
hipLockData_t * gpuLockData,
const int ITERATIONS,
const int NUM_LDST,
const int NUM_CU)
{
// local variables
const unsigned int maxSemCount =
gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
// If there are fewer WGs than CUs, that must be accounted for in the math
// below. If # WGs >= NUM_CU, use NUM_CU.
const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
const int cuID = (hipBlockIdx_x % numCU); // mod by # CUs to get CU ID
// given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
// all CUs have an identical number of WGs
int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
const int perCU_WGID = (hipBlockIdx_x / numCU);
// rotate which WG is the writer
const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
// all work groups on the same CU access unique locations except the writer,
// which writes all of the locations that all of the WGs access
int currWGID = hipBlockIdx_x;
int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
// want striding to happen across cache lines so that each thread in a
// half-warp accesses sequential words
int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
// dummy array to hold the loads done in the readers
__shared__ volatile float dummyArray[NUM_WIS_PER_WG];
for (int i = 0; i < ITERATIONS; ++i)
{
/*
NOTE: There is a race here for entering the critical section. Most
importantly, it means that at least one of the readers could win and thus
read before the writer has had a chance to write the data.
*/
hipSemaphoreEBOWaitLocal(sem, cuID, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
// writer WG writes all the data that the WGs on this CU access
if (isWriter) {
for (int j = 0; j < numWGs_perCU; ++j) {
accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
/*
Update the writer's "location" so it writes to the locations that the
readers will access (due to RR scheduling the next WG on this CU is
numCU WGs away and < hipGridDim_x).
NOTE: The first location the writer writes to is its own location(s). If
the writer is not WG 0 on this CU, this may require wrapping around to
WGs with smaller IDs.
*/
currWGID = (currWGID + numCU) % hipGridDim_x;
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
// reset locations
currWGID = hipBlockIdx_x;
tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
}
// rest of WGs on this CU read the data written by each CU's writer WG
else {
accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
NUM_LDST);
}
hipSemaphoreEBOPostLocal(sem, cuID, isWriter, maxSemCount,
gpuLockData->semaphoreBuffers, NUM_CU);
}
}
void invokeAtomicTreeBarrier(float * storage_d, unsigned int * perCUBarriers_d,
int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelAtomicTreeBarrierUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
MAX_WGS);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelAtomicTreeBarrierUniq)");
}
}
void invokeAtomicTreeBarrierLocalExch(float * storage_d,
unsigned int * perCUBarriers_d,
int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelAtomicTreeBarrierUniqLocalExch), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
MAX_WGS);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr,
"hipDeviceSynchronize (kernelAtomicTreeBarrierUniqLockExch)");
}
}
void invokeFBSTreeBarrier(float * storage_d, unsigned int * perCUBarriers_d,
int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFBSTreeBarrierUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
MAX_WGS);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelFBSTreeBarrierUniq)");
}
}
void invokeFBSTreeBarrierLocalExch(float * storage_d,
unsigned int * perCUBarriers_d,
int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFBSTreeBarrierUniqLocalExch), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
MAX_WGS);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelFBSTreeBarrierUniqLocalExch)");
}
}
void invokeSpinLockMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockMutex)");
}
}
void invokeSpinLockMutex_uniq(hipMutex_t mutex, float * storage_d,
int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockMutexUniq)");
}
}
void invokeEBOMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelEBOMutex)");
}
}
void invokeEBOMutex_uniq(hipMutex_t mutex, float * storage_d, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelEBOMutexUniq)");
}
}
void invokeSleepingMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSleepingMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelSleepingMutex)");
}
}
void invokeSleepingMutex_uniq(hipMutex_t mutex, float * storage_d,
int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSleepingMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelSleepingMutexUniq)");
}
}
void invokeFetchAndAddMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFetchAndAddMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelFetchAndAddMutex)");
}
}
void invokeFetchAndAddMutex_uniq(hipMutex_t mutex, float * storage_d, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFetchAndAddMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelFetchAndAddMutexUniq)");
}
}
void invokeSpinLockSemaphore(hipSemaphore_t sem, float * storage_d,
const int maxVal,
int numIters, int numStorageLocs)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockSemaphore), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
sem, storage_d, cpuLockData, numStorageLocs, numIters, NUM_LDST,
NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockSemaphore)");
}
}
void invokeSpinLockSemaphore_uniq(hipSemaphore_t sem, float * storage_d,
const int maxVal, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockSemaphoreUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
sem, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockSemaphoreUniq)");
}
}
void invokeEBOSemaphore(hipSemaphore_t sem, float * storage_d, const int maxVal,
int numIters, int numStorageLocs)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOSemaphore), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
sem, storage_d, cpuLockData, numStorageLocs, numIters, NUM_LDST,
NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelEBOSemaphore)");
}
}
void invokeEBOSemaphore_uniq(hipSemaphore_t sem, float * storage_d,
const int maxVal, int numIters)
{
// local variable
const int WGs = numWGs;
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOSemaphoreUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
sem, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
// Blocks until the device has completed all preceding requested
// tasks (make sure that the device returned before continuing).
hipError_t hipErr = hipDeviceSynchronize();
checkError(hipErr, "hipDeviceSynchronize (kernelEBOSemaphoreUniq)");
}
}
int main(int argc, char ** argv)
{
if (argc != 5) {
fprintf(stderr, "./allSyncPrims-1kernel <syncPrim> <numLdSt> <numWGs> "
"<numCSIters>\n");
fprintf(stderr, "where:\n");
fprintf(stderr, "\t<syncPrim>: a string that represents which synchronization primitive to run.\n"
"\t\tatomicTreeBarrUniq - Atomic Tree Barrier\n"
"\t\tatomicTreeBarrUniqLocalExch - Atomic Tree Barrier with local exchange\n"
"\t\tlfTreeBarrUniq - Lock-Free Tree Barrier\n"
"\t\tlfTreeBarrUniqLocalExch - Lock-Free Tree Barrier with local exchange\n"
"\t\tspinMutex - Spin Lock Mutex\n"
"\t\tspinMutexEBO - Spin Lock Mutex with Backoff\n"
"\t\tsleepMutex - Sleep Mutex\n"
"\t\tfaMutex - Fetch-and-Add Mutex\n"
"\t\tspinSem1 - Spin Semaphore (Max: 1)\n"
"\t\tspinSem2 - Spin Semaphore (Max: 2)\n"
"\t\tspinSem10 - Spin Semaphore (Max: 10)\n"
"\t\tspinSem120 - Spin Semaphore (Max: 120)\n"
"\t\tspinSemEBO1 - Spin Semaphore with Backoff (Max: 1)\n"
"\t\tspinSemEBO2 - Spin Semaphore with Backoff (Max: 2)\n"
"\t\tspinSemEBO10 - Spin Semaphore with Backoff (Max: 10)\n"
"\t\tspinSemEBO120 - Spin Semaphore with Backoff (Max: 120)\n"
"\t\tspinMutexUniq - Spin Lock Mutex -- accesses to unique locations per WG\n"
"\t\tspinMutexEBOUniq - Spin Lock Mutex with Backoff -- accesses to unique locations per WG\n"
"\t\tsleepMutexUniq - Sleep Mutex -- accesses to unique locations per WG\n"
"\t\tfaMutexUniq - Fetch-and-Add Mutex -- accesses to unique locations per WG\n"
"\t\tspinSemUniq1 - Spin Semaphore (Max: 1) -- accesses to unique locations per WG\n"
"\t\tspinSemUniq2 - Spin Semaphore (Max: 2) -- accesses to unique locations per WG\n"
"\t\tspinSemUniq10 - Spin Semaphore (Max: 10) -- accesses to unique locations per WG\n"
"\t\tspinSemUniq120 - Spin Semaphore (Max: 120) -- accesses to unique locations per WG\n"
"\t\tspinSemEBOUniq1 - Spin Semaphore with Backoff (Max: 1) -- accesses to unique locations per WG\n"
"\t\tspinSemEBOUniq2 - Spin Semaphore with Backoff (Max: 2) -- accesses to unique locations per WG\n"
"\t\tspinSemEBOUniq10 - Spin Semaphore with Backoff (Max: 10) -- accesses to unique locations per WG\n"
"\t\tspinSemEBOUniq120 - Spin Semaphore with Backoff (Max: 120) -- accesses to unique locations per WG\n");
fprintf(stderr, "\t<numLdSt>: the # of LDs and STs to do for each thread "
"in the critical section.\n");
fprintf(stderr, "\t<numWGs>: the # of WGs to execute (want to be "
"divisible by the number of CUs).\n");
fprintf(stderr, "\t<numCSIters>: number of iterations of the critical "
"section.\n");
exit(-1);
}
// boilerplate code to identify compute capability, # CUs, etc.
int deviceCount;
hipGetDeviceCount(&deviceCount);
if (deviceCount == 0) {
fprintf(stderr, "There is no device supporting HIP\n");
exit(-1);
}
hipDeviceProp_t deviceProp;
hipGetDeviceProperties(&deviceProp, 0);
fprintf(stdout, "GPU Compute Capability: %d.%d\n", deviceProp.major,
deviceProp.minor);
if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {
fprintf(stderr, "There is no HIP capable device\n");
exit(-1);
}
NUM_CU = deviceProp.multiProcessorCount;
const int maxWGPerCU = deviceProp.maxThreadsPerBlock/NUM_WIS_PER_WG;
//assert(maxWGPerCU * NUM_WIS_PER_WG <=
// deviceProp.maxThreadsPerMultiProcessor);
MAX_WGS = maxWGPerCU * NUM_CU;
fprintf(stdout, "# CU: %d, Max Thrs/WG: %d, Max WG/CU: %d, Max # WG: %d\n",
NUM_CU, deviceProp.maxThreadsPerBlock, maxWGPerCU, MAX_WGS);
hipError_t hipErr = hipGetLastError();
checkError(hipErr, "Begin");
// parse input args
const char * syncPrim_str = argv[1];
NUM_LDST = atoi(argv[2]);
numWGs = atoi(argv[3]);
assert(numWGs <= MAX_WGS);
const int NUM_ITERS = atoi(argv[4]);
const int numWGs_perCU = (int)ceil((float)numWGs / NUM_CU);
assert(numWGs_perCU > 0);
unsigned int syncPrim = 9999;
// set the syncPrim variable to the appropriate value based on the inputted
// string for the microbenchmark
if (strcmp(syncPrim_str, "atomicTreeBarrUniq") == 0) { syncPrim = 0; }
else if (strcmp(syncPrim_str, "atomicTreeBarrUniqLocalExch") == 0) {
syncPrim = 1;
}
else if (strcmp(syncPrim_str, "lfTreeBarrUniq") == 0) { syncPrim = 2; }
else if (strcmp(syncPrim_str, "lfTreeBarrUniqLocalExch") == 0) {
syncPrim = 3;
}
else if (strcmp(syncPrim_str, "spinMutex") == 0) { syncPrim = 4; }
else if (strcmp(syncPrim_str, "spinMutexEBO") == 0) { syncPrim = 5; }
else if (strcmp(syncPrim_str, "sleepMutex") == 0) { syncPrim = 6; }
else if (strcmp(syncPrim_str, "faMutex") == 0) { syncPrim = 7; }
else if (strcmp(syncPrim_str, "spinSem1") == 0) { syncPrim = 8; }
else if (strcmp(syncPrim_str, "spinSem2") == 0) { syncPrim = 9; }
else if (strcmp(syncPrim_str, "spinSem10") == 0) { syncPrim = 10; }
else if (strcmp(syncPrim_str, "spinSem120") == 0) { syncPrim = 11; }
else if (strcmp(syncPrim_str, "spinSemEBO1") == 0) { syncPrim = 12; }
else if (strcmp(syncPrim_str, "spinSemEBO2") == 0) { syncPrim = 13; }
else if (strcmp(syncPrim_str, "spinSemEBO10") == 0) { syncPrim = 14; }
else if (strcmp(syncPrim_str, "spinSemEBO120") == 0) { syncPrim = 15; }
// cases 16-19 reserved
else if (strcmp(syncPrim_str, "spinMutexUniq") == 0) { syncPrim = 20; }
else if (strcmp(syncPrim_str, "spinMutexEBOUniq") == 0) { syncPrim = 21; }
else if (strcmp(syncPrim_str, "sleepMutexUniq") == 0) { syncPrim = 22; }
else if (strcmp(syncPrim_str, "faMutexUniq") == 0) { syncPrim = 23; }
else if (strcmp(syncPrim_str, "spinSemUniq1") == 0) { syncPrim = 24; }
else if (strcmp(syncPrim_str, "spinSemUniq2") == 0) { syncPrim = 25; }
else if (strcmp(syncPrim_str, "spinSemUniq10") == 0) { syncPrim = 26; }
else if (strcmp(syncPrim_str, "spinSemUniq120") == 0) { syncPrim = 27; }
else if (strcmp(syncPrim_str, "spinSemEBOUniq1") == 0) { syncPrim = 28; }
else if (strcmp(syncPrim_str, "spinSemEBOUniq2") == 0) { syncPrim = 29; }
else if (strcmp(syncPrim_str, "spinSemEBOUniq10") == 0) { syncPrim = 30; }
else if (strcmp(syncPrim_str, "spinSemEBOUniq120") == 0) { syncPrim = 31; }
// cases 32-36 reserved
else
{
fprintf(stderr, "ERROR: Unknown synchronization primitive: %s\n",
syncPrim_str);
exit(-1);
}
// multiply number of mutexes, semaphores by NUM_CU to
// allow per-CU locks
hipLocksInit(MAX_WGS, 8 * NUM_CU, 24 * NUM_CU, pageAlign, NUM_CU, NUM_REPEATS, NUM_ITERS);
hipErr = hipGetLastError();
checkError(hipErr, "After hipLocksInit");
/*
The barriers need a per-CU barrier that is not part of the global synch
structure. In terms of size, for the lock-free barrier there are 2 arrays
in here -- inVars and outVars. Each needs to be sized to hold the maximum
number of WGs/CU and each CU needs an array.
The atomic barrier per-CU synchronization fits inside the lock-free size
requirements so we can reuse the same locations.
*/
unsigned int * perCUBarriers = (unsigned int *)malloc(sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
int numLocsMult = 0;
// barriers and unique semaphores have numWGs WGs accessing unique locations
if ((syncPrim < 4) ||
((syncPrim >= 24) && (syncPrim <= 35))) { numLocsMult = numWGs; }
// The non-unique mutex microbenchmarks, all WGs access the same locations so
// multiplier is 1
else if ((syncPrim >= 4) && (syncPrim <= 7)) { numLocsMult = 1; }
// The non-unique semaphores have 1 writer and numWGs_perCU - 1 readers per CU
// so the multiplier is numWGs_perCU
else if ((syncPrim >= 8) && (syncPrim <= 19)) { numLocsMult = numWGs_perCU; }
// For the unique mutex microbenchmarks and condition variable, all WGs on
// same CU access same data so multiplier is NUM_CU.
else if (((syncPrim >= 20) && (syncPrim <= 23)) ||
(syncPrim == 36)) { numLocsMult = ((numWGs < NUM_CU) ?
numWGs : NUM_CU); }
else { // should never reach here
fprintf(stderr, "ERROR: Unknown syncPrim: %u\n", syncPrim);
exit(-1);
}
// each thread in a WG accesses NUM_LDST locations, but the accesses
// per WI are offset so that each subsequent access depends on the
// previous one -- thus an extra access is needed per WI.
int numUniqLocsAccPerWG = (NUM_WIS_PER_WG * (NUM_LDST + 1));
assert(numUniqLocsAccPerWG > 0);
int numStorageLocs = (numLocsMult * numUniqLocsAccPerWG);
assert(numStorageLocs > 0);
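// For example, with 64 WIs/WG and numLdSt = 10, each WG touches
// 64 * (10 + 1) = 704 unique words, so a barrier run with 8 WGs uses
// 8 * 704 = 5632 floats (~22 KB) of storage.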
float * storage = (float *)malloc(sizeof(float) * numStorageLocs);
fprintf(stdout, "# WGs: %d, # Ld/St: %d, # Locs Mult: %d, # Uniq Locs/WG: %d, # Storage Locs: %d\n", numWGs, NUM_LDST, numLocsMult, numUniqLocsAccPerWG, numStorageLocs);
// initialize storage
for (int i = 0; i < numStorageLocs; ++i) { storage[i] = i; }
// initialize per-CU barriers to 0's
for (int i = 0; i < (NUM_CU * MAX_WGS * 2); ++i) { perCUBarriers[i] = 0; }
// gpu copies of storage and perCUBarriers
//float elapsedTime = 0.0f;
unsigned int * perCUBarriers_d = NULL;
float * storage_d = NULL;
hipMalloc(&perCUBarriers_d, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
hipMalloc(&storage_d, sizeof(float) * numStorageLocs);
hipMemcpy(perCUBarriers_d, perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2), hipMemcpyHostToDevice);
hipMemcpy(storage_d, storage, sizeof(float) * numStorageLocs, hipMemcpyHostToDevice);
// lock variables
hipMutex_t spinMutex, faMutex, sleepMutex, eboMutex;
hipMutex_t spinMutex_uniq, faMutex_uniq, sleepMutex_uniq, eboMutex_uniq;
hipSemaphore_t spinSem1, eboSem1,
spinSem2, eboSem2,
spinSem10, eboSem10,
spinSem120, eboSem120;
hipSemaphore_t spinSem1_uniq, eboSem1_uniq,
spinSem2_uniq, eboSem2_uniq,
spinSem10_uniq, eboSem10_uniq,
spinSem120_uniq, eboSem120_uniq;
switch (syncPrim) {
case 0: // atomic tree barrier doesn't require any special fields to be
// created
printf("atomic_tree_barrier_%03d\n", NUM_ITERS); fflush(stdout);
break;
case 1: // atomic tree barrier with local exchange doesn't require any
// special fields to be created
printf("atomic_tree_barrier_localExch_%03d\n", NUM_ITERS); fflush(stdout);
break;
case 2: // lock-free tree barrier doesn't require any special fields to be
// created
printf("fbs_tree_barrier_%03d\n", NUM_ITERS); fflush(stdout);
break;
case 3: // lock-free barrier with local exchange doesn't require any
// special fields to be created
printf("fbs_tree_barrier_localExch_%03d\n", NUM_ITERS); fflush(stdout);
break;
case 4:
printf("spin_lock_mutex_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateSpin (&spinMutex, 0);
break;
case 5:
printf("ebo_mutex_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateEBO (&eboMutex, 1);
break;
case 6:
printf("sleeping_mutex_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateSleep (&sleepMutex, 2);
break;
case 7:
printf("fetchadd_mutex_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateFA (&faMutex, 3);
break;
case 8:
printf("spin_lock_sem_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem1, 0, 1, NUM_CU);
break;
case 9:
printf("spin_lock_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem2, 1, 2, NUM_CU);
break;
case 10:
printf("spin_lock_sem_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem10, 2, 10, NUM_CU);
break;
case 11:
printf("spin_lock_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem120, 3, 120, NUM_CU);
break;
case 12:
printf("ebo_sem_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem1, 4, 1, NUM_CU);
break;
case 13:
printf("ebo_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem2, 5, 2, NUM_CU);
break;
case 14:
printf("ebo_sem_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem10, 6, 10, NUM_CU);
break;
case 15:
printf("ebo_sem_%03d_%03d\n", 120, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem120, 7, 120, NUM_CU);
break;
// cases 16-19 reserved
case 16:
break;
case 17:
break;
case 18:
break;
case 19:
break;
case 20:
printf("spin_lock_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateSpin (&spinMutex_uniq, 4);
break;
case 21:
printf("ebo_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateEBO (&eboMutex_uniq, 5);
break;
case 22:
printf("sleeping_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateSleep (&sleepMutex_uniq, 6);
break;
case 23:
printf("fetchadd_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
hipMutexCreateFA (&faMutex_uniq, 7);
break;
case 24:
printf("spin_lock_sem_uniq_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem1_uniq, 12, 1, NUM_CU);
break;
case 25:
printf("spin_lock_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem2_uniq, 13, 2, NUM_CU);
break;
case 26:
printf("spin_lock_sem_uniq_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem10_uniq, 14, 10, NUM_CU);
break;
case 27:
printf("spin_lock_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateSpin (&spinSem120_uniq, 15, 120, NUM_CU);
break;
case 28:
printf("ebo_sem_uniq_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem1_uniq, 16, 1, NUM_CU);
break;
case 29:
printf("ebo_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem2_uniq, 17, 2, NUM_CU);
break;
case 30:
printf("ebo_sem_uniq_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem10_uniq, 18, 10, NUM_CU);
break;
case 31:
printf("ebo_sem_uniq_%03d_%03d\n", 120, NUM_ITERS); fflush(stdout);
hipSemaphoreCreateEBO (&eboSem120_uniq, 19, 120, NUM_CU);
break;
// cases 32-36 reserved
case 32:
break;
case 33:
break;
case 34:
break;
case 35:
break;
case 36:
break;
default:
fprintf(stderr, "ERROR: Trying to run synch prim #%u, but only 0-36 are "
"supported\n", syncPrim);
exit(-1);
break;
}
// # WGs must be <= maxBufferSize or the sleep mutex ring buffer won't work
if ((syncPrim == 6) || (syncPrim == 22)) {
assert(MAX_WGS <= cpuLockData->maxBufferSize);
}
// NOTE: region of interest begins here
hipDeviceSynchronize();
switch (syncPrim) {
case 0: // atomic tree barrier
invokeAtomicTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
break;
case 1: // atomic tree barrier with local exchange
invokeAtomicTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
break;
case 2: // lock-free barrier
invokeFBSTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
break;
case 3: // lock-free barrier with local exchange
invokeFBSTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
break;
case 4: // Spin Lock Mutex
invokeSpinLockMutex (spinMutex, storage_d, NUM_ITERS);
break;
case 5: // Spin Lock Mutex with backoff
invokeEBOMutex (eboMutex, storage_d, NUM_ITERS);
break;
case 6: // Sleeping Mutex
invokeSleepingMutex (sleepMutex, storage_d, NUM_ITERS);
break;
case 7: // fetch-and-add mutex
invokeFetchAndAddMutex(faMutex, storage_d, NUM_ITERS);
break;
case 8: // spin semaphore (1)
invokeSpinLockSemaphore(spinSem1, storage_d, 1, NUM_ITERS, numStorageLocs);
break;
case 9: // spin semaphore (2)
invokeSpinLockSemaphore(spinSem2, storage_d, 2, NUM_ITERS, numStorageLocs);
break;
case 10: // spin semaphore (10)
invokeSpinLockSemaphore(spinSem10, storage_d, 10, NUM_ITERS, numStorageLocs);
break;
case 11: // spin semaphore (120)
invokeSpinLockSemaphore(spinSem120, storage_d, 120, NUM_ITERS, numStorageLocs);
break;
case 12: // spin semaphore with backoff (1)
invokeEBOSemaphore(eboSem1, storage_d, 1, NUM_ITERS, numStorageLocs);
break;
case 13: // spin semaphore with backoff (2)
invokeEBOSemaphore(eboSem2, storage_d, 2, NUM_ITERS, numStorageLocs);
break;
case 14: // spin semaphore with backoff (10)
invokeEBOSemaphore(eboSem10, storage_d, 10, NUM_ITERS, numStorageLocs);
break;
case 15: // spin semaphore with backoff (120)
invokeEBOSemaphore(eboSem120, storage_d, 120, NUM_ITERS, numStorageLocs);
break;
// cases 16-19 reserved
case 16:
break;
case 17:
break;
case 18:
break;
case 19:
break;
case 20: // Spin Lock Mutex (uniq)
invokeSpinLockMutex_uniq (spinMutex_uniq, storage_d, NUM_ITERS);
break;
case 21: // Spin Lock Mutex with backoff (uniq)
invokeEBOMutex_uniq (eboMutex_uniq, storage_d, NUM_ITERS);
break;
case 22: // Sleeping Mutex (uniq)
invokeSleepingMutex_uniq (sleepMutex_uniq, storage_d, NUM_ITERS);
break;
case 23: // fetch-and-add mutex (uniq)
invokeFetchAndAddMutex_uniq(faMutex_uniq, storage_d, NUM_ITERS);
break;
case 24: // spin semaphore (1) (uniq)
invokeSpinLockSemaphore_uniq(spinSem1_uniq, storage_d, 1, NUM_ITERS);
break;
case 25: // spin semaphore (2) (uniq)
invokeSpinLockSemaphore_uniq(spinSem2_uniq, storage_d, 2, NUM_ITERS);
break;
case 26: // spin semaphore (10) (uniq)
invokeSpinLockSemaphore_uniq(spinSem10_uniq, storage_d, 10, NUM_ITERS);
break;
case 27: // spin semaphore (120) (uniq)
invokeSpinLockSemaphore_uniq(spinSem120_uniq, storage_d, 120, NUM_ITERS);
break;
case 28: // spin semaphore with backoff (1) (uniq)
invokeEBOSemaphore_uniq(eboSem1_uniq, storage_d, 1, NUM_ITERS);
break;
case 29: // spin semaphore with backoff (2) (uniq)
invokeEBOSemaphore_uniq(eboSem2_uniq, storage_d, 2, NUM_ITERS);
break;
case 30: // spin semaphore with backoff (10) (uniq)
invokeEBOSemaphore_uniq(eboSem10_uniq, storage_d, 10, NUM_ITERS);
break;
case 31: // spin semaphore with backoff (120) (uniq)
invokeEBOSemaphore_uniq(eboSem120_uniq, storage_d, 120, NUM_ITERS);
break;
// cases 32-36 reserved
case 32:
break;
case 33:
break;
case 34:
break;
case 35:
break;
case 36:
break;
default:
fprintf(stderr,
"ERROR: Trying to run synch prim #%u, but only 0-36 are "
"supported\n",
syncPrim);
exit(-1);
break;
}
// NOTE: Can end simulation here if we don't care about output checking
hipDeviceSynchronize();
// copy results back to compare to golden
hipMemcpy(storage, storage_d, sizeof(float) * numStorageLocs, hipMemcpyDeviceToHost);
// get golden results
float storageGolden[numStorageLocs];
int numLocsAccessed = 0, currLoc = 0;
// initialize
for (int i = 0; i < numStorageLocs; ++i) { storageGolden[i] = i; }
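/*
  The golden loops below walk indices from high to low so that each updated
  word reads the pre-update value of the word one cache line below it,
  mirroring the n = NUM_LDST-1 .. 0 ordering in the device-side accessData().
*/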
for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
{
for (int j = 0; j < NUM_ITERS; ++j)
{
/*
The barrier algorithms exchange data across CUs, so we need to perform
the exchanges in the golden code.
The barrier algorithms with local exchange also trade data across WGs
within a CU, so we need to perform both kinds of exchange in the golden
code.
*/
if (syncPrim < 4)
{
// Some kernels only access a fraction of the total # of locations,
// determine how many locations are accessed by each kernel here.
numLocsAccessed = (numWGs * numUniqLocsAccPerWG);
// first cache line of words aren't written to
for (int i = (numLocsAccessed-1); i >= 0; --i)
{
// every iteration of the critical section, the location being
// accessed is shifted by numUniqLocsAccPerWG
currLoc = (i + (j * numUniqLocsAccPerWG)) % numLocsAccessed;
accessData_golden(storageGolden, currLoc, numStorageLocs);
}
// local exchange versions do additional accesses when there are
// multiple WGs on a CU
if ((syncPrim == 1) || (syncPrim == 3))
{
if (numWGs_perCU > 1)
{
for (int i = (numLocsAccessed-1); i >= 0; --i)
{
// advance the current location by the number of unique locations
// accessed by a CU (mod the number of memory locations accessed)
currLoc = (i + (numWGs_perCU * numUniqLocsAccPerWG)) %
numLocsAccessed;
// every iteration of the critical section, the location being
// accessed is also shifted by numUniqLocsAccPerWG
currLoc = (currLoc + (j * numUniqLocsAccPerWG)) %
numLocsAccessed;
accessData_golden(storageGolden, currLoc, numStorageLocs);
}
}
}
}
/*
In the non-unique mutex microbenchmarks (4-7), all WGs on all CUs access
the same locations.
*/
else if ((syncPrim >= 4) && (syncPrim <= 7))
{
// need to iterate over the locations for each WG since all WGs
// access the same locations
for (int WG = 0; WG < numWGs; ++WG)
{
for (int i = (numUniqLocsAccPerWG-1); i >= 0; --i)
{
accessData_golden(storageGolden, i, numStorageLocs);
}
}
}
/*
In the non-unique semaphore microbenchmarks (8-19), 1 "writer" WG
per CU writes all the locations accessed by that CU (i.e.,
numUniqLocsAccPerWG * numWGs_perCU). Moreover, all writer WGs across
all CUs access the same locations.
*/
else if ((syncPrim <= 19) && (syncPrim >= 8))
{
int cuID = 0, perCU_wgID = 0;
const int numCU = ((numWGs < NUM_CU) ? numWGs : NUM_CU);
bool isWriter = false;
// need to iterate over the locations for each WG since all WGs
// access the same locations
for (int wg = 0; wg < numWGs; ++wg)
{
cuID = (wg % numCU);
perCU_wgID = (wg / numCU);
// which WG is writer varies per CU
isWriter = (perCU_wgID == (cuID % numWGs_perCU));
if (isWriter)
{
for (int k = 0; k < numWGs_perCU; ++k)
{
// first cache line of words aren't written to
for (int i = (numUniqLocsAccPerWG-1); i >= 0; --i)
{
/*
The locations the writer is writing are numUniqLocsAccPerWG
apart because the WGs are assigned in round-robin fashion.
Thus, need to shift the location accordingly.
*/
currLoc = (i + (k * numUniqLocsAccPerWG)) % numStorageLocs;
accessData_golden(storageGolden, currLoc, numStorageLocs);
}
}
}
}
}
/*
In the unique mutex microbenchmarks (20-23), all WGs on a CU access
the same data and the data accessed by each CU is unique.
*/
else if ((syncPrim <= 23) && (syncPrim >= 20))
{
// Some kernels only access a fraction of the total # of locations,
// determine how many locations are accessed by each kernel here.
numLocsAccessed = (numWGs * numUniqLocsAccPerWG);
// first cache line of words aren't written to
for (int i = (numLocsAccessed-1); i >= 0; --i)
{
/*
If this location would be accessed by a WG other than the first
WG on a CU, wrap around and access the same location as the
first WG on that CU -- this holds only for the mutexes, not for the
semaphores.
*/
currLoc = i % (NUM_CU * numUniqLocsAccPerWG);
accessData_golden(storageGolden, currLoc, numStorageLocs);
}
}
/*
In the unique semaphore microbenchmarks (24-35), 1 "writer" WG per
CU writes all the locations accessed by that CU, but each CU accesses
unique data. We model this behavior by accessing all of the data
accessed by all CUs, since this has the same effect (assuming same
number of WGs/CU).
*/
else
{
// Some kernels only access a fraction of the total # of locations,
// determine how many locations are accessed by each kernel here.
numLocsAccessed = (numWGs * numUniqLocsAccPerWG);
// first cache line of words aren't written to
for (int i = (numLocsAccessed-1); i >= 0; --i)
{
accessData_golden(storageGolden, i, numStorageLocs);
}
}
}
}
fprintf(stdout, "Comparing GPU results to golden results:\n");
unsigned int numErrors = 0;
// check the output values
for (int i = 0; i < numStorageLocs; ++i)
{
if (std::abs(storage[i] - storageGolden[i]) > 1E-5)
{
fprintf(stderr, "\tERROR: storage[%d] = %f, golden[%d] = %f\n", i,
storage[i], i, storageGolden[i]);
++numErrors;
}
}
if (numErrors > 0)
{
fprintf(stderr, "ERROR: %s has %u output errors\n", syncPrim_str,
numErrors);
exit(-1);
}
else { fprintf(stdout, "PASSED!\n"); }
// free arrays
hipLocksDestroy();
hipFree(storage_d);
hipFree(perCUBarriers_d);
free(storage);
free(perCUBarriers);
return 0;
}