#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cassert>
#include <cmath>
#include "hip/hip_runtime.h"
#include "hip_error.h"

#define MAX_BACKOFF 1024
#define NUM_WIS_PER_WG 64
#define MAD_MUL 1.1f
#define MAD_ADD 0.25f
#define NUM_WORDS_PER_CACHELINE 16
#define NUM_WIS_PER_QUARTERWAVE 16
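// NOTE: NUM_WORDS_PER_CACHELINE assumes a 64-byte cache line holding 16
// 4-byte floats, and NUM_WIS_PER_QUARTERWAVE assumes 64-WI wavefronts, a
// quarter of which (16 WIs) spans exactly one such cache line per access.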

// separate .h files
#include "hipLocksBarrier.h"
#include "hipLocksImpl.h"
#include "hipLocksMutex.h"
#include "hipLocksSemaphore.h"
// program globals
const int NUM_REPEATS = 10;
int NUM_LDST = 0;
int numWGs = 0;
// number of CUs our GPU has
int NUM_CU = 0;
int MAX_WGS = 0;

bool pageAlign = false;

/*
  Helper function to do data accesses for the golden-checking code.
*/
void accessData_golden(float * storageGolden, int currLoc, int numStorageLocs)
{
  /*
    If this location isn't on the first cache line accessed by its
    quarter-wavefront, update it -- each quarter-wavefront accesses
    (NUM_LDST + 1) cache lines.
  */
  if (currLoc % (NUM_WIS_PER_QUARTERWAVE * (NUM_LDST + 1)) >=
      NUM_WORDS_PER_CACHELINE)
  {
    assert((currLoc - NUM_WORDS_PER_CACHELINE) >= 0);
    assert(currLoc < numStorageLocs);
    // each location is dependent on the location accessed at the
    // same word on the previous cache line
    storageGolden[currLoc] =
        ((storageGolden[currLoc -
                        NUM_WORDS_PER_CACHELINE]/* * MAD_MUL*/) /*+ MAD_ADD*/);
  }
}
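/*
  Worked example, assuming NUM_LDST = 1: each quarter-wavefront then owns
  NUM_WIS_PER_QUARTERWAVE * (NUM_LDST + 1) = 32 words. For currLoc = 40,
  40 % 32 = 8 < 16, so word 40 sits on the first cache line of its region
  and is left untouched; for currLoc = 56, 56 % 32 = 24 >= 16, so word 56
  is updated from word 56 - 16 = 40 on the previous cache line.
*/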

/*
  Shared function that does the critical section data accesses for the barrier
  and mutex kernels.

  NOTE: kernels that access data differently (e.g., semaphores) should not call
  this function.
*/
inline __device__ void accessData(float * storage, int threadBaseLoc,
                                  int threadOffset, int NUM_LDST)
{
  // local variables
  int readLoc = 0, writeLoc = 0;

  for (int n = NUM_LDST-1; n >= 0; --n) {
    writeLoc = ((threadBaseLoc + n + 1) * NUM_WORDS_PER_CACHELINE) +
               threadOffset;
    readLoc = ((threadBaseLoc + n) * NUM_WORDS_PER_CACHELINE) + threadOffset;
    storage[writeLoc] = ((storage[readLoc]/* * MAD_MUL*/) /*+ MAD_ADD*/);
  }
}
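/*
  Worked example, assuming NUM_LDST = 2: the WI with tid = 17 gets
  threadBaseLoc = (17/16) * 3 = 3 and threadOffset = 17 % 16 = 1, so for
  n = 1, 0 it copies word (3+n)*16 + 1 into word (3+n+1)*16 + 1. Each store
  targets the same word offset one cache line beyond its load, which is
  exactly the dependence pattern accessData_golden() checks.
*/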

/*
  Helper function for semaphore writers. Although semaphore writers iterate
  over all of the locations accessed on a given CU, the logic for that varies
  and is done outside this helper, so it can simply call accessData().
*/
inline __device__ void accessData_semWr(float * storage, int threadBaseLoc,
                                        int threadOffset, int NUM_LDST)
{
  accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
}


/*
  Helper function for semaphore readers.
*/
inline __device__ void accessData_semRd(float * storage,
                                        volatile float * dummyArray,
                                        int threadBaseLoc,
                                        int threadOffset, int NUM_LDST)
{
  for (int n = NUM_LDST-1; n >= 0; --n) {
    dummyArray[hipThreadIdx_x] +=
        storage[((threadBaseLoc + n) * NUM_WORDS_PER_CACHELINE) +
                threadOffset];
    __syncthreads();
  }
}
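/*
  NOTE: dummyArray is declared volatile so the compiler cannot optimize the
  reader loads away as dead code; accumulating into the (shared-memory)
  array at the call sites keeps the loads real without adding global stores.
*/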

// performs a tree barrier. Each WG on a CU accesses unique data, then joins a
// local barrier; one WG from each CU then joins the global barrier
__global__ void kernelAtomicTreeBarrierUniq(float * storage,
                                            hipLockData_t * gpuLockData,
                                            unsigned int * perCUBarrierBuffers,
                                            const int ITERATIONS,
                                            const int NUM_LDST,
                                            const int NUM_CU,
                                            const int MAX_WGS)
{
  // local variables
  // thread 0 is master thread
  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
                               (hipThreadIdx_z == 0));
  // number of WGs joining the global barrier (at most NUM_CU; hipGridDim_x if
  // there are fewer WGs than CUs)
  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
                                     NUM_CU);
  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
  // all work groups on the same CU access unique locations because the
  // barrier can't ensure DRF between WGs
  int currWGID = hipBlockIdx_x;
  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  // determine if I'm WG 0 on my CU
  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numWGsAtBarr);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

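    // joinBarrier_helper (presumably from hipLocksBarrier.h) performs the
    // two-level join described above: a local atomic barrier among this
    // CU's WGs, after which one WG per CU joins the global barrier.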
    joinBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
                       numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
                       isMasterThread, MAX_WGS);

    // get a new thread ID by rotating amongst WGs -- +1 WG ID to shift to the
    // next CU's data
    currWGID = ((currWGID + 1) % hipGridDim_x);
    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  }
}

// like the tree barrier but also has WGs exchange work locally before joining
// the global barrier
__global__ void kernelAtomicTreeBarrierUniqLocalExch(float * storage,
                                                     hipLockData_t * gpuLockData,
                                                     unsigned int * perCUBarrierBuffers,
                                                     const int ITERATIONS,
                                                     const int NUM_LDST,
                                                     const int NUM_CU,
                                                     const int MAX_WGS)
{
  // local variables
  // thread 0 is master thread
  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
                               (hipThreadIdx_z == 0));
  // number of WGs joining the global barrier (at most NUM_CU; hipGridDim_x if
  // there are fewer WGs than CUs)
  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
                                     NUM_CU);
  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
  // all work groups on the same CU access unique locations because the
  // barrier can't ensure DRF between WGs
  int currWGID = hipBlockIdx_x;
  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  // determine if I'm WG 0 on my CU
  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numWGsAtBarr);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

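    // local exchange: shifting currWGID by numWGsAtBarr moves to the data of
    // the next WG resident on this same CU (assuming WGs are assigned to CUs
    // round-robin), so the second accessData() below touches a sibling WG's
    // locations after only a CU-local barrier.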
    // all WGs on this CU do a local barrier (if > 1 WG)
    if (numWGs_perCU > 1) {
      hipBarrierAtomicLocal(perCUBarrierBuffers, cuID, numWGs_perCU,
                            isMasterThread, MAX_WGS);

      // exchange data within the WGs on this CU, then do some more computation
      currWGID = ((currWGID + numWGsAtBarr) % hipGridDim_x);
      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));

      accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
    }

    joinBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
                       numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
                       isMasterThread, MAX_WGS);

    // get a new thread ID by rotating amongst WGs -- +1 WG ID to shift to the
    // next CU's data
    currWGID = ((currWGID + 1) % hipGridDim_x);
    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  }
}

// performs a tree barrier like above, but with a lock-free barrier
__global__ void kernelFBSTreeBarrierUniq(float * storage,
                                         hipLockData_t * gpuLockData,
                                         unsigned int * perCUBarrierBuffers,
                                         const int ITERATIONS,
                                         const int NUM_LDST,
                                         const int NUM_CU,
                                         const int MAX_WGS)
{
  // local variables
  // number of WGs joining the global barrier (at most NUM_CU; hipGridDim_x if
  // there are fewer WGs than CUs)
  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
                                     NUM_CU);
  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
  // all work groups on the same CU access unique locations because the
  // barrier can't ensure DRF between WGs
  int currWGID = hipBlockIdx_x;
  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  // determine if I'm WG 0 on my CU
  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x/numWGsAtBarr);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    joinLFBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
                         numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
                         gpuLockData->arrayStride, MAX_WGS);

    // get a new thread ID by rotating amongst WGs -- +1 WG ID to shift to the
    // next CU's data
    currWGID = ((currWGID + 1) % hipGridDim_x);
    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  }
}

// performs a tree barrier like above, but with a lock-free barrier, and has
// WGs exchange work locally before joining the global barrier
__global__ void kernelFBSTreeBarrierUniqLocalExch(float * storage,
                                                  hipLockData_t * gpuLockData,
                                                  unsigned int * perCUBarrierBuffers,
                                                  const int ITERATIONS,
                                                  const int NUM_LDST,
                                                  const int NUM_CU,
                                                  const int MAX_WGS)
{
  // local variables
  // number of WGs joining the global barrier (at most NUM_CU; hipGridDim_x if
  // there are fewer WGs than CUs)
  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
  // all work groups on the same CU access unique locations because the
  // barrier can't ensure DRF between WGs
  int currWGID = hipBlockIdx_x;
  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  // determine if I'm WG 0 on my CU
  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x/numWGsAtBarr);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    // all WGs on this CU do a local barrier (if > 1 WG per CU)
    if (numWGs_perCU > 1) {
      hipBarrierLocal(gpuLockData->barrierBuffers, numWGsAtBarr,
                      gpuLockData->arrayStride, perCUBarrierBuffers, cuID,
                      numWGs_perCU, perCU_WGID, false, MAX_WGS);

      // exchange data within the WGs on this CU and do some more computation
      currWGID = ((currWGID + numWGsAtBarr) % hipGridDim_x);
      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));

      accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
    }

    joinLFBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
                         numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
                         gpuLockData->arrayStride, MAX_WGS);

    // get a new thread ID by rotating amongst WGs -- +1 WG ID to shift to the
    // next CU's data
    currWGID = ((currWGID + 1) % hipGridDim_x);
    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  }
}

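// Sleeping mutex: hipMutexSleepLock appears to claim a slot in the mutex's
// ring buffer (mutexBuffers) via the tail pointer and return that slot; the
// WG then backs off until its slot is served. The returned location must be
// passed to the matching hipMutexSleepUnlock call to release the lock.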
__global__ void kernelSleepingMutex(hipMutex_t mutex, float * storage,
                                    hipLockData_t * gpuLockData,
                                    const int ITERATIONS, const int NUM_LDST,
                                    const int NUM_CU)
{
  // local variables
  // all work groups access the same locations (rely on release to get
  // ownership in time)
  const int tid = hipThreadIdx_x;
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  __shared__ int myRingBufferLoc; // tracks my WG's location in the ring buffer

  if (hipThreadIdx_x == 0) {
    myRingBufferLoc = -1; // initially I have no location
  }
  __syncthreads();

  for (int i = 0; i < ITERATIONS; ++i)
  {
    myRingBufferLoc = hipMutexSleepLock(mutex, gpuLockData->mutexBuffers,
        gpuLockData->mutexBufferTails, gpuLockData->maxBufferSize,
        gpuLockData->arrayStride, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexSleepUnlock(mutex, gpuLockData->mutexBuffers, myRingBufferLoc,
        gpuLockData->maxBufferSize, gpuLockData->arrayStride, NUM_CU);
  }
}

__global__ void kernelSleepingMutexUniq(hipMutex_t mutex, float * storage,
                                        hipLockData_t * gpuLockData,
                                        const int ITERATIONS,
                                        const int NUM_LDST, const int NUM_CU)
{
  // local variables
  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
  // all work groups on the same CU access the same locations
  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  __shared__ int myRingBufferLoc; // tracks my WG's location in the ring buffer

  if (hipThreadIdx_x == 0) {
    myRingBufferLoc = -1; // initially I have no location
  }
  __syncthreads();

  for (int i = 0; i < ITERATIONS; ++i)
  {
    myRingBufferLoc = hipMutexSleepLockLocal(mutex, cuID,
        gpuLockData->mutexBuffers, gpuLockData->mutexBufferTails,
        gpuLockData->maxBufferSize, gpuLockData->arrayStride, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexSleepUnlockLocal(mutex, cuID, gpuLockData->mutexBuffers,
        myRingBufferLoc, gpuLockData->maxBufferSize, gpuLockData->arrayStride,
        NUM_CU);
  }
}

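// Fetch-and-add (ticket-style) mutex: judging from the head/tail arguments,
// hipMutexFALock presumably takes a ticket by atomically incrementing the
// head and spins until the tail catches up, while hipMutexFAUnlock advances
// the tail -- giving FIFO ordering among contending WGs.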
__global__ void kernelFetchAndAddMutex(hipMutex_t mutex, float * storage,
                                       hipLockData_t * gpuLockData,
                                       const int ITERATIONS,
                                       const int NUM_LDST, const int NUM_CU)
{
  // local variables
  // all work groups access the same locations (rely on release to get
  // ownership in time)
  const int tid = hipThreadIdx_x;
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    hipMutexFALock(mutex, gpuLockData->mutexBufferHeads,
                   gpuLockData->mutexBufferTails, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexFAUnlock(mutex, gpuLockData->mutexBufferTails, NUM_CU);
  }
}

__global__ void kernelFetchAndAddMutexUniq(hipMutex_t mutex, float * storage,
                                           hipLockData_t * gpuLockData,
                                           const int ITERATIONS,
                                           const int NUM_LDST,
                                           const int NUM_CU)
{
  // local variables
  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
  // all work groups on the same CU access the same locations
  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    hipMutexFALockLocal(mutex, cuID, gpuLockData->mutexBufferHeads,
                        gpuLockData->mutexBufferTails, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexFAUnlockLocal(mutex, cuID, gpuLockData->mutexBufferTails, NUM_CU);
  }
}

__global__ void kernelSpinLockMutex(hipMutex_t mutex, float * storage,
                                    hipLockData_t * gpuLockData,
                                    const int ITERATIONS, const int NUM_LDST,
                                    const int NUM_CU)
{
  // local variables
  // all work groups access the same locations (rely on release to get
  // ownership in time)
  const int tid = hipThreadIdx_x;
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    hipMutexSpinLock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexSpinUnlock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
  }
}

__global__ void kernelSpinLockMutexUniq(hipMutex_t mutex, float * storage,
                                        hipLockData_t * gpuLockData,
                                        const int ITERATIONS,
                                        const int NUM_LDST, const int NUM_CU)
{
  // local variables
  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
  // all work groups on the same CU access the same locations
  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    hipMutexSpinLockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexSpinUnlockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
  }
}

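// EBO = exponential backoff: on a failed acquire, the WG presumably waits
// for an exponentially growing interval (capped by MAX_BACKOFF above) before
// retrying, trading some acquire latency for less contention traffic than
// pure spinning.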
__global__ void kernelEBOMutex(hipMutex_t mutex, float * storage,
                               hipLockData_t * gpuLockData,
                               const int ITERATIONS, const int NUM_LDST,
                               const int NUM_CU)
{
  // local variables
  // all work groups access the same locations (rely on release to get
  // ownership in time)
  const int tid = hipThreadIdx_x;
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    hipMutexEBOLock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexEBOUnlock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
  }
}

__global__ void kernelEBOMutexUniq(hipMutex_t mutex, float * storage,
                                   hipLockData_t * gpuLockData,
                                   const int ITERATIONS, const int NUM_LDST,
                                   const int NUM_CU)
{
  // local variables
  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
  // all work groups on the same CU access the same locations
  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  for (int i = 0; i < ITERATIONS; ++i)
  {
    hipMutexEBOLockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);

    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);

    hipMutexEBOUnlockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
  }
}

// All WGs on all CUs access the same data, with one writer and N-1 readers
// per CU.
__global__ void kernelSpinLockSemaphore(hipSemaphore_t sem,
                                        float * storage,
                                        hipLockData_t * gpuLockData,
                                        const unsigned int numStorageLocs,
                                        const int ITERATIONS,
                                        const int NUM_LDST,
                                        const int NUM_CU)
{
  // local variables
  const unsigned int maxSemCount =
      gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
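  // NOTE: this indexing assumes semaphoreBuffers reserves 4 unsigned ints
  // per CU for each semaphore, with the max count stored in the first word
  // of the semaphore's region ("(0 * 4)" selects CU 0's copy).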
  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
  // If there are fewer WGs than CUs, the math below must account for it; if
  // WGs >= NUM_CU, use NUM_CU.
  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
  // number of threads on each WG
  //const int numThrs_perCU = (hipBlockDim_x * numWGs_perCU);
  const int perCU_WGID = (hipBlockIdx_x / numCU);
  // rotate which WG is the writer
  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));

  // all work groups on the same CU access unique locations except the writer,
  // which writes all of the locations that all of the WGs access
  //int currWGID = hipBlockIdx_x;
  // the (reader) WGs on each CU access unique locations, but those same
  // locations are accessed by the reader WGs on all CUs
  int tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  // dummy array to hold the loads done in the readers
  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];

  for (int i = 0; i < ITERATIONS; ++i)
  {
    /*
      NOTE: There is a race here for entering the critical section. Most
      importantly, it means that at least one of the readers could win, and
      thus the readers may read before the writer has had a chance to write
      the data.
    */
    hipSemaphoreSpinWait(sem, isWriter, maxSemCount,
                         gpuLockData->semaphoreBuffers, NUM_CU);

    // writer writes all the data that the WGs on this CU access
    if (isWriter) {
      for (int j = 0; j < numWGs_perCU; ++j) {
        /*
          Update the writer's "location" so it writes to the locations that
          the readers will access (due to RR scheduling the next WG on this
          CU is numCU WGs away). Use the loop counter because the non-unique
          version writes the same locations on all CUs.
        */
        tid = ((j * hipBlockDim_x) + hipThreadIdx_x);
        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));

        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
      }
      // reset locations
      tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
    }
    // rest of WGs on this CU read the data written by each CU's writer WG
    else {
      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
                       NUM_LDST);
    }
    hipSemaphoreSpinPost(sem, isWriter, maxSemCount,
                         gpuLockData->semaphoreBuffers, NUM_CU);
  }
}

__global__ void kernelSpinLockSemaphoreUniq(hipSemaphore_t sem,
                                            float * storage,
                                            hipLockData_t * gpuLockData,
                                            const int ITERATIONS,
                                            const int NUM_LDST,
                                            const int NUM_CU)
{
  // local variables
  const unsigned int maxSemCount =
      gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
  // If there are fewer WGs than CUs, the math below must account for it; if
  // WGs >= NUM_CU, use NUM_CU.
  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
  const int cuID = (hipBlockIdx_x % numCU); // mod by # CUs to get CU ID
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
  const int perCU_WGID = (hipBlockIdx_x / numCU);
  // rotate which WG is the writer
  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));

  // all work groups on the same CU access unique locations except the writer,
  // which writes all of the locations that all of the WGs access
  int currWGID = hipBlockIdx_x;
  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  // dummy array to hold the loads done in the readers
  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];

  for (int i = 0; i < ITERATIONS; ++i)
  {
    /*
      NOTE: There is a race here for entering the critical section. Most
      importantly, it means that at least one of the readers could win, and
      thus the readers may read before the writer has had a chance to write
      the data.
    */
    hipSemaphoreSpinWaitLocal(sem, cuID, isWriter, maxSemCount,
                              gpuLockData->semaphoreBuffers, NUM_CU);

    // writer WG writes all the data that the WGs on this CU access
    if (isWriter) {
      for (int j = 0; j < numWGs_perCU; ++j) {
        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);

        /*
          update the writer's "location" so it writes to the locations that
          the readers will access (due to RR scheduling the next WG on this
          CU is numCU WGs away and < hipGridDim_x).

          NOTE: The first location the writer writes to is its own
          location(s). If the writer is not WG 0 on this CU, it may require
          wrapping around to WGs with smaller WG IDs.
        */
        currWGID = (currWGID + numCU) % hipGridDim_x;
        tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
      }
      // reset locations
      currWGID = hipBlockIdx_x;
      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
    }
    // rest of WGs on this CU read the data written by each CU's writer WG
    else {
      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
                       NUM_LDST);
    }
    hipSemaphoreSpinPostLocal(sem, cuID, isWriter, maxSemCount,
                              gpuLockData->semaphoreBuffers, NUM_CU);
  }
}

// All WGs on all CUs access the same data, with one writer and N-1 readers
// per CU.
__global__ void kernelEBOSemaphore(hipSemaphore_t sem, float * storage,
                                   hipLockData_t * gpuLockData,
                                   const unsigned int numStorageLocs,
                                   const int ITERATIONS, const int NUM_LDST,
                                   const int NUM_CU)
{
  // local variables
  const unsigned int maxSemCount =
      gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
  // If there are fewer WGs than CUs, the math below must account for it; if
  // WGs >= NUM_CU, use NUM_CU.
  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
  // number of threads on each WG
  //const int numThrs_perCU = (hipBlockDim_x * numWGs_perCU);
  const int perCU_WGID = (hipBlockIdx_x / numCU);
  // rotate which WG is the writer
  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));

  // all work groups on the same CU access unique locations except the writer,
  // which writes all of the locations that all of the WGs access
  //int currWGID = hipBlockIdx_x;
  // the (reader) WGs on each CU access unique locations, but those same
  // locations are accessed by the reader WGs on all CUs
  int tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);

  // dummy array to hold the loads done in the readers
  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];

  for (int i = 0; i < ITERATIONS; ++i)
  {
    /*
      NOTE: There is a race here for entering the critical section. Most
      importantly, it means that at least one of the readers could win, and
      thus the readers may read before the writer has had a chance to write
      the data.
    */
    hipSemaphoreEBOWait(sem, isWriter, maxSemCount,
                        gpuLockData->semaphoreBuffers, NUM_CU);

    // writer WG writes all the data that the WGs on this CU access
    if (isWriter) {
      for (int j = 0; j < numWGs_perCU; ++j) {
        /*
          Update the writer's "location" so it writes to the locations that
          the readers will access (due to RR scheduling the next WG on this
          CU is numCU WGs away). Use the loop counter because the non-unique
          version writes the same locations on all CUs.
        */
        tid = ((j * hipBlockDim_x) + hipThreadIdx_x);
        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));

        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
      }
      // reset locations
      tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
    }
    // rest of WGs on this CU read the data written by each CU's writer WG
    else {
      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
                       NUM_LDST);
    }
    hipSemaphoreEBOPost(sem, isWriter, maxSemCount,
                        gpuLockData->semaphoreBuffers, NUM_CU);
  }
}

__global__ void kernelEBOSemaphoreUniq(hipSemaphore_t sem, float * storage,
                                       hipLockData_t * gpuLockData,
                                       const int ITERATIONS,
                                       const int NUM_LDST,
                                       const int NUM_CU)
{
  // local variables
  const unsigned int maxSemCount =
      gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
  // If there are fewer WGs than CUs, the math below must account for it; if
  // WGs >= NUM_CU, use NUM_CU.
  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
  const int cuID = (hipBlockIdx_x % numCU); // mod by # CUs to get CU ID
  // given hipGridDim_x, we can figure out how many WGs are on our CU --
  // assume all CUs have an identical number of WGs
  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
  const int perCU_WGID = (hipBlockIdx_x / numCU);
  // rotate which WG is the writer
  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));

  // all work groups on the same CU access unique locations except the writer,
  // which writes all of the locations that all of the WGs access
  int currWGID = hipBlockIdx_x;
  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
  // want striding to happen across cache lines so that each WI in a
  // quarter-wavefront accesses sequential words
  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
  // dummy array to hold the loads done in the readers
  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];

  for (int i = 0; i < ITERATIONS; ++i)
  {
    /*
      NOTE: There is a race here for entering the critical section. Most
      importantly, it means that at least one of the readers could win, and
      thus the readers may read before the writer has had a chance to write
      the data.
    */
    hipSemaphoreEBOWaitLocal(sem, cuID, isWriter, maxSemCount,
                             gpuLockData->semaphoreBuffers, NUM_CU);

    // writer WG writes all the data that the WGs on this CU access
    if (isWriter) {
      for (int j = 0; j < numWGs_perCU; ++j) {
        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);

        /*
          update the writer's "location" so it writes to the locations that
          the readers will access (due to RR scheduling the next WG on this
          CU is numCU WGs away and < hipGridDim_x).

          NOTE: The first location the writer writes to is its own
          location(s). If the writer is not WG 0 on this CU, it may require
          wrapping around to WGs with smaller WG IDs.
        */
        currWGID = (currWGID + numCU) % hipGridDim_x;
        tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
      }
      // reset locations
      currWGID = hipBlockIdx_x;
      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
    }
    // rest of WGs on this CU read the data written by each CU's writer WG
    else {
      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
                       NUM_LDST);
    }
    hipSemaphoreEBOPostLocal(sem, cuID, isWriter, maxSemCount,
                             gpuLockData->semaphoreBuffers, NUM_CU);
  }
}

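// Host-side launch wrappers: each launches its kernel NUM_REPEATS times with
// NUM_WIS_PER_WG work-items per WG, and synchronizes after every launch so a
// repeat finishes completely before the next one begins.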
void invokeAtomicTreeBarrier(float * storage_d, unsigned int * perCUBarriers_d,
                             int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelAtomicTreeBarrierUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
        MAX_WGS);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelAtomicTreeBarrierUniq)");
  }
}

void invokeAtomicTreeBarrierLocalExch(float * storage_d,
                                      unsigned int * perCUBarriers_d,
                                      int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelAtomicTreeBarrierUniqLocalExch), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
        MAX_WGS);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr,
               "hipDeviceSynchronize (kernelAtomicTreeBarrierUniqLocalExch)");
  }
}

void invokeFBSTreeBarrier(float * storage_d, unsigned int * perCUBarriers_d,
                          int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFBSTreeBarrierUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
        MAX_WGS);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelFBSTreeBarrierUniq)");
  }
}

void invokeFBSTreeBarrierLocalExch(float * storage_d,
                                   unsigned int * perCUBarriers_d,
                                   int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFBSTreeBarrierUniqLocalExch), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
        MAX_WGS);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr,
               "hipDeviceSynchronize (kernelFBSTreeBarrierUniqLocalExch)");
  }
}

void invokeSpinLockMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockMutex)");
  }
}

void invokeSpinLockMutex_uniq(hipMutex_t mutex, float * storage_d,
                              int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockMutexUniq)");
  }
}

void invokeEBOMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelEBOMutex)");
  }
}

void invokeEBOMutex_uniq(hipMutex_t mutex, float * storage_d, int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelEBOMutexUniq)");
  }
}

void invokeSleepingMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSleepingMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelSleepingMutex)");
  }
}

void invokeSleepingMutex_uniq(hipMutex_t mutex, float * storage_d,
                              int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSleepingMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelSleepingMutexUniq)");
  }
}

void invokeFetchAndAddMutex(hipMutex_t mutex, float * storage_d, int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFetchAndAddMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelFetchAndAddMutex)");
  }
}

void invokeFetchAndAddMutex_uniq(hipMutex_t mutex, float * storage_d,
                                 int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFetchAndAddMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelFetchAndAddMutexUniq)");
  }
}

void invokeSpinLockSemaphore(hipSemaphore_t sem, float * storage_d,
                             const int maxVal,
                             int numIters, int numStorageLocs)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockSemaphore), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        sem, storage_d, cpuLockData, numStorageLocs, numIters, NUM_LDST,
        NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockSemaphore)");
  }
}

void invokeSpinLockSemaphore_uniq(hipSemaphore_t sem, float * storage_d,
                                  const int maxVal, int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockSemaphoreUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        sem, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockSemaphoreUniq)");
  }
}

void invokeEBOSemaphore(hipSemaphore_t sem, float * storage_d, const int maxVal,
                        int numIters, int numStorageLocs)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOSemaphore), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        sem, storage_d, cpuLockData, numStorageLocs, numIters, NUM_LDST,
        NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelEBOSemaphore)");
  }
}

void invokeEBOSemaphore_uniq(hipSemaphore_t sem, float * storage_d,
                             const int maxVal, int numIters)
{
  // local variable
  const int WGs = numWGs;

  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
  {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOSemaphoreUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
        sem, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);

    // Blocks until the device has completed all preceding requested
    // tasks (make sure that the device returned before continuing).
    hipError_t hipErr = hipDeviceSynchronize();
    checkError(hipErr, "hipDeviceSynchronize (kernelEBOSemaphoreUniq)");
  }
}

int main(int argc, char ** argv)
{
  if (argc != 5) {
    fprintf(stderr, "./allSyncPrims-1kernel <syncPrim> <numLdSt> <numWGs> "
            "<numCSIters>\n");
    fprintf(stderr, "where:\n");
    fprintf(stderr, "\t<syncPrim>: a string that represents which synchronization primitive to run.\n"
            "\t\tatomicTreeBarrUniq - Atomic Tree Barrier\n"
            "\t\tatomicTreeBarrUniqLocalExch - Atomic Tree Barrier with local exchange\n"
            "\t\tlfTreeBarrUniq - Lock-Free Tree Barrier\n"
            "\t\tlfTreeBarrUniqLocalExch - Lock-Free Tree Barrier with local exchange\n"
            "\t\tspinMutex - Spin Lock Mutex\n"
            "\t\tspinMutexEBO - Spin Lock Mutex with Backoff\n"
            "\t\tsleepMutex - Sleep Mutex\n"
            "\t\tfaMutex - Fetch-and-Add Mutex\n"
            "\t\tspinSem1 - Spin Semaphore (Max: 1)\n"
            "\t\tspinSem2 - Spin Semaphore (Max: 2)\n"
            "\t\tspinSem10 - Spin Semaphore (Max: 10)\n"
            "\t\tspinSem120 - Spin Semaphore (Max: 120)\n"
            "\t\tspinSemEBO1 - Spin Semaphore with Backoff (Max: 1)\n"
            "\t\tspinSemEBO2 - Spin Semaphore with Backoff (Max: 2)\n"
            "\t\tspinSemEBO10 - Spin Semaphore with Backoff (Max: 10)\n"
            "\t\tspinSemEBO120 - Spin Semaphore with Backoff (Max: 120)\n"
            "\t\tspinMutexUniq - Spin Lock Mutex -- accesses to unique locations per WG\n"
            "\t\tspinMutexEBOUniq - Spin Lock Mutex with Backoff -- accesses to unique locations per WG\n"
            "\t\tsleepMutexUniq - Sleep Mutex -- accesses to unique locations per WG\n"
            "\t\tfaMutexUniq - Fetch-and-Add Mutex -- accesses to unique locations per WG\n"
            "\t\tspinSemUniq1 - Spin Semaphore (Max: 1) -- accesses to unique locations per WG\n"
            "\t\tspinSemUniq2 - Spin Semaphore (Max: 2) -- accesses to unique locations per WG\n"
            "\t\tspinSemUniq10 - Spin Semaphore (Max: 10) -- accesses to unique locations per WG\n"
            "\t\tspinSemUniq120 - Spin Semaphore (Max: 120) -- accesses to unique locations per WG\n"
            "\t\tspinSemEBOUniq1 - Spin Semaphore with Backoff (Max: 1) -- accesses to unique locations per WG\n"
            "\t\tspinSemEBOUniq2 - Spin Semaphore with Backoff (Max: 2) -- accesses to unique locations per WG\n"
            "\t\tspinSemEBOUniq10 - Spin Semaphore with Backoff (Max: 10) -- accesses to unique locations per WG\n"
            "\t\tspinSemEBOUniq120 - Spin Semaphore with Backoff (Max: 120) -- accesses to unique locations per WG\n");
    fprintf(stderr, "\t<numLdSt>: the # of LDs and STs to do for each thread "
            "in the critical section.\n");
    fprintf(stderr, "\t<numWGs>: the # of WGs to execute (should be "
            "divisible by the number of CUs).\n");
    fprintf(stderr, "\t<numCSIters>: number of iterations of the critical "
            "section.\n");
    exit(-1);
  }
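
  // Example invocation (illustrative values only):
  //   ./allSyncPrims-1kernel spinMutexEBO 10 16 100
  // i.e., the EBO mutex with 10 loads/stores per WI in the critical section,
  // 16 WGs, and 100 critical-section iterations.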

  // boilerplate code to identify compute capability, # CUs, etc.
  int deviceCount;
  hipGetDeviceCount(&deviceCount);
  if (deviceCount == 0) {
    fprintf(stderr, "There is no device supporting HIP\n");
    exit(-1);
  }

  hipDeviceProp_t deviceProp;
  hipGetDeviceProperties(&deviceProp, 0);
  fprintf(stdout, "GPU Compute Capability: %d.%d\n", deviceProp.major,
          deviceProp.minor);
  if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {
    fprintf(stderr, "There is no HIP capable device\n");
    exit(-1);
  }

  NUM_CU = deviceProp.multiProcessorCount;
  const int maxWGPerCU = deviceProp.maxThreadsPerBlock/NUM_WIS_PER_WG;
  //assert(maxWGPerCU * NUM_WIS_PER_WG <=
  //       deviceProp.maxThreadsPerMultiProcessor);
  MAX_WGS = maxWGPerCU * NUM_CU;

  fprintf(stdout, "# CU: %d, Max Thrs/WG: %d, Max WG/CU: %d, Max # WG: %d\n",
          NUM_CU, deviceProp.maxThreadsPerBlock, maxWGPerCU, MAX_WGS);

  hipError_t hipErr = hipGetLastError();
  checkError(hipErr, "Begin");

  // parse input args
  const char * syncPrim_str = argv[1];
  NUM_LDST = atoi(argv[2]);
  numWGs = atoi(argv[3]);
  assert(numWGs <= MAX_WGS);
  const int NUM_ITERS = atoi(argv[4]);
  const int numWGs_perCU = (int)ceil((float)numWGs / NUM_CU);
  assert(numWGs_perCU > 0);

  unsigned int syncPrim = 9999;
  // set the syncPrim variable to the appropriate value based on the input
  // string for the microbenchmark
  if (strcmp(syncPrim_str, "atomicTreeBarrUniq") == 0) { syncPrim = 0; }
  else if (strcmp(syncPrim_str, "atomicTreeBarrUniqLocalExch") == 0) {
    syncPrim = 1;
  }
  else if (strcmp(syncPrim_str, "lfTreeBarrUniq") == 0) { syncPrim = 2; }
  else if (strcmp(syncPrim_str, "lfTreeBarrUniqLocalExch") == 0) {
    syncPrim = 3;
  }
  else if (strcmp(syncPrim_str, "spinMutex") == 0) { syncPrim = 4; }
  else if (strcmp(syncPrim_str, "spinMutexEBO") == 0) { syncPrim = 5; }
  else if (strcmp(syncPrim_str, "sleepMutex") == 0) { syncPrim = 6; }
  else if (strcmp(syncPrim_str, "faMutex") == 0) { syncPrim = 7; }
  else if (strcmp(syncPrim_str, "spinSem1") == 0) { syncPrim = 8; }
  else if (strcmp(syncPrim_str, "spinSem2") == 0) { syncPrim = 9; }
  else if (strcmp(syncPrim_str, "spinSem10") == 0) { syncPrim = 10; }
  else if (strcmp(syncPrim_str, "spinSem120") == 0) { syncPrim = 11; }
  else if (strcmp(syncPrim_str, "spinSemEBO1") == 0) { syncPrim = 12; }
  else if (strcmp(syncPrim_str, "spinSemEBO2") == 0) { syncPrim = 13; }
  else if (strcmp(syncPrim_str, "spinSemEBO10") == 0) { syncPrim = 14; }
  else if (strcmp(syncPrim_str, "spinSemEBO120") == 0) { syncPrim = 15; }
  // cases 16-19 reserved
  else if (strcmp(syncPrim_str, "spinMutexUniq") == 0) { syncPrim = 20; }
  else if (strcmp(syncPrim_str, "spinMutexEBOUniq") == 0) { syncPrim = 21; }
  else if (strcmp(syncPrim_str, "sleepMutexUniq") == 0) { syncPrim = 22; }
  else if (strcmp(syncPrim_str, "faMutexUniq") == 0) { syncPrim = 23; }
  else if (strcmp(syncPrim_str, "spinSemUniq1") == 0) { syncPrim = 24; }
  else if (strcmp(syncPrim_str, "spinSemUniq2") == 0) { syncPrim = 25; }
  else if (strcmp(syncPrim_str, "spinSemUniq10") == 0) { syncPrim = 26; }
  else if (strcmp(syncPrim_str, "spinSemUniq120") == 0) { syncPrim = 27; }
  else if (strcmp(syncPrim_str, "spinSemEBOUniq1") == 0) { syncPrim = 28; }
  else if (strcmp(syncPrim_str, "spinSemEBOUniq2") == 0) { syncPrim = 29; }
  else if (strcmp(syncPrim_str, "spinSemEBOUniq10") == 0) { syncPrim = 30; }
  else if (strcmp(syncPrim_str, "spinSemEBOUniq120") == 0) { syncPrim = 31; }
  // cases 32-36 reserved
  else
  {
    fprintf(stderr, "ERROR: Unknown synchronization primitive: %s\n",
            syncPrim_str);
    exit(-1);
  }

  // multiply the number of mutexes and semaphores by NUM_CU to
  // allow per-CU locks
  hipLocksInit(MAX_WGS, 8 * NUM_CU, 24 * NUM_CU, pageAlign, NUM_CU, NUM_REPEATS, NUM_ITERS);
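  // (The creation calls below use mutex IDs 0-7 and semaphore IDs up to 19,
  // so 8 mutexes and 24 semaphores per CU leave headroom for the reserved
  // cases.)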

  hipErr = hipGetLastError();
  checkError(hipErr, "After hipLocksInit");

  /*
    The barriers need a per-CU barrier that is not part of the global synch
    structure. In terms of size, for the lock-free barrier there are 2 arrays
    in here -- inVars and outVars. Each needs to be sized to hold the maximum
    number of WGs/CU, and each CU needs an array.

    The atomic barrier's per-CU synchronization fits inside the lock-free
    size requirements, so we can reuse the same locations.
  */
  unsigned int * perCUBarriers =
      (unsigned int *)malloc(sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));

  int numLocsMult = 0;
  // barriers and unique semaphores have numWGs WGs accessing unique locations
  if ((syncPrim < 4) ||
      ((syncPrim >= 24) && (syncPrim <= 35))) { numLocsMult = numWGs; }
  // in the non-unique mutex microbenchmarks, all WGs access the same
  // locations, so the multiplier is 1
  else if ((syncPrim >= 4) && (syncPrim <= 7)) { numLocsMult = 1; }
  // the non-unique semaphores have 1 writer and numWGs_perCU - 1 readers per
  // CU, so the multiplier is numWGs_perCU
  else if ((syncPrim >= 8) && (syncPrim <= 19)) { numLocsMult = numWGs_perCU; }
  // for the unique mutex microbenchmarks and condition variable, all WGs on
  // the same CU access the same data, so the multiplier is NUM_CU
  else if (((syncPrim >= 20) && (syncPrim <= 23)) ||
           (syncPrim == 36)) { numLocsMult = ((numWGs < NUM_CU) ?
                                              numWGs : NUM_CU); }
  else { // should never reach here
    fprintf(stderr, "ERROR: Unknown syncPrim: %u\n", syncPrim);
    exit(-1);
  }

  // each thread in a WG accesses NUM_LDST locations, but accesses per WI are
  // offset so that each subsequent access is dependent on the previous one --
  // thus we need an extra access per WI
  int numUniqLocsAccPerWG = (NUM_WIS_PER_WG * (NUM_LDST + 1));
  assert(numUniqLocsAccPerWG > 0);
  int numStorageLocs = (numLocsMult * numUniqLocsAccPerWG);
  assert(numStorageLocs > 0);
  float * storage = (float *)malloc(sizeof(float) * numStorageLocs);

  fprintf(stdout, "# WGs: %d, # Ld/St: %d, # Locs Mult: %d, "
          "# Uniq Locs/WG: %d, # Storage Locs: %d\n", numWGs, NUM_LDST,
          numLocsMult, numUniqLocsAccPerWG, numStorageLocs);
| |
| // initialize storage |
| for (int i = 0; i < numStorageLocs; ++i) { storage[i] = i; } |
| // initialize per-CU barriers to 0's |
| for (int i = 0; i < (NUM_CU * MAX_WGS * 2); ++i) { perCUBarriers[i] = 0; } |
| |
| // gpu copies of storage and perCUBarriers |
| //float elapsedTime = 0.0f; |
| unsigned int * perCUBarriers_d = NULL; |
| float * storage_d = NULL; |
| |
| hipMalloc(&perCUBarriers_d, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2)); |
| hipMalloc(&storage_d, sizeof(float) * numStorageLocs); |
| |
| hipMemcpy(perCUBarriers_d, perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2), hipMemcpyHostToDevice); |
| hipMemcpy(storage_d, storage, sizeof(float) * numStorageLocs, hipMemcpyHostToDevice); |
| |
| // lock variables |
| hipMutex_t spinMutex, faMutex, sleepMutex, eboMutex; |
| hipMutex_t spinMutex_uniq, faMutex_uniq, sleepMutex_uniq, eboMutex_uniq; |
| hipSemaphore_t spinSem1, eboSem1, |
| spinSem2, eboSem2, |
| spinSem10, eboSem10, |
| spinSem120, eboSem120; |
| hipSemaphore_t spinSem1_uniq, eboSem1_uniq, |
| spinSem2_uniq, eboSem2_uniq, |
| spinSem10_uniq, eboSem10_uniq, |
| spinSem120_uniq, eboSem120_uniq; |
| switch (syncPrim) { |
| case 0: // atomic tree barrier doesn't require any special fields to be |
| // created |
| printf("atomic_tree_barrier_%03d\n", NUM_ITERS); fflush(stdout); |
| break; |
| case 1: // atomic tree barrier with local exchange doesn't require any |
| // special fields to be created |
| printf("atomic_tree_barrier_localExch_%03d\n", NUM_ITERS); fflush(stdout); |
| break; |
| case 2: // lock-free tree barrier doesn't require any special fields to be |
| // created |
| printf("fbs_tree_barrier_%03d\n", NUM_ITERS); fflush(stdout); |
| break; |
| case 3: // lock-free barrier with local exchange doesn't require any |
| // special fields to be created |
| printf("fbs_tree_barrier_localExch_%03d\n", NUM_ITERS); fflush(stdout); |
| break; |
| case 4: |
| printf("spin_lock_mutex_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateSpin (&spinMutex, 0); |
| break; |
| case 5: |
| printf("ebo_mutex_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateEBO (&eboMutex, 1); |
| break; |
| case 6: |
| printf("sleeping_mutex_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateSleep (&sleepMutex, 2); |
| break; |
| case 7: |
| printf("fetchadd_mutex_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateFA (&faMutex, 3); |
| break; |
| case 8: |
| printf("spin_lock_sem_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem1, 0, 1, NUM_CU); |
| break; |
| case 9: |
| printf("spin_lock_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem2, 1, 2, NUM_CU); |
| break; |
| case 10: |
| printf("spin_lock_sem_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem10, 2, 10, NUM_CU); |
| break; |
| case 11: |
| printf("spin_lock_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem120, 3, 120, NUM_CU); |
| break; |
| case 12: |
| printf("ebo_sem_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem1, 4, 1, NUM_CU); |
| break; |
| case 13: |
| printf("ebo_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem2, 5, 2, NUM_CU); |
| break; |
| case 14: |
| printf("ebo_sem_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem10, 6, 10, NUM_CU); |
| break; |
| case 15: |
| printf("ebo_sem_%03d_%03d\n", 120, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem120, 7, 120, NUM_CU); |
| break; |
| // cases 16-19 reserved |
| case 16: |
| break; |
| case 17: |
| break; |
| case 18: |
| break; |
| case 19: |
| break; |
| case 20: |
| printf("spin_lock_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateSpin (&spinMutex_uniq, 4); |
| break; |
| case 21: |
| printf("ebo_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateEBO (&eboMutex_uniq, 5); |
| break; |
| case 22: |
| printf("sleeping_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateSleep (&sleepMutex_uniq, 6); |
| break; |
| case 23: |
| printf("fetchadd_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout); |
| hipMutexCreateFA (&faMutex_uniq, 7); |
| break; |
| case 24: |
| printf("spin_lock_sem_uniq_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem1_uniq, 12, 1, NUM_CU); |
| break; |
| case 25: |
| printf("spin_lock_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem2_uniq, 13, 2, NUM_CU); |
| break; |
| case 26: |
| printf("spin_lock_sem_uniq_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem10_uniq, 14, 10, NUM_CU); |
| break; |
| case 27: |
| printf("spin_lock_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateSpin (&spinSem120_uniq, 15, 120, NUM_CU); |
| break; |
| case 28: |
| printf("ebo_sem_uniq_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem1_uniq, 16, 1, NUM_CU); |
| break; |
| case 29: |
| printf("ebo_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem2_uniq, 17, 2, NUM_CU); |
| break; |
| case 30: |
| printf("ebo_sem_uniq_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem10_uniq, 18, 10, NUM_CU); |
| break; |
| case 31: |
| printf("ebo_sem_uniq_%03d_%03d\n", 120, NUM_ITERS); fflush(stdout); |
| hipSemaphoreCreateEBO (&eboSem120_uniq, 19, 120, NUM_CU); |
| break; |
| // cases 32-36 reserved |
| case 32: |
| break; |
| case 33: |
| break; |
| case 34: |
| break; |
| case 35: |
| break; |
| case 36: |
| break; |
| default: |
| fprintf(stderr, "ERROR: Trying to run synch prim #%u, but only 0-36 are " |
| "supported\n", syncPrim); |
| exit(-1); |
| break; |
| } |
| |
  // # WGs must be <= maxBufferSize or the sleep mutex ring buffer won't work
| if ((syncPrim == 6) || (syncPrim == 22)) { |
| assert(MAX_WGS <= cpuLockData->maxBufferSize); |
| } |
| |
| // NOTE: region of interest begins here |
| hipDeviceSynchronize(); |
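  /*
    Optional timing sketch (added, left commented out so the region of
    interest is unperturbed); it would pair with the unused elapsedTime
    variable above:

      hipEvent_t start, stop;
      hipEventCreate(&start);
      hipEventCreate(&stop);
      hipEventRecord(start, 0);
      // ... launch the selected kernel (switch below) ...
      hipEventRecord(stop, 0);
      hipEventSynchronize(stop);
      hipEventElapsedTime(&elapsedTime, start, stop);
  */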
| |
| switch (syncPrim) { |
| case 0: // atomic tree barrier |
| invokeAtomicTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS); |
| break; |
| case 1: // atomic tree barrier with local exchange |
| invokeAtomicTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS); |
| break; |
| case 2: // lock-free barrier |
| invokeFBSTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS); |
| break; |
| case 3: // lock-free barrier with local exchange |
| invokeFBSTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS); |
| break; |
| case 4: // Spin Lock Mutex |
| invokeSpinLockMutex (spinMutex, storage_d, NUM_ITERS); |
| break; |
| case 5: // Spin Lock Mutex with backoff |
| invokeEBOMutex (eboMutex, storage_d, NUM_ITERS); |
| break; |
| case 6: // Sleeping Mutex |
| invokeSleepingMutex (sleepMutex, storage_d, NUM_ITERS); |
| break; |
| case 7: // fetch-and-add mutex |
| invokeFetchAndAddMutex(faMutex, storage_d, NUM_ITERS); |
| break; |
| case 8: // spin semaphore (1) |
| invokeSpinLockSemaphore(spinSem1, storage_d, 1, NUM_ITERS, numStorageLocs); |
| break; |
| case 9: // spin semaphore (2) |
| invokeSpinLockSemaphore(spinSem2, storage_d, 2, NUM_ITERS, numStorageLocs); |
| break; |
| case 10: // spin semaphore (10) |
| invokeSpinLockSemaphore(spinSem10, storage_d, 10, NUM_ITERS, numStorageLocs); |
| break; |
| case 11: // spin semaphore (120) |
| invokeSpinLockSemaphore(spinSem120, storage_d, 120, NUM_ITERS, numStorageLocs); |
| break; |
| case 12: // spin semaphore with backoff (1) |
| invokeEBOSemaphore(eboSem1, storage_d, 1, NUM_ITERS, numStorageLocs); |
| break; |
| case 13: // spin semaphore with backoff (2) |
| invokeEBOSemaphore(eboSem2, storage_d, 2, NUM_ITERS, numStorageLocs); |
| break; |
| case 14: // spin semaphore with backoff (10) |
| invokeEBOSemaphore(eboSem10, storage_d, 10, NUM_ITERS, numStorageLocs); |
| break; |
| case 15: // spin semaphore with backoff (120) |
| invokeEBOSemaphore(eboSem120, storage_d, 120, NUM_ITERS, numStorageLocs); |
| break; |
| // cases 16-19 reserved |
| case 16: |
| break; |
| case 17: |
| break; |
| case 18: |
| break; |
| case 19: |
| break; |
| case 20: // Spin Lock Mutex (uniq) |
| invokeSpinLockMutex_uniq (spinMutex_uniq, storage_d, NUM_ITERS); |
| break; |
| case 21: // Spin Lock Mutex with backoff (uniq) |
| invokeEBOMutex_uniq (eboMutex_uniq, storage_d, NUM_ITERS); |
| break; |
| case 22: // Sleeping Mutex (uniq) |
| invokeSleepingMutex_uniq (sleepMutex_uniq, storage_d, NUM_ITERS); |
| break; |
| case 23: // fetch-and-add mutex (uniq) |
| invokeFetchAndAddMutex_uniq(faMutex_uniq, storage_d, NUM_ITERS); |
| break; |
| case 24: // spin semaphore (1) (uniq) |
| invokeSpinLockSemaphore_uniq(spinSem1_uniq, storage_d, 1, NUM_ITERS); |
| break; |
| case 25: // spin semaphore (2) (uniq) |
| invokeSpinLockSemaphore_uniq(spinSem2_uniq, storage_d, 2, NUM_ITERS); |
| break; |
| case 26: // spin semaphore (10) (uniq) |
| invokeSpinLockSemaphore_uniq(spinSem10_uniq, storage_d, 10, NUM_ITERS); |
| break; |
| case 27: // spin semaphore (120) (uniq) |
| invokeSpinLockSemaphore_uniq(spinSem120_uniq, storage_d, 120, NUM_ITERS); |
| break; |
| case 28: // spin semaphore with backoff (1) (uniq) |
| invokeEBOSemaphore_uniq(eboSem1_uniq, storage_d, 1, NUM_ITERS); |
| break; |
| case 29: // spin semaphore with backoff (2) (uniq) |
| invokeEBOSemaphore_uniq(eboSem2_uniq, storage_d, 2, NUM_ITERS); |
| break; |
| case 30: // spin semaphore with backoff (10) (uniq) |
| invokeEBOSemaphore_uniq(eboSem10_uniq, storage_d, 10, NUM_ITERS); |
| break; |
| case 31: // spin semaphore with backoff (120) (uniq) |
| invokeEBOSemaphore_uniq(eboSem120_uniq, storage_d, 120, NUM_ITERS); |
| break; |
| // cases 32-36 reserved |
| case 32: |
| break; |
| case 33: |
| break; |
| case 34: |
| break; |
| case 35: |
| break; |
| case 36: |
| break; |
| default: |
| fprintf(stderr, |
| "ERROR: Trying to run synch prim #%u, but only 0-36 are " |
| "supported\n", |
| syncPrim); |
| exit(-1); |
| break; |
| } |
| |
  // NOTE: The simulation can end here if output checking is not needed
| hipDeviceSynchronize(); |
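  // Launch check (added): a minimal sketch, assuming no existing wrapper;
  // surfaces any asynchronous kernel error before the results are compared.
  hipError_t kernErr = hipGetLastError();
  if (kernErr != hipSuccess) {
    fprintf(stderr, "ERROR: kernel execution failed: %s\n",
            hipGetErrorString(kernErr));
    exit(-1);
  }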
| |
| // copy results back to compare to golden |
  hipMemcpy(storage, storage_d, sizeof(float) * numStorageLocs,
            hipMemcpyDeviceToHost);
| |
| // get golden results |
  // heap-allocate the golden array (a VLA here isn't standard C++ and can
  // overflow the stack for large runs)
  float * storageGolden = (float *)malloc(sizeof(float) * numStorageLocs);
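  // Defensive check (added): mirror the host-side allocation check above.
  if (storageGolden == NULL) {
    fprintf(stderr, "ERROR: failed to allocate golden storage array\n");
    exit(-1);
  }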
| int numLocsAccessed = 0, currLoc = 0; |
| // initialize |
| for (int i = 0; i < numStorageLocs; ++i) { storageGolden[i] = i; } |
| |
| for (int repeat = 0; repeat < NUM_REPEATS; ++repeat) |
| { |
| for (int j = 0; j < NUM_ITERS; ++j) |
| { |
| /* |
| The barrier algorithms exchange data across CUs, so we need to perform |
| the exchanges in the golden code. |
| |
| The barrier algorithms with local exchange exchange data both across |
| CUs and across WGs within an CU, so need to perform both in the golden |
| code. |
| */ |
| if (syncPrim < 4) |
| { |
        // Some kernels access only a fraction of the total # of locations;
        // determine how many locations this kernel accesses here.
| numLocsAccessed = (numWGs * numUniqLocsAccPerWG); |
| |
        // the first cache line of words isn't written to
| for (int i = (numLocsAccessed-1); i >= 0; --i) |
| { |
| // every iteration of the critical section, the location being |
| // accessed is shifted by numUniqLocsAccPerWG |
| currLoc = (i + (j * numUniqLocsAccPerWG)) % numLocsAccessed; |
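          /*
            Worked example (illustrative values, added): with
            numUniqLocsAccPerWG = 128 and numLocsAccessed = 512, on iteration
            j = 1 location i = 500 maps to currLoc = (500 + 128) % 512 = 116.
          */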
| |
| accessData_golden(storageGolden, currLoc, numStorageLocs); |
| } |
| |
| // local exchange versions do additional accesses when there are |
| // multiple WGs on a CU |
| if ((syncPrim == 1) || (syncPrim == 3)) |
| { |
| if (numWGs_perCU > 1) |
| { |
| for (int i = (numLocsAccessed-1); i >= 0; --i) |
| { |
| // advance the current location by the number of unique locations |
| // accessed by a CU (mod the number of memory locations accessed) |
| currLoc = (i + (numWGs_perCU * numUniqLocsAccPerWG)) % |
| numLocsAccessed; |
| // every iteration of the critical section, the location being |
| // accessed is also shifted by numUniqLocsAccPerWG |
| currLoc = (currLoc + (j * numUniqLocsAccPerWG)) % |
| numLocsAccessed; |
| |
| accessData_golden(storageGolden, currLoc, numStorageLocs); |
| } |
| } |
| } |
| } |
| /* |
| In the non-unique mutex microbenchmarks (4-7), all WGs on all CUs access |
| the same locations. |
| */ |
| else if ((syncPrim >= 4) && (syncPrim <= 7)) |
| { |
| // need to iterate over the locations for each WG since all WGs |
| // access the same locations |
| for (int WG = 0; WG < numWGs; ++WG) |
| { |
| for (int i = (numUniqLocsAccPerWG-1); i >= 0; --i) |
| { |
| accessData_golden(storageGolden, i, numStorageLocs); |
| } |
| } |
| } |
| /* |
| In the non-unique semaphore microbenchmarks (8-19), 1 "writer" WG |
| per CU writes all the locations accessed by that CU (i.e., |
| numUniqLocsAccPerWG * numWGs_perCU). Moreover, all writer WGs across |
| all CUs access the same locations. |
| */ |
| else if ((syncPrim <= 19) && (syncPrim >= 8)) |
| { |
| int cuID = 0, perCU_wgID = 0; |
| const int numCU = ((numWGs < NUM_CU) ? numWGs : NUM_CU); |
| bool isWriter = false; |
| |
| // need to iterate over the locations for each WG since all WGs |
| // access the same locations |
| for (int wg = 0; wg < numWGs; ++wg) |
| { |
| cuID = (wg % numCU); |
| perCU_wgID = (wg / numCU); |
| // which WG is writer varies per CU |
| isWriter = (perCU_wgID == (cuID % numWGs_perCU)); |
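          /*
            Worked example (illustrative values, added): with numCU = 4 and
            numWGs_perCU = 2, wg 0 has cuID 0 and perCU_wgID 0, so it is a
            writer (0 == 0 % 2); wg 5 has cuID 1 and perCU_wgID 1, so it is a
            writer (1 == 1 % 2); wg 4 has cuID 0 and perCU_wgID 1, so it is
            not.
          */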
| |
| if (isWriter) |
| { |
| for (int k = 0; k < numWGs_perCU; ++k) |
| { |
              // the first cache line of words isn't written to
| for (int i = (numUniqLocsAccPerWG-1); i >= 0; --i) |
| { |
| /* |
| The locations the writer is writing are numUniqLocsAccPerWG |
| apart because the WGs are assigned in round-robin fashion. |
| Thus, need to shift the location accordingly. |
| */ |
| currLoc = (i + (k * numUniqLocsAccPerWG)) % numStorageLocs; |
| accessData_golden(storageGolden, currLoc, numStorageLocs); |
| } |
| } |
| } |
| } |
| } |
| /* |
| In the unique mutex microbenchmarks (20-23), all WGs on a CU access |
| the same data and the data accessed by each CU is unique. |
| */ |
| else if ((syncPrim <= 23) && (syncPrim >= 20)) |
| { |
        // Some kernels access only a fraction of the total # of locations;
        // determine how many locations this kernel accesses here.
| numLocsAccessed = (numWGs * numUniqLocsAccPerWG); |
| |
        // the first cache line of words isn't written to
| for (int i = (numLocsAccessed-1); i >= 0; --i) |
| { |
| /* |
| If this location would be accessed by a WG other than the first |
| WG on an CU, wraparound and access the same location as the |
| first WG on the CU -- only for the mutexes, for semaphores this |
| isn't true. |
| */ |
| currLoc = i % (NUM_CU * numUniqLocsAccPerWG); |
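          /*
            Worked example (illustrative values, added): with NUM_CU = 2 and
            numUniqLocsAccPerWG = 128, location i = 300 wraps around to
            currLoc = 300 % 256 = 44, one of the locations the first WG on
            that CU accesses.
          */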
| |
| accessData_golden(storageGolden, currLoc, numStorageLocs); |
| } |
| } |
| /* |
| In the unique semaphore microbenchmarks (24-35), 1 "writer" WG per |
| CU writes all the locations accessed by that CU, but each CU accesses |
| unique data. We model this behavior by accessing all of the data |
| accessed by all CUs, since this has the same effect (assuming same |
| number of WGs/CU). |
| */ |
| else |
| { |
        // Some kernels access only a fraction of the total # of locations;
        // determine how many locations this kernel accesses here.
| numLocsAccessed = (numWGs * numUniqLocsAccPerWG); |
| |
        // the first cache line of words isn't written to
| for (int i = (numLocsAccessed-1); i >= 0; --i) |
| { |
| accessData_golden(storageGolden, i, numStorageLocs); |
| } |
| } |
| } |
| } |
| |
| fprintf(stdout, "Comparing GPU results to golden results:\n"); |
| unsigned int numErrors = 0; |
| // check the output values |
| for (int i = 0; i < numStorageLocs; ++i) |
| { |
    // use fabs from math.h (std::abs float overloads require <cmath>)
    if (fabs(storage[i] - storageGolden[i]) > 1E-5)
| { |
| fprintf(stderr, "\tERROR: storage[%d] = %f, golden[%d] = %f\n", i, |
| storage[i], i, storageGolden[i]); |
| ++numErrors; |
| } |
| } |
| if (numErrors > 0) |
| { |
| fprintf(stderr, "ERROR: %s has %u output errors\n", syncPrim_str, |
| numErrors); |
| exit(-1); |
| } |
| else { fprintf(stdout, "PASSED!\n"); } |
| |
| // free arrays |
| hipLocksDestroy(); |
| hipFree(storage_d); |
| hipFree(perCUBarriers_d); |
  free(storage);
  free(storageGolden);
| free(perCUBarriers); |
| |
| return 0; |
| } |