| #ifndef __HIPLOCKSSEMAPHORESPIN_H__ |
| #define __HIPLOCKSSEMAPHORESPIN_H__ |
| |
| #include "hip/hip_runtime.h" |
| #include "hipLocks.h" |
| |
| inline __host__ hipError_t hipSemaphoreCreateSpin(hipSemaphore_t * const handle, |
| const int semaphoreNumber, |
| const unsigned int count, |
| const int NUM_CU) |
| { |
| // Here we set the initial value to be count+1, this allows us to do an |
| // atomicExch(sem, 0) and basically use the semaphore value as both a |
| // lock and a semaphore. |
| unsigned int initialValue = (count + 1), zero = 0; |
| *handle = semaphoreNumber; |
| for (int id = 0; id < NUM_CU; ++id) { // need to set these values for all CUs |
| hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4))]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice); |
| hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 1]), &zero, sizeof(zero), hipMemcpyHostToDevice); |
| hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 2]), &zero, sizeof(zero), hipMemcpyHostToDevice); |
| hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 3]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice); |
| } |
| return hipSuccess; |
| } |
| |
| inline __device__ bool hipSemaphoreSpinTryWait(const hipSemaphore_t sem, |
| const bool isWriter, |
| const unsigned int maxSemCount, |
| unsigned int * semaphoreBuffers, |
| const int NUM_CU) |
| { |
| const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && |
| hipThreadIdx_z == 0); |
| /* |
| Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each |
| CU uses 4 of them (current count, head, tail, max count). For the global |
| semaphore all CUs use semaphoreBuffers[sem * 4 * NUM_CU]. |
| */ |
| unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU); |
| unsigned int * const lock = currCount + 1; |
| /* |
| Reuse the tail for the "writers are waiting" flag since tail is unused. |
| |
| For now just use to indicate that at least 1 writer is waiting instead of |
| a count to make sure that readers aren't totally starved out until all the |
| writers are done. |
| */ |
| unsigned int * const writerWaiting = currCount + 2; |
| __shared__ bool acq1, acq2; |
| |
| __syncthreads(); |
| if (isMasterThread) |
| { |
| acq1 = false; |
| // try to acquire the sem head "lock" |
| if (atomicCAS(lock, 0, 1) == 0) { |
| // atomicCAS acts as a load acquire, need TF to enforce ordering |
| __threadfence(); |
| acq1 = true; |
| } |
| } |
| __syncthreads(); |
| |
| if (!acq1) { return false; } // return if we couldn't acquire the lock |
| if (isMasterThread) |
| { |
| acq2 = false; |
| /* |
| NOTE: currCount is only accessed by 1 WG at a time and has a lock around |
| it, so we can safely access it as a regular data access instead of with |
| atomics. |
| */ |
| unsigned int currSemCount = currCount[0]; |
| |
| if (isWriter) { |
| // writer needs the count to be == maxSemCount to enter the critical |
| // section (otherwise there are readers in the critical section) |
| if (currSemCount == maxSemCount) { acq2 = true; } |
| } else { |
| // if there is a writer waiting, readers aren't allowed to enter the |
| // critical section |
| if (writerWaiting[0] == 0) { |
| // readers need count > 1 to enter critical section (otherwise semaphore |
| // is full) |
| if (currSemCount > 1) { acq2 = true; } |
| } |
| } |
| } |
| __syncthreads(); |
| |
| if (!acq2) // release the sem head "lock" since the semaphore was full |
| { |
| // writers set a flag to note that they are waiting so more readers don't |
| // join after the writer started waiting |
| if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ } |
| |
| if (isMasterThread) { |
| // atomicExch acts as a store release, need TF to enforce ordering |
| __threadfence(); |
| atomicExch(lock, 0); |
| } |
| __syncthreads(); |
| return false; |
| } |
| __syncthreads(); |
| |
| if (isMasterThread) { |
| /* |
| NOTE: currCount is only accessed by 1 WG at a time and has a lock around |
| it, so we can safely access it as a regular data access instead of with |
| atomics. |
| */ |
| if (isWriter) { |
| /* |
| writer decrements the current count of the semaphore by the max to |
| ensure that no one else can enter the critical section while it's |
| writing. |
| */ |
| currCount[0] -= maxSemCount; |
| |
| // writers also need to unset the "writer is waiting" flag |
| writerWaiting[0] = 0; |
| } else { |
| // readers decrement the current count of the semaphore by 1 so other |
| // readers can also read the data (but not the writers since they needs |
| // the entire CS). |
| --currCount[0]; |
| } |
| |
| // atomicExch acts as a store release, need TF to enforce ordering |
| __threadfence(); |
| // now that we've updated the semaphore count can release the lock |
| atomicExch(lock, 0); |
| } |
| __syncthreads(); |
| |
| return true; |
| } |
| |
| inline __device__ void hipSemaphoreSpinWait(const hipSemaphore_t sem, |
| const bool isWriter, |
| const unsigned int maxSemCount, |
| unsigned int * semaphoreBuffers, |
| const int NUM_CU) |
| { |
| while (!hipSemaphoreSpinTryWait(sem, isWriter, maxSemCount, semaphoreBuffers, NUM_CU)) |
| { |
| __syncthreads(); |
| } |
| } |
| |
| inline __device__ void hipSemaphoreSpinPost(const hipSemaphore_t sem, |
| const bool isWriter, |
| const unsigned int maxSemCount, |
| unsigned int * semaphoreBuffers, |
| const int NUM_CU) |
| { |
| const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && |
| hipThreadIdx_z == 0); |
| /* |
| Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each |
| CU uses 4 of them (current count, head, tail, max count). For the global |
| semaphore use semaphoreBuffers[sem * 4 * NUM_CU]. |
| */ |
| unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU); |
| unsigned int * const lock = currCount + 1; |
| __shared__ bool acquired; |
| |
| if (isMasterThread) { acquired = false; } |
| __syncthreads(); |
| |
| while (!acquired) |
| { |
| if (isMasterThread) |
| { |
| /* |
| NOTE: This CAS will trigger an invalidation since we overload CAS's. |
| Since most of the data in the local critical section is written, it |
| hopefully won't affect performance too much. |
| */ |
| // try to acquire sem head lock |
| if (atomicCAS(lock, 0, 1) == 0) { |
| // atomicCAS acts as a load acquire, need TF to enforce ordering |
| __threadfence(); |
| acquired = true; |
| } else { |
| acquired = false; |
| } |
| } |
| __syncthreads(); |
| } |
| __syncthreads(); |
| |
| if (isMasterThread) { |
| /* |
| NOTE: currCount is only accessed by 1 WG at a time and has a lock around |
| it, so we can safely access it as a regular data access instead of with |
| atomics. |
| */ |
| if (isWriter) { |
| // writers add the max value to the semaphore to allow the readers to |
| // start accessing the critical section. |
| currCount[0] += maxSemCount; |
| } else { |
| // readers add 1 to the semaphore |
| ++currCount[0]; |
| } |
| |
| // atomicExch acts as a store release, need TF to enforce ordering |
| __threadfence(); |
| // now that we've updated the semaphore count can release the lock |
| atomicExch(lock, 0); |
| } |
| __syncthreads(); |
| } |
| |
| // same wait algorithm but with local scope and per-CU synchronization |
| inline __device__ bool hipSemaphoreSpinTryWaitLocal(const hipSemaphore_t sem, |
| const unsigned int cuID, |
| const bool isWriter, |
| const unsigned int maxSemCount, |
| unsigned int * semaphoreBuffers, |
| const int NUM_CU) |
| { |
| const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && |
| hipThreadIdx_z == 0); |
| // Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each |
| // CU gets 4 of them (current count, head, tail, max count). So CU 0 starts |
| // at semaphoreBuffers[sem * 4 * NUM_CU]. |
| unsigned int * const currCount = semaphoreBuffers + |
| ((sem * 4 * NUM_CU) + (cuID * 4)); |
| unsigned int * const lock = currCount + 1; |
| /* |
| Reuse the tail for the "writers are waiting" flag since tail is unused. |
| |
| For now just use to indicate that at least 1 writer is waiting instead of |
| a count to make sure that readers aren't totally starved out until all the |
| writers are done. |
| */ |
| unsigned int * const writerWaiting = currCount + 2; |
| __shared__ bool acq1, acq2; |
| |
| __syncthreads(); |
| if (isMasterThread) |
| { |
| acq1 = false; |
| // try to acquire the sem head "lock" |
| if (atomicCAS(lock, 0, 1) == 0) { |
| // atomicCAS acts as a load acquire, need TF to enforce ordering locally |
| __threadfence_block(); |
| acq1 = true; |
| } |
| } |
| __syncthreads(); |
| |
| if (!acq1) { return false; } // return if we couldn't acquire the lock |
| if (isMasterThread) |
| { |
| acq2 = false; |
| /* |
| NOTE: currCount is only accessed by 1 WG at a time and has a lock around |
| it, so we can safely access it as a regular data access instead of with |
| atomics. |
| */ |
| unsigned int currSemCount = currCount[0]; |
| |
| if (isWriter) { |
| // writer needs the count to be == maxSemCount to enter the critical |
| // section (otherwise there are readers in the critical section) |
| if (currSemCount == maxSemCount) { acq2 = true; } |
| } else { |
| // if there is a writer waiting, readers aren't allowed to enter the |
| // critical section |
| if (writerWaiting[0] == 0) { |
| // readers need count > 1 to enter critical section (otherwise semaphore |
| // is full) |
| if (currSemCount > 1) { acq2 = true; } |
| } |
| } |
| } |
| __syncthreads(); |
| |
| if (!acq2) // release the sem head "lock" since the semaphore was full |
| { |
| // writers set a flag to note that they are waiting so more readers don't |
| // join after the writer started waiting |
| if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ } |
| |
| if (isMasterThread) { |
| // atomicExch acts as a store release, need TF to enforce ordering locally |
| __threadfence_block(); |
| atomicExch(lock, 0); |
| } |
| __syncthreads(); |
| return false; |
| } |
| __syncthreads(); |
| |
| if (isMasterThread) { |
| /* |
| NOTE: currCount is only accessed by 1 WG at a time and has a lock around |
| it, so we can safely access it as a regular data access instead of with |
| atomics. |
| */ |
| if (isWriter) { |
| /* |
| writer decrements the current count of the semaphore by the max to |
| ensure that no one else can enter the critical section while it's |
| writing. |
| */ |
| currCount[0] -= maxSemCount; |
| |
| // writers also need to unset the "writer is waiting" flag |
| writerWaiting[0] = 0; |
| } else { |
| /* |
| readers decrement the current count of the semaphore by 1 so other |
| readers can also read the data (but not the writers since they needs |
| the entire CS). |
| */ |
| --currCount[0]; |
| } |
| |
| // atomicExch acts as a store release, need TF to enforce ordering locally |
| __threadfence_block(); |
| // now that we've updated the semaphore count can release the lock |
| atomicExch(lock, 0); |
| } |
| __syncthreads(); |
| |
| return true; |
| } |
| |
| inline __device__ void hipSemaphoreSpinWaitLocal(const hipSemaphore_t sem, |
| const unsigned int cuID, |
| const bool isWriter, |
| const unsigned int maxSemCount, |
| unsigned int * semaphoreBuffers, |
| const int NUM_CU) |
| { |
| while (!hipSemaphoreSpinTryWaitLocal(sem, cuID, isWriter, maxSemCount, semaphoreBuffers, NUM_CU)) |
| { |
| __syncthreads(); |
| } |
| } |
| |
| inline __device__ void hipSemaphoreSpinPostLocal(const hipSemaphore_t sem, |
| const unsigned int cuID, |
| const bool isWriter, |
| const unsigned int maxSemCount, |
| unsigned int * semaphoreBuffers, |
| const int NUM_CU) |
| { |
| bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && hipThreadIdx_z == 0); |
| // Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each |
| // CU gets 4 of them. So CU 0 starts at semaphoreBuffers[sem * 4 * NUM_CU]. |
| unsigned int * const currCount = semaphoreBuffers + |
| ((sem * 4 * NUM_CU) + (cuID * 4)); |
| unsigned int * const lock = currCount + 1; |
| __shared__ bool acquired; |
| |
| if (isMasterThread) { acquired = false; } |
| __syncthreads(); |
| |
| while (!acquired) |
| { |
| if (isMasterThread) |
| { |
| /* |
| NOTE: This CAS will trigger an invalidation since we overload CAS's. |
| Since most of the data in the local critical section is written, it |
| hopefully won't affect performance too much. |
| */ |
| // try to acquire sem head lock |
| if (atomicCAS(lock, 0, 1) == 0) { |
| // atomicCAS acts as a load acquire, need TF to enforce ordering locally |
| __threadfence_block(); |
| acquired = true; |
| } else { |
| acquired = false; |
| } |
| } |
| __syncthreads(); |
| } |
| __syncthreads(); |
| |
| if (isMasterThread) { |
| /* |
| NOTE: currCount is only accessed by 1 WG at a time and has a lock around |
| it, so we can safely access it as a regular data access instead of with |
| atomics. |
| */ |
| if (isWriter) { |
| // writers add the max value to the semaphore to allow the readers to |
| // start accessing the critical section. |
| currCount[0] += maxSemCount; |
| } else { |
| // readers add 1 to the semaphore |
| ++currCount[0]; |
| } |
| |
| // atomicExch acts as a store release, need TF to enforce ordering locally |
| __threadfence_block(); |
| // now that we've updated the semaphore count can release the lock |
| atomicExch(lock, 0); |
| } |
| __syncthreads(); |
| } |
| |
| #endif // #ifndef __HIPLOCKSSEMAPHORESPIN_H__ |