src/heterosync/src/hipLocksSemaphoreEBO.h - public/gem5-resources - Git at Google

 #ifndef __HIPSEMAPHOREEBO_H__
 #define __HIPSEMAPHOREEBO_H__

 #include "hip/hip_runtime.h"
 #include "hipLocks.h"

 inline __host__ hipError_t hipSemaphoreCreateEBO(hipSemaphore_t * const handle,
                                                  const int semaphoreNumber,
                                                  const unsigned int count,
                                                  const int NUM_CU)
 {
   // Here we set the initial value to be count+1, this allows us to do an
   // atomicExch(sem, 0) and basically use the semaphore value as both a
   // lock and a semaphore.
   unsigned int initialValue = (count + 1), zero = 0;
   *handle = semaphoreNumber;
   for (int id = 0; id < NUM_CU; ++id) { // need to set these values for all CUs
     hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4))]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
     hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 1]), &zero, sizeof(zero), hipMemcpyHostToDevice);
     hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 2]), &zero, sizeof(zero), hipMemcpyHostToDevice);
     hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 3]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
   }
   return hipSuccess;
 }

 inline __device__ bool hipSemaphoreEBOTryWait(const hipSemaphore_t sem,
                                               const bool isWriter,
                                               const unsigned int maxSemCount,
                                               unsigned int * semaphoreBuffers,
                                               const int NUM_CU)
 {
   const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                                hipThreadIdx_z == 0);
   /*
     Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
     CU uses 4 of them (current count, head, tail, max count).  For the global
     semaphore all CUs use semaphoreBuffers[sem * 4 * NUM_CU].
   */
   unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
   unsigned int * const lock = currCount + 1;
   /*
     Reuse the tail for the "writers are waiting" flag since tail is unused.

     For now just use to indicate that at least 1 writer is waiting instead of
     a count to make sure that readers aren't totally starved out until all the
     writers are done.
   */
   unsigned int * const writerWaiting = currCount + 2;
   __shared__ bool acq1, acq2;

   __syncthreads();
   if (isMasterThread)
   {
     acq1 = false;
     // try to acquire the sem head "lock"
     if (atomicCAS(lock, 0, 1) == 0) {
       // atomicCAS acts as a load acquire, need TF to enforce ordering
       __threadfence();
       acq1 = true;
     }
   }
   __syncthreads();

   if (!acq1) { return false; } // return if we couldn't acquire the lock
   if (isMasterThread)
   {
     acq2 = false;
     /*
       NOTE: currCount is only accessed by 1 WG at a time and has a lock around
       it, so we can safely access it as a regular data access instead of with
       atomics.
     */
     unsigned int currSemCount = currCount[0];

     if (isWriter) {
       // writer needs the count to be == maxSemCount to enter the critical
       // section (otherwise there are readers in the critical section)
       if (currSemCount == maxSemCount) { acq2 = true; }
     } else {
       // if there is a writer waiting, readers aren't allowed to enter the
       // critical section
       if (writerWaiting[0] == 0) {
         // readers need count > 1 to enter critical section (otherwise semaphore
         // is full)
         if (currSemCount > 1) { acq2 = true; }
       }
     }
   }
   __syncthreads();

   if (!acq2) // release the sem head "lock" since the semaphore was full
   {
     // writers set a flag to note that they are waiting so more readers don't
     // join after the writer started waiting
     if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }

     if (isMasterThread) {
       // atomicExch acts as a store release, need TF to enforce ordering
       __threadfence();
       atomicExch(lock, 0);
     }
     __syncthreads();
     return false;
   }
   __syncthreads();

   if (isMasterThread) {
     /*
       NOTE: currCount is only accessed by 1 WG at a time and has a lock around
       it, so we can safely access it as a regular data access instead of with
       atomics.
     */
     if (isWriter) {
       /*
         writer decrements the current count of the semaphore by the max to
         ensure that no one else can enter the critical section while it's
         writing.
       */
       currCount[0] -= maxSemCount;

       // writers also need to unset the "writer is waiting" flag
       writerWaiting[0] = 0;
     } else {
       /*
         readers decrement the current count of the semaphore by 1 so other
         readers can also read the data (but not the writers since they needs
         the entire CS).
       */
       --currCount[0]; //atomicSub(currCount, 1);
     }

     // atomicExch acts as a store release, need TF to enforce ordering
     __threadfence();
     // now that we've updated the semaphore count can release the lock
     atomicExch(lock, 0);
   }
   __syncthreads();

   return true;
 }

 inline __device__ void hipSemaphoreEBOWait(const hipSemaphore_t sem,
                                            const bool isWriter,
                                            const unsigned int maxSemCount,
                                            unsigned int * semaphoreBuffers,
                                            const int NUM_CU)
 {
   __shared__ int backoff;
   const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                                hipThreadIdx_z == 0);

   if (isMasterThread)
   {
     backoff = 1;
   }
   __syncthreads();

   while (!hipSemaphoreEBOTryWait(sem, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
   {
     __syncthreads();
     if (isMasterThread)
     {
       // if we failed to enter the semaphore, wait for a little while before
       // trying again
       //sleepFunc(backoff);
       for (int j = 0; j < backoff; ++j) { ; }
       /*
         for writers increse backoff a lot because failing means readers are in
         the CS currently -- most important for non-unique because all WGs on
         all CUs are going for the same semaphore.
       */
       if (isWriter) {
         // (capped) exponential backoff
         backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
       }
       else { backoff += 5; /* small, linear backoff increase for readers */ }
     }
     __syncthreads();
   }
   __syncthreads();
 }

 inline __device__ void hipSemaphoreEBOPost(const hipSemaphore_t sem,
                                            const bool isWriter,
                                            const unsigned int maxSemCount,
                                            unsigned int * semaphoreBuffers,
                                            const int NUM_CU)
 {
   const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                                hipThreadIdx_z == 0);
   /*
     Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
     CU uses 4 of them (current count, head, tail, max count).  For the global
     semaphore use semaphoreBuffers[sem * 4 * NUM_CU].
   */
   unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
   unsigned int * const lock = currCount + 1;
   __shared__ bool acquired;

   if (isMasterThread) { acquired = false; }
   __syncthreads();

   while (!acquired)
   {
     __syncthreads();
     if (isMasterThread)
     {
       // try to acquire sem head "lock"
       if (atomicCAS(lock, 0, 1) == 0) {
         // atomicCAS acts as a load acquire, need TF to enforce ordering
         __threadfence();
         acquired = true;
       } else {
         acquired = false;
       }
     }
     __syncthreads();
   }

   if (isMasterThread) {
     /*
       NOTE: currCount is only accessed by 1 WG at a time and has a lock around
       it, so we can safely access it as a regular data access instead of with
       atomics.
     */
     if (isWriter) {
       // writers add the max value to the semaphore to allow the readers to
       // start accessing the critical section.
       currCount[0] += maxSemCount;
     } else {
       ++currCount[0]; // readers add 1 to the semaphore
     }

     // atomicExch acts as a store release, need TF to enforce ordering
     __threadfence();
     // now that we've updated the semaphore count can release the lock
     atomicExch(lock, 0);
   }
   __syncthreads();
 }

 // same wait algorithm but with local scope and per-CU synchronization
 inline __device__ bool hipSemaphoreEBOTryWaitLocal(const hipSemaphore_t sem,
                                                    const unsigned int cuID,
                                                    const bool isWriter,
                                                    const unsigned int maxSemCount,
                                                    unsigned int * semaphoreBuffers,
                                                    const int NUM_CU)
 {
   const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                                hipThreadIdx_z == 0);
   /*
     Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
     CU gets 4 of them (current count, head, tail, max count).  So CU 0 starts
     at semaphoreBuffers[sem * 4 * NUM_CU].
   */
   unsigned int * const currCount = semaphoreBuffers + ((sem * 4 * NUM_CU) +
                                                        (cuID * 4));
   unsigned int * const lock = currCount + 1;
   /*
     Reuse the tail for the "writers are waiting" flag since tail is unused.

     For now just use to indicate that at least 1 writer is waiting instead of
     a count to make sure that readers aren't totally starved out until all the
     writers are done.
   */
   unsigned int * const writerWaiting = currCount + 2;
   __shared__ bool acq1, acq2;

   __syncthreads();
   if (isMasterThread)
   {
     acq1 = false;
     // try to acquire the sem head "lock"
     if (atomicCAS(lock, 0, 1) == 0) {
       // atomicCAS acts as a load acquire, need TF to enforce ordering locally
       __threadfence_block();
       acq1 = true;
     }
   }
   __syncthreads();

   if (!acq1) { return false; } // return if we couldn't acquire the lock
   if (isMasterThread)
   {
     acq2 = false;
     /*
       NOTE: currCount is only accessed by 1 WG at a time and has a lock around
       it, so we can safely access it as a regular data access instead of with
       atomics.
     */
     unsigned int currSemCount = currCount[0];

     if (isWriter) {
       // writer needs the count to be == maxSemCount to enter the critical
       // section (otherwise there are readers in the critical section)
       if (currSemCount == maxSemCount) { acq2 = true; }
     } else {
       // if there is a writer waiting, readers aren't allowed to enter the
       // critical section
       if (writerWaiting[0] == 0) {
         // readers need count > 1 to enter critical section (otherwise semaphore
         // is full)
         if (currSemCount > 1) { acq2 = true; }
       }
     }
   }
   __syncthreads();

   if (!acq2) // release the sem head "lock" since the semaphore was full
   {
     // writers set a flag to note that they are waiting so more readers don't
     // join after the writer started waiting
     if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }

     if (isMasterThread) {
       // atomicExch acts as a store release, need TF to enforce ordering locally
       __threadfence_block();
       atomicExch(lock, 0);
     }
     __syncthreads();
     return false;
   }
   __syncthreads();

   if (isMasterThread) {
     /*
       NOTE: currCount is only accessed by 1 WG at a time and has a lock around
       it, so we can safely access it as a regular data access instead of with
       atomics.
     */
     if (isWriter) {
       /*
         writer decrements the current count of the semaphore by the max to
         ensure that no one else can enter the critical section while it's
         writing.
       */
       currCount[0] -= maxSemCount;

       // writers also need to unset the "writer is waiting" flag
       writerWaiting[0] = 0;
     } else {
       /*
         readers decrement the current count of the semaphore by 1 so other
         readers can also read the data (but not the writers since they needs
         the entire CS).
       */
       --currCount[0];
     }

     // atomicExch acts as a store release, need TF to enforce ordering locally
     __threadfence_block();
     // now that we've updated the semaphore count can release the lock
     atomicExch(lock, 0);
   }
   __syncthreads();

   return true;
 }

 // same algorithm but with local scope
 inline __device__ void hipSemaphoreEBOWaitLocal(const hipSemaphore_t sem,
                                                 const unsigned int cuID,
                                                 const bool isWriter,
                                                 const unsigned int maxSemCount,
                                                 unsigned int * semaphoreBuffers,
                                                 const int NUM_CU)
 {
   __shared__ int backoff;
   const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                                hipThreadIdx_z == 0);

   if (isMasterThread)
   {
     backoff = 1;
   }
   __syncthreads();

   while (!hipSemaphoreEBOTryWaitLocal(sem, cuID, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
   {
     __syncthreads();
     if (isMasterThread)
     {
       /*
         if we failed to enter the semaphore, wait for a little while before
         trying again
       */
       //sleepFunc(backoff);
       for (int j = 0; j < backoff; ++j) { ; }
       // (capped) exponential backoff
       backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
     }
     __syncthreads();
   }
   __syncthreads();
 }

 inline __device__ void hipSemaphoreEBOPostLocal(const hipSemaphore_t sem,
                                                 const unsigned int cuID,
                                                 const bool isWriter,
                                                 const unsigned int maxSemCount,
                                                 unsigned int * semaphoreBuffers,
                                                 const int NUM_CU)
 {
   const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
                                hipThreadIdx_z == 0);
   // Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
   // CU gets 4 of them.  So CU 0 starts at semaphoreBuffers[sem * 4 * NUM_CU].
   unsigned int * const currCount = semaphoreBuffers + ((sem * 4 * NUM_CU) +
                                                        (cuID * 4));
   unsigned int * const lock = currCount + 1;
   __shared__ bool acquired;

   if (isMasterThread) { acquired = false; }
   __syncthreads();

   while (!acquired)
   {
     __syncthreads();
     if (isMasterThread)
     {
       // try to acquire sem head "lock"
       if (atomicCAS(lock, 0, 1) == 0) {
         // atomicCAS acts as a load acquire, need TF to enforce ordering locally
         __threadfence_block();
         acquired = true;
       } else {
         acquired = false;
       }
     }
     __syncthreads();
   }

   if (isMasterThread) {
     /*
       NOTE: currCount is only accessed by 1 WGs at a time and has a lock around
       it, so we can safely access it as a regular data access instead of with
       atomics.
     */
     if (isWriter) {
       // writers add the max value to the semaphore to allow the readers to
       // start accessing the critical section.
       currCount[0] += maxSemCount;
     } else {
       ++currCount[0]; // readers add 1 to the semaphore
     }

     // atomicExch acts as a store release, need TF to enforce ordering locally
     __threadfence_block();
     // now that we've updated the semaphore count can release the lock
     atomicExch(lock, 0);
   }
   __syncthreads();
 }

 #endif // #ifndef __HIPSEMAPHOREEBO_H__
	#ifndef __HIPSEMAPHOREEBO_H__
	#define __HIPSEMAPHOREEBO_H__

	#include "hip/hip_runtime.h"
	#include "hipLocks.h"

	inline __host__ hipError_t hipSemaphoreCreateEBO(hipSemaphore_t * const handle,
	const int semaphoreNumber,
	const unsigned int count,
	const int NUM_CU)
	{
	// Here we set the initial value to be count+1, this allows us to do an
	// atomicExch(sem, 0) and basically use the semaphore value as both a
	// lock and a semaphore.
	unsigned int initialValue = (count + 1), zero = 0;
	*handle = semaphoreNumber;
	for (int id = 0; id < NUM_CU; ++id) { // need to set these values for all CUs
	hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4))]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
	hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 1]), &zero, sizeof(zero), hipMemcpyHostToDevice);
	hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 2]), &zero, sizeof(zero), hipMemcpyHostToDevice);
	hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 3]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
	}
	return hipSuccess;
	}

	inline __device__ bool hipSemaphoreEBOTryWait(const hipSemaphore_t sem,
	const bool isWriter,
	const unsigned int maxSemCount,
	unsigned int * semaphoreBuffers,
	const int NUM_CU)
	{
	const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
	hipThreadIdx_z == 0);
	/*
	Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each
	CU uses 4 of them (current count, head, tail, max count). For the global
	semaphore all CUs use semaphoreBuffers[sem * 4 * NUM_CU].
	*/
	unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
	unsigned int * const lock = currCount + 1;
	/*
	Reuse the tail for the "writers are waiting" flag since tail is unused.

	For now just use to indicate that at least 1 writer is waiting instead of
	a count to make sure that readers aren't totally starved out until all the
	writers are done.
	*/
	unsigned int * const writerWaiting = currCount + 2;
	__shared__ bool acq1, acq2;

	__syncthreads();
	if (isMasterThread)
	{
	acq1 = false;
	// try to acquire the sem head "lock"
	if (atomicCAS(lock, 0, 1) == 0) {
	// atomicCAS acts as a load acquire, need TF to enforce ordering
	__threadfence();
	acq1 = true;
	}
	}
	__syncthreads();

	if (!acq1) { return false; } // return if we couldn't acquire the lock
	if (isMasterThread)
	{
	acq2 = false;
	/*
	NOTE: currCount is only accessed by 1 WG at a time and has a lock around
	it, so we can safely access it as a regular data access instead of with
	atomics.
	*/
	unsigned int currSemCount = currCount[0];

	if (isWriter) {
	// writer needs the count to be == maxSemCount to enter the critical
	// section (otherwise there are readers in the critical section)
	if (currSemCount == maxSemCount) { acq2 = true; }
	} else {
	// if there is a writer waiting, readers aren't allowed to enter the
	// critical section
	if (writerWaiting[0] == 0) {
	// readers need count > 1 to enter critical section (otherwise semaphore
	// is full)
	if (currSemCount > 1) { acq2 = true; }
	}
	}
	}
	__syncthreads();

	if (!acq2) // release the sem head "lock" since the semaphore was full
	{
	// writers set a flag to note that they are waiting so more readers don't
	// join after the writer started waiting
	if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }

	if (isMasterThread) {
	// atomicExch acts as a store release, need TF to enforce ordering
	__threadfence();
	atomicExch(lock, 0);
	}
	__syncthreads();
	return false;
	}
	__syncthreads();

	if (isMasterThread) {
	/*
	NOTE: currCount is only accessed by 1 WG at a time and has a lock around
	it, so we can safely access it as a regular data access instead of with
	atomics.
	*/
	if (isWriter) {
	/*
	writer decrements the current count of the semaphore by the max to
	ensure that no one else can enter the critical section while it's
	writing.
	*/
	currCount[0] -= maxSemCount;

	// writers also need to unset the "writer is waiting" flag
	writerWaiting[0] = 0;
	} else {
	/*
	readers decrement the current count of the semaphore by 1 so other
	readers can also read the data (but not the writers since they needs
	the entire CS).
	*/
	--currCount[0]; //atomicSub(currCount, 1);
	}

	// atomicExch acts as a store release, need TF to enforce ordering
	__threadfence();
	// now that we've updated the semaphore count can release the lock
	atomicExch(lock, 0);
	}
	__syncthreads();

	return true;
	}

	inline __device__ void hipSemaphoreEBOWait(const hipSemaphore_t sem,
	const bool isWriter,
	const unsigned int maxSemCount,
	unsigned int * semaphoreBuffers,
	const int NUM_CU)
	{
	__shared__ int backoff;
	const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
	hipThreadIdx_z == 0);

	if (isMasterThread)
	{
	backoff = 1;
	}
	__syncthreads();

	while (!hipSemaphoreEBOTryWait(sem, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
	{
	__syncthreads();
	if (isMasterThread)
	{
	// if we failed to enter the semaphore, wait for a little while before
	// trying again
	//sleepFunc(backoff);
	for (int j = 0; j < backoff; ++j) { ; }
	/*
	for writers increse backoff a lot because failing means readers are in
	the CS currently -- most important for non-unique because all WGs on
	all CUs are going for the same semaphore.
	*/
	if (isWriter) {
	// (capped) exponential backoff
	backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
	}
	else { backoff += 5; /* small, linear backoff increase for readers */ }
	}
	__syncthreads();
	}
	__syncthreads();
	}

	inline __device__ void hipSemaphoreEBOPost(const hipSemaphore_t sem,
	const bool isWriter,
	const unsigned int maxSemCount,
	unsigned int * semaphoreBuffers,
	const int NUM_CU)
	{
	const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
	hipThreadIdx_z == 0);
	/*
	Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each
	CU uses 4 of them (current count, head, tail, max count). For the global
	semaphore use semaphoreBuffers[sem * 4 * NUM_CU].
	*/
	unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
	unsigned int * const lock = currCount + 1;
	__shared__ bool acquired;

	if (isMasterThread) { acquired = false; }
	__syncthreads();

	while (!acquired)
	{
	__syncthreads();
	if (isMasterThread)
	{
	// try to acquire sem head "lock"
	if (atomicCAS(lock, 0, 1) == 0) {
	// atomicCAS acts as a load acquire, need TF to enforce ordering
	__threadfence();
	acquired = true;
	} else {
	acquired = false;
	}
	}
	__syncthreads();
	}

	if (isMasterThread) {
	/*
	NOTE: currCount is only accessed by 1 WG at a time and has a lock around
	it, so we can safely access it as a regular data access instead of with
	atomics.
	*/
	if (isWriter) {
	// writers add the max value to the semaphore to allow the readers to
	// start accessing the critical section.
	currCount[0] += maxSemCount;
	} else {
	++currCount[0]; // readers add 1 to the semaphore
	}

	// atomicExch acts as a store release, need TF to enforce ordering
	__threadfence();
	// now that we've updated the semaphore count can release the lock
	atomicExch(lock, 0);
	}
	__syncthreads();
	}

	// same wait algorithm but with local scope and per-CU synchronization
	inline __device__ bool hipSemaphoreEBOTryWaitLocal(const hipSemaphore_t sem,
	const unsigned int cuID,
	const bool isWriter,
	const unsigned int maxSemCount,
	unsigned int * semaphoreBuffers,
	const int NUM_CU)
	{
	const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
	hipThreadIdx_z == 0);
	/*
	Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each
	CU gets 4 of them (current count, head, tail, max count). So CU 0 starts
	at semaphoreBuffers[sem * 4 * NUM_CU].
	*/
	unsigned int * const currCount = semaphoreBuffers + ((sem * 4 * NUM_CU) +
	(cuID * 4));
	unsigned int * const lock = currCount + 1;
	/*
	Reuse the tail for the "writers are waiting" flag since tail is unused.

	For now just use to indicate that at least 1 writer is waiting instead of
	a count to make sure that readers aren't totally starved out until all the
	writers are done.
	*/
	unsigned int * const writerWaiting = currCount + 2;
	__shared__ bool acq1, acq2;

	__syncthreads();
	if (isMasterThread)
	{
	acq1 = false;
	// try to acquire the sem head "lock"
	if (atomicCAS(lock, 0, 1) == 0) {
	// atomicCAS acts as a load acquire, need TF to enforce ordering locally
	__threadfence_block();
	acq1 = true;
	}
	}
	__syncthreads();

	if (!acq1) { return false; } // return if we couldn't acquire the lock
	if (isMasterThread)
	{
	acq2 = false;
	/*
	NOTE: currCount is only accessed by 1 WG at a time and has a lock around
	it, so we can safely access it as a regular data access instead of with
	atomics.
	*/
	unsigned int currSemCount = currCount[0];

	if (isWriter) {
	// writer needs the count to be == maxSemCount to enter the critical
	// section (otherwise there are readers in the critical section)
	if (currSemCount == maxSemCount) { acq2 = true; }
	} else {
	// if there is a writer waiting, readers aren't allowed to enter the
	// critical section
	if (writerWaiting[0] == 0) {
	// readers need count > 1 to enter critical section (otherwise semaphore
	// is full)
	if (currSemCount > 1) { acq2 = true; }
	}
	}
	}
	__syncthreads();

	if (!acq2) // release the sem head "lock" since the semaphore was full
	{
	// writers set a flag to note that they are waiting so more readers don't
	// join after the writer started waiting
	if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }

	if (isMasterThread) {
	// atomicExch acts as a store release, need TF to enforce ordering locally
	__threadfence_block();
	atomicExch(lock, 0);
	}
	__syncthreads();
	return false;
	}
	__syncthreads();

	if (isMasterThread) {
	/*
	NOTE: currCount is only accessed by 1 WG at a time and has a lock around
	it, so we can safely access it as a regular data access instead of with
	atomics.
	*/
	if (isWriter) {
	/*
	writer decrements the current count of the semaphore by the max to
	ensure that no one else can enter the critical section while it's
	writing.
	*/
	currCount[0] -= maxSemCount;

	// writers also need to unset the "writer is waiting" flag
	writerWaiting[0] = 0;
	} else {
	/*
	readers decrement the current count of the semaphore by 1 so other
	readers can also read the data (but not the writers since they needs
	the entire CS).
	*/
	--currCount[0];
	}

	// atomicExch acts as a store release, need TF to enforce ordering locally
	__threadfence_block();
	// now that we've updated the semaphore count can release the lock
	atomicExch(lock, 0);
	}
	__syncthreads();

	return true;
	}

	// same algorithm but with local scope
	inline __device__ void hipSemaphoreEBOWaitLocal(const hipSemaphore_t sem,
	const unsigned int cuID,
	const bool isWriter,
	const unsigned int maxSemCount,
	unsigned int * semaphoreBuffers,
	const int NUM_CU)
	{
	__shared__ int backoff;
	const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
	hipThreadIdx_z == 0);

	if (isMasterThread)
	{
	backoff = 1;
	}
	__syncthreads();

	while (!hipSemaphoreEBOTryWaitLocal(sem, cuID, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
	{
	__syncthreads();
	if (isMasterThread)
	{
	/*
	if we failed to enter the semaphore, wait for a little while before
	trying again
	*/
	//sleepFunc(backoff);
	for (int j = 0; j < backoff; ++j) { ; }
	// (capped) exponential backoff
	backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
	}
	__syncthreads();
	}
	__syncthreads();
	}

	inline __device__ void hipSemaphoreEBOPostLocal(const hipSemaphore_t sem,
	const unsigned int cuID,
	const bool isWriter,
	const unsigned int maxSemCount,
	unsigned int * semaphoreBuffers,
	const int NUM_CU)
	{
	const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
	hipThreadIdx_z == 0);
	// Each sem has NUM_CU * 4 locations in the buffer. Of these locations, each
	// CU gets 4 of them. So CU 0 starts at semaphoreBuffers[sem * 4 * NUM_CU].
	unsigned int * const currCount = semaphoreBuffers + ((sem * 4 * NUM_CU) +
	(cuID * 4));
	unsigned int * const lock = currCount + 1;
	__shared__ bool acquired;

	if (isMasterThread) { acquired = false; }
	__syncthreads();

	while (!acquired)
	{
	__syncthreads();
	if (isMasterThread)
	{
	// try to acquire sem head "lock"
	if (atomicCAS(lock, 0, 1) == 0) {
	// atomicCAS acts as a load acquire, need TF to enforce ordering locally
	__threadfence_block();
	acquired = true;
	} else {
	acquired = false;
	}
	}
	__syncthreads();
	}

	if (isMasterThread) {
	/*
	NOTE: currCount is only accessed by 1 WGs at a time and has a lock around
	it, so we can safely access it as a regular data access instead of with
	atomics.
	*/
	if (isWriter) {
	// writers add the max value to the semaphore to allow the readers to
	// start accessing the critical section.
	currCount[0] += maxSemCount;
	} else {
	++currCount[0]; // readers add 1 to the semaphore
	}

	// atomicExch acts as a store release, need TF to enforce ordering locally
	__threadfence_block();
	// now that we've updated the semaphore count can release the lock
	atomicExch(lock, 0);
	}
	__syncthreads();
	}

	#endif // #ifndef __HIPSEMAPHOREEBO_H__