src/gpu/heterosync/src/hipLocksImpl.h - public/gem5-resources - Git at Google

 #include "hipLocks.h"

 /*
   Allocate and initialize the global lock bookkeeping structure
   (cpuLockData) together with its barrier, mutex and semaphore buffers.

   maxWGsPerKernel, NUM_REPEATS and NUM_ITERS are multiplied together to size
   the ring buffers; NUM_CU is added to that product before it is rounded
   down to a multiple of NUM_WORDS_PER_CACHELINE to form arrayStride.
   numMutexes and numSemaphores set the number of mutexes and semaphores.
   pageAlign is accepted but not used by this function.

   Returns hipErrorInitializationError if maxWGsPerKernel, numMutexes or
   numSemaphores is not positive (nothing is allocated in that case), the
   error from hipHostMalloc if the lock structure itself cannot be
   allocated, and hipSuccess otherwise.
 */
 hipError_t hipLocksInit(const int maxWGsPerKernel, const int numMutexes,
                         const int numSemaphores, const bool pageAlign,
                         const int NUM_CU, const int NUM_REPEATS,
                         const int NUM_ITERS)
 {
   hipError_t hipErr = hipGetLastError();
   checkError(hipErr, "Start hipLocksInit");

   // validate the arguments before allocating anything, so that a rejected
   // call neither leaks the lock structure nor leaves cpuLockData pointing
   // at a structure whose buffer pointers were never set
   if (maxWGsPerKernel <= 0)       return hipErrorInitializationError;
   if (numMutexes <= 0)            return hipErrorInitializationError;
   if (numSemaphores <= 0)         return hipErrorInitializationError;

   // cpuLockData is dereferenced immediately below, so stop here if the
   // allocation failed
   hipErr = hipHostMalloc(&cpuLockData, sizeof(hipLockData_t));
   if (hipErr != hipSuccess)       return hipErr;

   // initialize some of the lock data's values
   /*
     Since HIP doesn't generate the correct code for atomicInc's, this
     means wraparound is not handled properly.  However, since in the current
     version each subsequent kernel launch starts in the ring buffer where
     the last kernel left off, this eventually leads to wraparound.  Increase
     buffer size to prevent wraparound and hide this.
   */
   cpuLockData->maxBufferSize          = maxWGsPerKernel * NUM_REPEATS * NUM_ITERS;
   // integer division followed by multiplication: rounds down to a multiple
   // of NUM_WORDS_PER_CACHELINE
   cpuLockData->arrayStride            = (cpuLockData->maxBufferSize + NUM_CU) /
                                           NUM_WORDS_PER_CACHELINE * NUM_WORDS_PER_CACHELINE;
   cpuLockData->mutexCount             = numMutexes;
   cpuLockData->semaphoreCount         = numSemaphores;

   hipHostMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);

   hipHostMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
   hipHostMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
   hipHostMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);

   hipHostMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);

   // a single error query after the buffer allocations above
   hipErr = hipGetLastError();
   checkError(hipErr, "Before memsets");

   hipDeviceSynchronize();

   hipMemset(cpuLockData->barrierBuffers, 0,
             sizeof(unsigned int) * cpuLockData->arrayStride * 2);

   hipMemset(cpuLockData->mutexBufferHeads, 0,
             sizeof(unsigned int) * cpuLockData->mutexCount);
   hipMemset(cpuLockData->mutexBufferTails, 0,
             sizeof(unsigned int) * cpuLockData->mutexCount);

   /*
     initialize mutexBuffers to appropriate values

     set the first location for each CU to 1 so that the ring buffer can be
     used by the first WG right away (otherwise livelock because no locations
     ever == 1)

     for all other locations initialize to -1 so WGs for these locations
     don't think it's their turn right away

     since hipMemset sets everything in bytes, initialize all to 0 first
   */
   // byte count written in the same form as the allocation above so the
   // multiplication is carried out in size_t
   hipMemset(&(cpuLockData->mutexBuffers[0]), 0,
             sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
   for (int i = 0; i < (cpuLockData->arrayStride * cpuLockData->mutexCount);
        i += cpuLockData->arrayStride) {
     // only the lowest-addressed byte is set to 1; together with the zero
     // fill above this reads back as the int value 1 on a little-endian host
     hipMemset(&(cpuLockData->mutexBuffers[i]), 0x0001, 1);
     // every remaining byte of this mutex's slice becomes 0xFF, i.e. each
     // int is -1
     hipMemset(&(cpuLockData->mutexBuffers[i + 1]), -1,
               (cpuLockData->arrayStride - 1) * sizeof(int));
   }

   hipMemset(cpuLockData->semaphoreBuffers, 0,
             sizeof(unsigned int) * cpuLockData->semaphoreCount * 4);

   hipDeviceSynchronize();

   return hipSuccess;
 }

 /*
   Release every buffer owned by the global lock structure (cpuLockData) and
   then the structure itself.

   Returns hipErrorInitializationError if cpuLockData is NULL, hipSuccess
   otherwise.
 */
 hipError_t hipLocksDestroy()
 {
   if (cpuLockData == NULL) { return hipErrorInitializationError; }

   // barrierBuffers is allocated by hipLocksInit alongside the other
   // buffers and must be released with them
   hipHostFree(cpuLockData->barrierBuffers);

   hipHostFree(cpuLockData->mutexBuffers);
   hipHostFree(cpuLockData->mutexBufferHeads);
   hipHostFree(cpuLockData->mutexBufferTails);

   hipHostFree(cpuLockData->semaphoreBuffers);

   hipHostFree(cpuLockData);
   // clear the global so a repeated call hits the NULL check above instead
   // of reading freed memory and freeing it again
   cpuLockData = NULL;

   return hipSuccess;
 }
	#include "hipLocks.h"

	/*
	  Allocate and initialize the global lock bookkeeping structure
	  (cpuLockData) together with its barrier, mutex and semaphore buffers.

	  maxWGsPerKernel, NUM_REPEATS and NUM_ITERS are multiplied together to size
	  the ring buffers; NUM_CU is added to that product before it is rounded
	  down to a multiple of NUM_WORDS_PER_CACHELINE to form arrayStride.
	  numMutexes and numSemaphores set the number of mutexes and semaphores.
	  pageAlign is accepted but not used by this function.

	  Returns hipErrorInitializationError if maxWGsPerKernel, numMutexes or
	  numSemaphores is not positive (nothing is allocated in that case), the
	  error from hipHostMalloc if the lock structure itself cannot be
	  allocated, and hipSuccess otherwise.
	*/
	hipError_t hipLocksInit(const int maxWGsPerKernel, const int numMutexes,
	                        const int numSemaphores, const bool pageAlign,
	                        const int NUM_CU, const int NUM_REPEATS,
	                        const int NUM_ITERS)
	{
	  hipError_t hipErr = hipGetLastError();
	  checkError(hipErr, "Start hipLocksInit");

	  // validate the arguments before allocating anything, so that a rejected
	  // call neither leaks the lock structure nor leaves cpuLockData pointing
	  // at a structure whose buffer pointers were never set
	  if (maxWGsPerKernel <= 0) return hipErrorInitializationError;
	  if (numMutexes <= 0) return hipErrorInitializationError;
	  if (numSemaphores <= 0) return hipErrorInitializationError;

	  // cpuLockData is dereferenced immediately below, so stop here if the
	  // allocation failed
	  hipErr = hipHostMalloc(&cpuLockData, sizeof(hipLockData_t));
	  if (hipErr != hipSuccess) return hipErr;

	  // initialize some of the lock data's values
	  /*
	    Since HIP doesn't generate the correct code for atomicInc's, this
	    means wraparound is not handled properly. However, since in the current
	    version each subsequent kernel launch starts in the ring buffer where
	    the last kernel left off, this eventually leads to wraparound. Increase
	    buffer size to prevent wraparound and hide this.
	  */
	  cpuLockData->maxBufferSize = maxWGsPerKernel * NUM_REPEATS * NUM_ITERS;
	  // integer division followed by multiplication: rounds down to a multiple
	  // of NUM_WORDS_PER_CACHELINE
	  cpuLockData->arrayStride = (cpuLockData->maxBufferSize + NUM_CU) /
	      NUM_WORDS_PER_CACHELINE * NUM_WORDS_PER_CACHELINE;
	  cpuLockData->mutexCount = numMutexes;
	  cpuLockData->semaphoreCount = numSemaphores;

	  hipHostMalloc(&cpuLockData->barrierBuffers, sizeof(unsigned int) * cpuLockData->arrayStride * 2);

	  hipHostMalloc(&cpuLockData->mutexBuffers, sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
	  hipHostMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
	  hipHostMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);

	  hipHostMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);

	  // a single error query after the buffer allocations above
	  hipErr = hipGetLastError();
	  checkError(hipErr, "Before memsets");

	  hipDeviceSynchronize();

	  hipMemset(cpuLockData->barrierBuffers, 0,
	            sizeof(unsigned int) * cpuLockData->arrayStride * 2);

	  hipMemset(cpuLockData->mutexBufferHeads, 0,
	            sizeof(unsigned int) * cpuLockData->mutexCount);
	  hipMemset(cpuLockData->mutexBufferTails, 0,
	            sizeof(unsigned int) * cpuLockData->mutexCount);

	  /*
	    initialize mutexBuffers to appropriate values

	    set the first location for each CU to 1 so that the ring buffer can be
	    used by the first WG right away (otherwise livelock because no locations
	    ever == 1)

	    for all other locations initialize to -1 so WGs for these locations
	    don't think it's their turn right away

	    since hipMemset sets everything in bytes, initialize all to 0 first
	  */
	  // byte count written in the same form as the allocation above so the
	  // multiplication is carried out in size_t
	  hipMemset(&(cpuLockData->mutexBuffers[0]), 0,
	            sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
	  for (int i = 0; i < (cpuLockData->arrayStride * cpuLockData->mutexCount);
	       i += cpuLockData->arrayStride) {
	    // only the lowest-addressed byte is set to 1; together with the zero
	    // fill above this reads back as the int value 1 on a little-endian host
	    hipMemset(&(cpuLockData->mutexBuffers[i]), 0x0001, 1);
	    // every remaining byte of this mutex's slice becomes 0xFF, i.e. each
	    // int is -1
	    hipMemset(&(cpuLockData->mutexBuffers[i + 1]), -1,
	              (cpuLockData->arrayStride - 1) * sizeof(int));
	  }

	  hipMemset(cpuLockData->semaphoreBuffers, 0,
	            sizeof(unsigned int) * cpuLockData->semaphoreCount * 4);

	  hipDeviceSynchronize();

	  return hipSuccess;
	}

	/*
	  Release every buffer owned by the global lock structure (cpuLockData) and
	  then the structure itself.

	  Returns hipErrorInitializationError if cpuLockData is NULL, hipSuccess
	  otherwise.
	*/
	hipError_t hipLocksDestroy()
	{
	  if (cpuLockData == NULL) { return hipErrorInitializationError; }

	  // barrierBuffers is allocated by hipLocksInit alongside the other
	  // buffers and must be released with them
	  hipHostFree(cpuLockData->barrierBuffers);

	  hipHostFree(cpuLockData->mutexBuffers);
	  hipHostFree(cpuLockData->mutexBufferHeads);
	  hipHostFree(cpuLockData->mutexBufferTails);

	  hipHostFree(cpuLockData->semaphoreBuffers);

	  hipHostFree(cpuLockData);
	  // clear the global so a repeated call hits the NULL check above instead
	  // of reading freed memory and freeing it again
	  cpuLockData = NULL;

	  return hipSuccess;
	}