#include "hip/hip_runtime.h"
#pragma once
#include <stdio.h>
#include <stdlib.h> //exit, getenv, atoi
#include <mpi.h>    //MPI_Comm_rank in checkCudaPtr
//Report the most recent HIP error, then abort with a failing exit code
#define cudaCheckError() { \
  hipError_t e=hipGetLastError(); \
  if(e!=hipSuccess) { \
    fprintf(stderr,"Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,hipGetErrorString(e)); \
    exit(EXIT_FAILURE); \
  } \
}
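//Example (a minimal sketch; someKernel is hypothetical): call right after a
//kernel launch to catch launch errors:
//  hipLaunchKernelGGL(someKernel, dim3(blocks), dim3(threads), 0, 0, args);
//  cudaCheckError();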
//Tree reduction across a warp using shuffles; the result lands in lane 0.
//hipWarpSize must be a power of two no larger than the hardware wavefront.
template <int hipWarpSize, typename T>
__device__ __inline__ T warpReduceSum(T val) {
  if(hipWarpSize>32) val+=__shfl_down(val,32,hipWarpSize); //covers 64-wide AMD wavefronts
  if(hipWarpSize>16) val+=__shfl_down(val,16,hipWarpSize);
  if(hipWarpSize>8) val+=__shfl_down(val,8,hipWarpSize);
  if(hipWarpSize>4) val+=__shfl_down(val,4,hipWarpSize);
  if(hipWarpSize>2) val+=__shfl_down(val,2,hipWarpSize);
  if(hipWarpSize>1) val+=__shfl_down(val,1,hipWarpSize);
  return val;
}
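//Example (a minimal sketch, hypothetical kernel): each full warp sums 32
//elements and lane 0 publishes the warp total:
//  __global__ void warpSums(const float* in, float* out, int n) {
//    int i = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
//    float v = (i < n) ? in[i] : 0.0f;
//    v = warpReduceSum<32>(v);
//    if(hipThreadIdx_x%32 == 0) atomicAdd(out, v);
//  }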
//As above, but reduces with max(); the result lands in lane 0.
template <int hipWarpSize, typename T>
__device__ __inline__ T warpReduceMax(T val) {
  if(hipWarpSize>32) val=max(val,__shfl_down(val,32,hipWarpSize));
  if(hipWarpSize>16) val=max(val,__shfl_down(val,16,hipWarpSize));
  if(hipWarpSize>8) val=max(val,__shfl_down(val,8,hipWarpSize));
  if(hipWarpSize>4) val=max(val,__shfl_down(val,4,hipWarpSize));
  if(hipWarpSize>2) val=max(val,__shfl_down(val,2,hipWarpSize));
  if(hipWarpSize>1) val=max(val,__shfl_down(val,1,hipWarpSize));
  return val;
}
//As above, but reduces with min(); the result lands in lane 0.
template <int hipWarpSize, typename T>
__device__ __inline__ T warpReduceMin(T val) {
  if(hipWarpSize>32) val=min(val,__shfl_down(val,32,hipWarpSize));
  if(hipWarpSize>16) val=min(val,__shfl_down(val,16,hipWarpSize));
  if(hipWarpSize>8) val=min(val,__shfl_down(val,8,hipWarpSize));
  if(hipWarpSize>4) val=min(val,__shfl_down(val,4,hipWarpSize));
  if(hipWarpSize>2) val=min(val,__shfl_down(val,2,hipWarpSize));
  if(hipWarpSize>1) val=min(val,__shfl_down(val,1,hipWarpSize));
  return val;
}
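//Block-level reductions: each warp reduces its 32 values, lane 0 of every warp
//parks its partial result in shared memory, and the first warp then reduces
//the per-warp partials. The final value is valid only in the first warp (read
//it from thread 0). Supports up to 32 warps (1024 threads) per block.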
template <typename T>
__device__ __inline__ T blockReduceSum(T val) {
  __shared__ volatile T smem[32];
  //phase 1: each warp reduces its own values; lane 0 parks the partial sum
  val=warpReduceSum<32>(val);
  if(hipThreadIdx_x%32==0) smem[hipThreadIdx_x/32]=val;
  __syncthreads();
  //phase 2: the first warp reduces the per-warp partials (round up so a
  //trailing partial warp is not dropped)
  val=0;
  if(hipThreadIdx_x<(hipBlockDim_x+31)/32) val=smem[hipThreadIdx_x];
  if(hipThreadIdx_x<32) val=warpReduceSum<32>(val);
  __syncthreads();
  return val;
}
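//Example (a minimal sketch, hypothetical kernel): thread 0 holds the block sum
//and publishes it with one atomic per block:
//  __global__ void blockSums(const float* in, float* out, int n) {
//    int i = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x;
//    float v = blockReduceSum((i < n) ? in[i] : 0.0f);
//    if(hipThreadIdx_x == 0) atomicAdd(out, v);
//  }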
template <typename T>
__device__ __inline__ T blockReduceMax(T val) {
  __shared__ volatile T smem[32];
  //phase 1: each warp reduces its own values; lane 0 parks the partial max
  val=warpReduceMax<32>(val);
  if(hipThreadIdx_x%32==0) smem[hipThreadIdx_x/32]=val;
  __syncthreads();
  //phase 2: the first warp reduces the per-warp partials; lanes past the
  //last warp reuse smem[0] so uninitialized entries are never read
  if(hipThreadIdx_x<32) {
    val=(hipThreadIdx_x<(hipBlockDim_x+31)/32) ? smem[hipThreadIdx_x] : smem[0];
    val=warpReduceMax<32>(val);
  }
  __syncthreads();
  return val;
}
template <typename T>
__device__ __inline__ T blockReduceMin(T val) {
  __shared__ volatile T smem[32];
  //phase 1: each warp reduces its own values; lane 0 parks the partial min
  val=warpReduceMin<32>(val);
  if(hipThreadIdx_x%32==0) smem[hipThreadIdx_x/32]=val;
  __syncthreads();
  //phase 2: the first warp reduces the per-warp partials; lanes past the
  //last warp reuse smem[0] so uninitialized entries are never read
  if(hipThreadIdx_x<32) {
    val=(hipThreadIdx_x<(hipBlockDim_x+31)/32) ? smem[hipThreadIdx_x] : smem[0];
    val=warpReduceMin<32>(val);
  }
  __syncthreads();
  return val;
}
//POSVEL_T is expected to be defined by the HACC headers that include this file.
__device__ __forceinline__
void atomicWarpReduceAndUpdate(POSVEL_T *out, POSVEL_T val) {
  //perform shfl reduction
  val+=__shfl_down(val, 16);
  val+=__shfl_down(val, 8);
  val+=__shfl_down(val, 4);
  val+=__shfl_down(val, 2);
  val+=__shfl_down(val, 1);
  if(hipThreadIdx_x%32==0)
    atomicAdd(out,val); //atomics are unnecessary but they are faster than non-atomics due to a single bus transaction
}
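//Example (a minimal sketch; force and d_forceSum are hypothetical): each lane
//contributes its partial value, and lane 0 adds the warp total to the target:
//  atomicWarpReduceAndUpdate(d_forceSum, force);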
class cudaDeviceSelector {
public:
  cudaDeviceSelector() {
    char* str;
    int local_rank = 0;
    int numDev=1;
    //No MPI at this time, so go by environment variables.
    //This may need to be updated to match your MPI flavor.
    if((str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) {
      local_rank = atoi(str);
    }
    else if((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) {
      local_rank = atoi(str);
    }
    else if((str = getenv("SLURM_LOCALID")) != NULL) {
      local_rank = atoi(str);
    }
    //get the number of devices to use
    if((str = getenv("HACC_NUM_CUDA_DEV")) != NULL) {
      numDev=atoi(str);
    }
#if 0
    //Use MPS; need to figure out how to set numDev, perhaps an environment variable?
    char var[100];
    sprintf(var,"/tmp/nvidia-mps_%d",local_rank%numDev);
    setenv("CUDA_MPS_PIPE_DIRECTORY",var,1);
#else
    int dev;
    //set via local MPI rank
    dev = local_rank % numDev;
    //we must set this for all threads
    hipSetDevice(dev);
#endif
  }
};
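//Example (a minimal sketch): construct one instance during each rank's startup,
//before any other HIP calls, so the device binding takes effect:
//  cudaDeviceSelector selector;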
//Verify that a device pointer was allocated on the currently selected device;
//prints a per-rank diagnostic on mismatch.
inline void checkCudaPtr(const void* ptr, const char* name) {
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  int dev;
  hipGetDevice(&dev);
  hipPointerAttribute_t at;
  hipPointerGetAttributes(&at,ptr);
  if(dev!=at.device) {
    printf("%d: Error '%s', dev: %d, at.device: %d\n", rank, name, dev, at.device);
  }
}
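//Example (a minimal sketch; d_pos is a hypothetical device allocation):
//  float* d_pos;
//  hipMalloc(&d_pos, n*sizeof(float));
//  checkCudaPtr(d_pos, "d_pos");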