/*
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__

#include "arch/amdgpu/gcn3/insts/gpu_static_inst.hh"
#include "arch/amdgpu/gcn3/insts/op_encodings.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_dyn_inst.hh"

namespace gem5
{

/**
 * Helper function for instructions declared in op_encodings. This function
 * takes all of the arguments needed to initialize a given memory request,
 * then submits either a single request per active lane or, if the access
 * is misaligned across a cache-line boundary, two split requests.
 */
template<typename T, int N>
inline void
initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
                 bool is_atomic=false)
{
    // local variables
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = 0, split_addr = 0;
    bool misaligned_acc = false;
    RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
    PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;

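    /**
     * Clear the per-lane status vector, then walk the execution mask and
     * issue one packet per active lane (or two, when the lane's access
     * straddles a cache-line boundary). The status vector records how many
     * responses each lane still has outstanding.
     */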
    gpuDynInst->resetEntireStatusVector();
    for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vaddr = gpuDynInst->addr[lane];

            /**
             * the base address of the cache line where the last
             * byte of the request will be stored.
             */
            split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);
            /**
             * if the base cache line address of the last byte is
             * greater than the address of the first byte, then the
             * access straddles two cache lines and is misaligned.
             */
            misaligned_acc = split_addr > vaddr;

            if (is_atomic) {
                // make sure request is word aligned
                assert((vaddr & 0x3) == 0);

                // a given lane's atomic can't cross cache lines
                assert(!misaligned_acc);

                req = std::make_shared<Request>(vaddr, sizeof(T), 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId,
                    gpuDynInst->makeAtomicOpFunctor<T>(
                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
            } else {
                req = std::make_shared<Request>(vaddr, req_size, 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
            }

            if (misaligned_acc) {
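                // the access straddles a cache-line boundary: split the
                // request at the boundary and send two packets, so this
                // lane expects two responses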
                gpuDynInst->setStatusVector(lane, 2);
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                pkt1 = new Packet(req1, mem_req_type);
                pkt2 = new Packet(req2, mem_req_type);
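                // point each packet at its portion of this lane's slice of
                // d_data: pkt2's data begins where pkt1's ends, i.e.,
                // req1->getSize()/sizeof(T) elements into the slice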
                pkt1->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                pkt2->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N +
                                         req1->getSize()/sizeof(T)]);
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
                        "request for %#x\n", gpuDynInst->cu_id,
                        gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
                        split_addr);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
            } else {
                gpuDynInst->setStatusVector(lane, 1);
                gpuDynInst->setRequestFlags(req);
                pkt = new Packet(req, mem_req_type);
                pkt->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
            }
        } else { // if lane is not active, then no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
    }
}
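/**
 * Typical usage (illustrative sketch only): callers in op_encodings are
 * expected to instantiate this helper with the element type and per-lane
 * element count of the instruction. The exact types and call sites below
 * are assumptions for illustration, not part of this header:
 *
 *   // vector load of one 32-bit element per active lane into d_data
 *   initMemReqHelper<VecElemU32, 1>(gpuDynInst, MemCmd::ReadReq);
 *
 *   // vector store of four 32-bit elements per active lane from d_data
 *   initMemReqHelper<VecElemU32, 4>(gpuDynInst, MemCmd::WriteReq);
 *
 *   // 32-bit atomic; operands staged in a_data/x_data before the call
 *   initMemReqHelper<VecElemU32, 1>(gpuDynInst, MemCmd::SwapReq, true);
 */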

/**
 * Helper function for scalar instructions declared in op_encodings. This
 * function takes all of the arguments needed to initialize a given memory
 * request, then submits either a single request or, if the access is
 * misaligned across a cache-line boundary, two split requests.
 */
template<typename T, int N>
inline void
initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = gpuDynInst->scalarAddr;

    /**
     * the base address of the cache line where the last byte of
     * the request will be stored.
     */
    Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

    assert(split_addr <= vaddr || split_addr - vaddr < block_size);
    /**
     * if the base cache line address of the last byte is greater
     * than the address of the first byte, then the access straddles
     * two cache lines and is misaligned.
     */
    bool misaligned_acc = split_addr > vaddr;

    RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
        gpuDynInst->computeUnit()->requestorId(), 0,
        gpuDynInst->wfDynId);

    if (misaligned_acc) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);
        gpuDynInst->numScalarReqs = 2;
        gpuDynInst->setRequestFlags(req1);
        gpuDynInst->setRequestFlags(req2);
        PacketPtr pkt1 = new Packet(req1, mem_req_type);
        PacketPtr pkt2 = new Packet(req2, mem_req_type);
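        // pkt1 covers the bytes below split_addr; pkt2's data begins
        // immediately after, req1->getSize() bytes into scalar_data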
        pkt1->dataStatic(gpuDynInst->scalar_data);
        pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
                " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, split_addr);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
    } else {
        gpuDynInst->numScalarReqs = 1;
        gpuDynInst->setRequestFlags(req);
        PacketPtr pkt = new Packet(req, mem_req_type);
        pkt->dataStatic(gpuDynInst->scalar_data);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
    }
}
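/**
 * Typical usage (illustrative sketch only): scalar memory (SMEM) loads and
 * stores are expected to call this helper with the scalar register type and
 * the number of dwords moved. The type and call below are assumptions for
 * illustration, not part of this header:
 *
 *   // scalar load of four dwords into scalar_data
 *   initMemReqScalarHelper<ScalarRegU32, 4>(gpuDynInst, MemCmd::ReadReq);
 */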

} // namespace gem5

#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__