blob: 9846f41b5072707231652610ddb2d6cb222658c5 [file] [log] [blame]
/*
* Copyright (c) 2018 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Authors: Matt Sinclair
*/
#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#include "arch/gcn3/insts/gpu_static_inst.hh"
#include "arch/gcn3/insts/op_encodings.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
/**
 * Helper function for instructions declared in op_encodings. For every lane
 * that is enabled in the instruction's exec_mask, this function builds a
 * memory request for that lane's address and submits it to the compute unit.
 * A request whose last byte falls in a later cache line than its first byte
 * is split into two requests at that cache line boundary.
 *
 * The per-lane entry of the status vector is set to the number of requests
 * issued for the lane: 2 for a split access, 1 for an aligned access and 0
 * for an inactive lane.
 *
 * @tparam T element type accessed by each lane.
 * @tparam N number of T elements accessed by each lane.
 * @param gpuDynInst dynamic instruction supplying the per-lane addresses,
 *        the exec mask and the data buffers (d_data, a_data, x_data).
 * @param mem_req_type command used to construct every packet.
 * @param is_atomic when true, each lane issues a sizeof(T)-byte request
 *        carrying an atomic op functor built from a_data and x_data; such
 *        a request must be word aligned and must not cross a cache line
 *        (both are asserted).
 */
template<typename T, int N>
inline void
initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
                 bool is_atomic=false)
{
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();

    gpuDynInst->resetEntireStatusVector();

    for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
        if (!gpuDynInst->exec_mask[lane]) {
            // if lane is not active, then no pending requests
            gpuDynInst->setStatusVector(lane, 0);
            continue;
        }

        Addr vaddr = gpuDynInst->addr[lane];

        /**
         * the base address of the cache line where the last
         * byte of the request will be stored.
         */
        Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

        assert(split_addr <= vaddr || split_addr - vaddr < block_size);

        /**
         * if the base cache line address of the last byte is
         * greater than the address of the first byte then we have
         * a misaligned access.
         */
        bool misaligned_acc = split_addr > vaddr;

        RequestPtr req = nullptr;

        if (is_atomic) {
            // make sure request is word aligned
            assert((vaddr & 0x3) == 0);

            // a given lane's atomic can't cross cache lines
            assert(!misaligned_acc);

            req = std::make_shared<Request>(vaddr, sizeof(T), 0,
                gpuDynInst->computeUnit()->requestorId(), 0,
                gpuDynInst->wfDynId,
                gpuDynInst->makeAtomicOpFunctor<T>(
                    &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                    &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
        } else {
            req = std::make_shared<Request>(vaddr, req_size, 0,
                gpuDynInst->computeUnit()->requestorId(), 0,
                gpuDynInst->wfDynId);
        }

        // start of this lane's N elements in the instruction's data buffer
        T *lane_data = &(reinterpret_cast<T*>(gpuDynInst->d_data))[lane * N];

        if (misaligned_acc) {
            RequestPtr req1 = nullptr, req2 = nullptr;

            gpuDynInst->setStatusVector(lane, 2);
            req->splitOnVaddr(split_addr, req1, req2);
            gpuDynInst->setRequestFlags(req1);
            gpuDynInst->setRequestFlags(req2);

            PacketPtr pkt1 = new Packet(req1, mem_req_type);
            PacketPtr pkt2 = new Packet(req2, mem_req_type);

            pkt1->dataStatic(lane_data);
            /**
             * req1->getSize() is a size in bytes, so the second half of
             * the lane's data has to be located with byte arithmetic.
             * Using it as an index into a T array would scale the offset
             * by sizeof(T) and point past the data that belongs to the
             * second request.
             */
            pkt2->dataStatic(
                reinterpret_cast<char*>(lane_data) + req1->getSize());

            DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
                    "request for %#x\n", gpuDynInst->cu_id,
                    gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
                    split_addr);

            gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
            gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
        } else {
            gpuDynInst->setStatusVector(lane, 1);
            gpuDynInst->setRequestFlags(req);

            PacketPtr pkt = new Packet(req, mem_req_type);
            pkt->dataStatic(lane_data);

            gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
        }
    }
}
/**
 * Scalar counterpart of the memory request helper for instructions declared
 * in op_encodings. It builds a single request of N * sizeof(T) bytes at the
 * instruction's scalarAddr and hands it to the compute unit through
 * sendScalarRequest. If the request's last byte lands in a later cache line
 * than its first byte, the request is split in two at that cache line and
 * both halves are sent. numScalarReqs is set to the number of requests sent.
 *
 * @tparam T element type of the access.
 * @tparam N number of T elements in the access.
 * @param gpuDynInst dynamic instruction supplying scalarAddr and the
 *        scalar_data buffer.
 * @param mem_req_type command used to construct every packet.
 */
template<typename T, int N>
inline void
initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    const int num_bytes = N * sizeof(T);
    const int line_bytes = gpuDynInst->computeUnit()->cacheLineSize();
    const Addr base_addr = gpuDynInst->scalarAddr;

    // Base address of the cache line that will hold the request's last byte.
    const Addr last_line_addr =
        roundDown(base_addr + num_bytes - 1, line_bytes);

    assert(last_line_addr <= base_addr ||
           last_line_addr - base_addr < line_bytes);

    // The access is misaligned when its last byte lives in a cache line
    // that starts after the first byte of the access.
    const bool crosses_line = last_line_addr > base_addr;

    RequestPtr whole_req = std::make_shared<Request>(base_addr, num_bytes, 0,
        gpuDynInst->computeUnit()->requestorId(), 0,
        gpuDynInst->wfDynId);

    if (!crosses_line) {
        // Aligned access: a single request covers everything.
        gpuDynInst->numScalarReqs = 1;
        gpuDynInst->setRequestFlags(whole_req);

        PacketPtr whole_pkt = new Packet(whole_req, mem_req_type);
        whole_pkt->dataStatic(gpuDynInst->scalar_data);

        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, whole_pkt);
        return;
    }

    // Misaligned access: split at the cache line boundary and send both
    // halves, each pointing at its own slice of scalar_data.
    RequestPtr low_req, high_req;
    whole_req->splitOnVaddr(last_line_addr, low_req, high_req);

    gpuDynInst->numScalarReqs = 2;
    gpuDynInst->setRequestFlags(low_req);
    gpuDynInst->setRequestFlags(high_req);

    PacketPtr low_pkt = new Packet(low_req, mem_req_type);
    PacketPtr high_pkt = new Packet(high_req, mem_req_type);

    low_pkt->dataStatic(gpuDynInst->scalar_data);
    high_pkt->dataStatic(gpuDynInst->scalar_data + low_req->getSize());

    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
            " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
            gpuDynInst->wfSlotId, last_line_addr);

    gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, low_pkt);
    gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, high_pkt);
}
#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__