| /* |
| * Copyright (c) 2015-2018 Advanced Micro Devices, Inc. |
| * All rights reserved. |
| * |
| * For use for simulation and test purposes only |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * 3. Neither the name of the copyright holder nor the names of its |
| * contributors may be used to endorse or promote products derived from this |
| * software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| * POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "gpu-compute/gpu_compute_driver.hh" |
| |
| #include "cpu/thread_context.hh" |
| #include "debug/GPUDriver.hh" |
| #include "dev/hsa/hsa_device.hh" |
| #include "dev/hsa/hsa_packet_processor.hh" |
| #include "dev/hsa/kfd_event_defines.h" |
| #include "dev/hsa/kfd_ioctl.h" |
| #include "params/GPUComputeDriver.hh" |
| #include "sim/syscall_emul_buf.hh" |
| |
| GPUComputeDriver::GPUComputeDriver(const Params &p) |
| : HSADriver(p) |
| { |
| device->attachDriver(this); |
| DPRINTF(GPUDriver, "Constructing KFD: device\n"); |
| } |
| |
| int |
| GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) |
| { |
| auto &virt_proxy = tc->getVirtProxy(); |
| |
| switch (req) { |
| case AMDKFD_IOC_GET_VERSION: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n"); |
| |
| TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf); |
| args->major_version = KFD_IOCTL_MAJOR_VERSION; |
| args->minor_version = KFD_IOCTL_MINOR_VERSION; |
| |
| args.copyOut(virt_proxy); |
| } |
| break; |
| case AMDKFD_IOC_CREATE_QUEUE: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n"); |
| |
| allocateQueue(tc, ioc_buf); |
| |
| DPRINTF(GPUDriver, "Creating queue %d\n", queueId); |
| } |
| break; |
| case AMDKFD_IOC_DESTROY_QUEUE: |
| { |
| TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf); |
| args.copyIn(virt_proxy); |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \ |
| "queue offset %d\n", args->queue_id); |
| device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id); |
| } |
| break; |
| case AMDKFD_IOC_SET_MEMORY_POLICY: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n"); |
| } |
| break; |
| case AMDKFD_IOC_GET_CLOCK_COUNTERS: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n"); |
| |
| TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf); |
| args.copyIn(virt_proxy); |
| |
| // Set nanosecond resolution |
| args->system_clock_freq = 1000000000; |
| |
| /** |
| * Derive all clock counters based on the tick. All |
| * device clocks are identical and perfectly in sync. |
| */ |
| uint64_t elapsed_nsec = curTick() / SimClock::Int::ns; |
| args->gpu_clock_counter = elapsed_nsec; |
| args->cpu_clock_counter = elapsed_nsec; |
| args->system_clock_counter = elapsed_nsec; |
| |
| args.copyOut(virt_proxy); |
| } |
| break; |
| case AMDKFD_IOC_GET_PROCESS_APERTURES: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n"); |
| |
| TypedBufferArg<kfd_ioctl_get_process_apertures_args> args(ioc_buf); |
| args->num_of_nodes = 1; |
| |
| /** |
| * Set the GPUVM/LDS/Scratch APEs exactly as they |
| * are in the real driver, see the KFD driver |
| * in the ROCm Linux kernel source: |
| * drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c |
| */ |
| for (int i = 0; i < args->num_of_nodes; ++i) { |
| /** |
| * While the GPU node numbers start at 0, we add 1 |
| * to force the count to start at 1. This is to |
| * ensure that the base/limit addresses are |
| * calculated correctly. |
| */ |
| args->process_apertures[i].scratch_base |
| = scratchApeBase(i + 1); |
| args->process_apertures[i].scratch_limit = |
| scratchApeLimit(args->process_apertures[i].scratch_base); |
| |
| args->process_apertures[i].lds_base = ldsApeBase(i + 1); |
| args->process_apertures[i].lds_limit = |
| ldsApeLimit(args->process_apertures[i].lds_base); |
| |
| args->process_apertures[i].gpuvm_base = gpuVmApeBase(i + 1); |
| args->process_apertures[i].gpuvm_limit = |
| gpuVmApeLimit(args->process_apertures[i].gpuvm_base); |
| |
| // NOTE: Must match ID populated by hsaTopology.py |
| args->process_apertures[i].gpu_id = 2765; |
| |
| DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i, |
| args->process_apertures[i].gpuvm_base); |
| DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i, |
| args->process_apertures[i].gpuvm_limit); |
| |
| DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i, |
| args->process_apertures[i].lds_base); |
| DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i, |
| args->process_apertures[i].lds_limit); |
| |
| DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i, |
| args->process_apertures[i].scratch_base); |
| DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i, |
| args->process_apertures[i].scratch_limit); |
| |
| /** |
| * The CPU's 64b address space can only use the |
| * areas with VA[63:47] == 0x1ffff or VA[63:47] == 0, |
| * therefore we must ensure that the apertures do not |
| * fall in the CPU's address space. |
| */ |
| assert(bits<Addr>(args->process_apertures[i].scratch_base, 63, |
| 47) != 0x1ffff); |
| assert(bits<Addr>(args->process_apertures[i].scratch_base, 63, |
| 47) != 0); |
| assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63, |
| 47) != 0x1ffff); |
| assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63, |
| 47) != 0); |
| assert(bits<Addr>(args->process_apertures[i].lds_base, 63, |
| 47) != 0x1ffff); |
| assert(bits<Addr>(args->process_apertures[i].lds_base, 63, |
| 47) != 0); |
| assert(bits<Addr>(args->process_apertures[i].lds_limit, 63, |
| 47) != 0x1ffff); |
| assert(bits<Addr>(args->process_apertures[i].lds_limit, 63, |
| 47) != 0); |
| assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63, |
| 47) != 0x1ffff); |
| assert(bits<Addr>(args->process_apertures[i].gpuvm_base, 63, |
| 47) != 0); |
| assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63, |
| 47) != 0x1ffff); |
| assert(bits<Addr>(args->process_apertures[i].gpuvm_limit, 63, |
| 47) != 0); |
| } |
| |
| args.copyOut(virt_proxy); |
| } |
| break; |
| case AMDKFD_IOC_UPDATE_QUEUE: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n"); |
| } |
| break; |
| case AMDKFD_IOC_CREATE_EVENT: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_EVENT\n"); |
| |
| TypedBufferArg<kfd_ioctl_create_event_args> args(ioc_buf); |
| args.copyIn(virt_proxy); |
| if (args->event_type != KFD_IOC_EVENT_SIGNAL) { |
| fatal("Signal events are only supported currently\n"); |
| } else if (eventSlotIndex == SLOTS_PER_PAGE) { |
| fatal("Signal event wasn't created; signal limit reached\n"); |
| } |
| // Currently, we allocate only one signal_page for events. |
| // Note that this signal page is of size 8 * KFD_SIGNAL_EVENT_LIMIT |
| uint64_t page_index = 0; |
| args->event_page_offset = (page_index | KFD_MMAP_TYPE_EVENTS); |
| args->event_page_offset <<= PAGE_SHIFT; |
| // TODO: Currently we support only signal events, hence using |
| // the same ID for both signal slot and event slot |
| args->event_slot_index = eventSlotIndex; |
| args->event_id = eventSlotIndex++; |
| args->event_trigger_data = args->event_id; |
| DPRINTF(GPUDriver, "amdkfd create events" |
| "(event_id: 0x%x, offset: 0x%x)\n", |
| args->event_id, args->event_page_offset); |
| // Since eventSlotIndex is increased everytime a new event is |
| // created ETable at eventSlotIndex(event_id) is guaranteed to be |
| // empty. In a future implementation that reuses deleted event_ids, |
| // we should check if event table at this |
| // eventSlotIndex(event_id) is empty before inserting a new event |
| // table entry |
| ETable.emplace(std::pair<uint32_t, ETEntry>(args->event_id, {})); |
| args.copyOut(virt_proxy); |
| } |
| break; |
| case AMDKFD_IOC_DESTROY_EVENT: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_EVENT\n"); |
| TypedBufferArg<kfd_ioctl_destroy_event_args> args(ioc_buf); |
| args.copyIn(virt_proxy); |
| DPRINTF(GPUDriver, "amdkfd destroying event %d\n", args->event_id); |
| fatal_if(ETable.count(args->event_id) == 0, |
| "Event ID invalid, cannot destroy this event\n"); |
| ETable.erase(args->event_id); |
| } |
| break; |
| case AMDKFD_IOC_SET_EVENT: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_SET_EVENTS\n"); |
| TypedBufferArg<kfd_ioctl_set_event_args> args(ioc_buf); |
| args.copyIn(virt_proxy); |
| DPRINTF(GPUDriver, "amdkfd set event %d\n", args->event_id); |
| fatal_if(ETable.count(args->event_id) == 0, |
| "Event ID invlaid, cannot set this event\n"); |
| ETable[args->event_id].setEvent = true; |
| signalWakeupEvent(args->event_id); |
| } |
| break; |
| case AMDKFD_IOC_RESET_EVENT: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n"); |
| } |
| break; |
| case AMDKFD_IOC_WAIT_EVENTS: |
| { |
| DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_WAIT_EVENTS\n"); |
| TypedBufferArg<kfd_ioctl_wait_events_args> args(ioc_buf); |
| args.copyIn(virt_proxy); |
| kfd_event_data *events = |
| (kfd_event_data *)args->events_ptr; |
| DPRINTF(GPUDriver, "amdkfd wait for events" |
| "(wait on all: %d, timeout : %d, num_events: %s)\n", |
| args->wait_for_all, args->timeout, args->num_events); |
| panic_if(args->wait_for_all != 0 && args->num_events > 1, |
| "Wait for all events not supported\n"); |
| bool should_sleep = true; |
| if (TCEvents.count(tc) == 0) { |
| // This thread context trying to wait on an event for the first |
| // time, initialize it. |
| TCEvents.emplace(std::piecewise_construct, std::make_tuple(tc), |
| std::make_tuple(this, tc)); |
| DPRINTF(GPUDriver, "\tamdkfd creating event list" |
| " for thread %d\n", tc->cpuId()); |
| } |
| panic_if(TCEvents[tc].signalEvents.size() != 0, |
| "There are %d events that put this thread to sleep," |
| " this thread should not be running\n", |
| TCEvents[tc].signalEvents.size()); |
| for (int i = 0; i < args->num_events; i++) { |
| panic_if(!events, |
| "Event pointer invalid\n"); |
| Addr eventDataAddr = (Addr)(events + i); |
| TypedBufferArg<kfd_event_data> EventData( |
| eventDataAddr, sizeof(kfd_event_data)); |
| EventData.copyIn(virt_proxy); |
| DPRINTF(GPUDriver, |
| "\tamdkfd wait for event %d\n", EventData->event_id); |
| panic_if(ETable.count(EventData->event_id) == 0, |
| "Event ID invalid, cannot set this event\n"); |
| panic_if(ETable[EventData->event_id].threadWaiting, |
| "Multiple threads waiting on the same event\n"); |
| if (ETable[EventData->event_id].setEvent) { |
| // If event is already set, the event has already happened. |
| // Just unset the event and dont put this thread to sleep. |
| ETable[EventData->event_id].setEvent = false; |
| should_sleep = false; |
| } |
| if (should_sleep) { |
| // Put this thread to sleep |
| ETable[EventData->event_id].threadWaiting = true; |
| ETable[EventData->event_id].tc = tc; |
| TCEvents[tc].signalEvents.insert(EventData->event_id); |
| } |
| } |
| |
| // TODO: Return the correct wait_result back. Currently, returning |
| // success for both KFD_WAIT_TIMEOUT and KFD_WAIT_COMPLETE. |
| // Ideally, this needs to be done after the event is triggered and |
| // after the thread is woken up. |
| args->wait_result = 0; |
| args.copyOut(virt_proxy); |
| if (should_sleep) { |
| // Put this thread to sleep |
| sleepCPU(tc, args->timeout); |
| } else { |
| // Remove events that tried to put this thread to sleep |
| TCEvents[tc].clearEvents(); |
| } |
| } |
| break; |
| case AMDKFD_IOC_DBG_REGISTER: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n"); |
| } |
| break; |
| case AMDKFD_IOC_DBG_UNREGISTER: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n"); |
| } |
| break; |
| case AMDKFD_IOC_DBG_ADDRESS_WATCH: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n"); |
| } |
| break; |
| case AMDKFD_IOC_DBG_WAVE_CONTROL: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n"); |
| } |
| break; |
| case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n"); |
| } |
| break; |
| case AMDKFD_IOC_FREE_MEMORY_OF_GPU: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n"); |
| } |
| break; |
| case AMDKFD_IOC_MAP_MEMORY_TO_GPU: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n"); |
| } |
| break; |
| case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n"); |
| } |
| break; |
| case AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH\n"); |
| } |
| break; |
| case AMDKFD_IOC_SET_CU_MASK: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n"); |
| } |
| break; |
| case AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE" |
| "\n"); |
| } |
| break; |
| case AMDKFD_IOC_SET_TRAP_HANDLER: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n"); |
| } |
| break; |
| case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW: |
| { |
| DPRINTF(GPUDriver, |
| "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n"); |
| |
| TypedBufferArg<kfd_ioctl_get_process_apertures_new_args> |
| ioc_args(ioc_buf); |
| |
| ioc_args.copyIn(virt_proxy); |
| ioc_args->num_of_nodes = 1; |
| |
| for (int i = 0; i < ioc_args->num_of_nodes; ++i) { |
| TypedBufferArg<kfd_process_device_apertures> ape_args |
| (ioc_args->kfd_process_device_apertures_ptr); |
| |
| ape_args->scratch_base = scratchApeBase(i + 1); |
| ape_args->scratch_limit = |
| scratchApeLimit(ape_args->scratch_base); |
| ape_args->lds_base = ldsApeBase(i + 1); |
| ape_args->lds_limit = ldsApeLimit(ape_args->lds_base); |
| ape_args->gpuvm_base = gpuVmApeBase(i + 1); |
| ape_args->gpuvm_limit = gpuVmApeLimit(ape_args->gpuvm_base); |
| |
| ape_args->gpu_id = 2765; |
| |
| assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff); |
| assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0); |
| assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff); |
| assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0); |
| assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff); |
| assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0); |
| assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff); |
| assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0); |
| assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0x1ffff); |
| assert(bits<Addr>(ape_args->gpuvm_base, 63, 47) != 0); |
| assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0x1ffff); |
| assert(bits<Addr>(ape_args->gpuvm_limit, 63, 47) != 0); |
| |
| ape_args.copyOut(virt_proxy); |
| } |
| |
| ioc_args.copyOut(virt_proxy); |
| } |
| break; |
| case AMDKFD_IOC_GET_DMABUF_INFO: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n"); |
| } |
| break; |
| case AMDKFD_IOC_IMPORT_DMABUF: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n"); |
| } |
| break; |
| case AMDKFD_IOC_GET_TILE_CONFIG: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n"); |
| } |
| break; |
| case AMDKFD_IOC_IPC_IMPORT_HANDLE: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_IPC_IMPORT_HANDLE\n"); |
| } |
| break; |
| case AMDKFD_IOC_IPC_EXPORT_HANDLE: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_IPC_EXPORT_HANDLE\n"); |
| } |
| break; |
| case AMDKFD_IOC_CROSS_MEMORY_COPY: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_CROSS_MEMORY_COPY\n"); |
| } |
| break; |
| case AMDKFD_IOC_OPEN_GRAPHIC_HANDLE: |
| { |
| warn("unimplemented ioctl: AMDKFD_IOC_OPEN_GRAPHIC_HANDLE\n"); |
| } |
| break; |
| default: |
| fatal("%s: bad ioctl %d\n", req); |
| break; |
| } |
| return 0; |
| } |
| |
| void |
| GPUComputeDriver::sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout) |
| { |
| // Convert millisecs to ticks |
| Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000); |
| assert(TCEvents.count(tc) == 1); |
| TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay); |
| tc->suspend(); |
| DPRINTF(GPUDriver, |
| "CPU %d is put to sleep\n", tc->cpuId()); |
| } |
| |
| Addr |
| GPUComputeDriver::gpuVmApeBase(int gpuNum) const |
| { |
| return ((Addr)gpuNum << 61) + 0x1000000000000L; |
| } |
| |
| Addr |
| GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const |
| { |
| return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL; |
| } |
| |
| Addr |
| GPUComputeDriver::scratchApeBase(int gpuNum) const |
| { |
| return ((Addr)gpuNum << 61) + 0x100000000L; |
| } |
| |
| Addr |
| GPUComputeDriver::scratchApeLimit(Addr apeBase) const |
| { |
| return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; |
| } |
| |
| Addr |
| GPUComputeDriver::ldsApeBase(int gpuNum) const |
| { |
| return ((Addr)gpuNum << 61) + 0x0; |
| } |
| |
| Addr |
| GPUComputeDriver::ldsApeLimit(Addr apeBase) const |
| { |
| return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF; |
| } |