// dispatcher.cc revision 12697
18333Snate@binkert.org/* 28333Snate@binkert.org * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc. 38333Snate@binkert.org * All rights reserved. 48333Snate@binkert.org * 58333Snate@binkert.org * For use for simulation and test purposes only 68333Snate@binkert.org * 78333Snate@binkert.org * Redistribution and use in source and binary forms, with or without 88333Snate@binkert.org * modification, are permitted provided that the following conditions are met: 98333Snate@binkert.org * 108333Snate@binkert.org * 1. Redistributions of source code must retain the above copyright notice, 118333Snate@binkert.org * this list of conditions and the following disclaimer. 128333Snate@binkert.org * 138333Snate@binkert.org * 2. Redistributions in binary form must reproduce the above copyright notice, 148333Snate@binkert.org * this list of conditions and the following disclaimer in the documentation 158333Snate@binkert.org * and/or other materials provided with the distribution. 168333Snate@binkert.org * 178333Snate@binkert.org * 3. Neither the name of the copyright holder nor the names of its 188333Snate@binkert.org * contributors may be used to endorse or promote products derived from this 198333Snate@binkert.org * software without specific prior written permission. 208333Snate@binkert.org * 218333Snate@binkert.org * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 228333Snate@binkert.org * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 238333Snate@binkert.org * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 248333Snate@binkert.org * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 258333Snate@binkert.org * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 268333Snate@binkert.org * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 278333Snate@binkert.org * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 288333Snate@binkert.org * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 298333Snate@binkert.org * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 308333Snate@binkert.org * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 318333Snate@binkert.org * POSSIBILITY OF SUCH DAMAGE. 328333Snate@binkert.org * 338333Snate@binkert.org * Authors: Brad Beckmann, 348333Snate@binkert.org * Marc Orr, 358333Snate@binkert.org * Anthony Gutierrez 368333Snate@binkert.org */ 378333Snate@binkert.org 388333Snate@binkert.org 398333Snate@binkert.org#include "gpu-compute/dispatcher.hh" 408333Snate@binkert.org 418333Snate@binkert.org#include "cpu/base.hh" 428333Snate@binkert.org#include "debug/GPUDisp.hh" 438333Snate@binkert.org#include "gpu-compute/cl_driver.hh" 448333Snate@binkert.org#include "gpu-compute/cl_event.hh" 458333Snate@binkert.org#include "gpu-compute/shader.hh" 468333Snate@binkert.org#include "gpu-compute/wavefront.hh" 478333Snate@binkert.org#include "mem/packet_access.hh" 488333Snate@binkert.org 498333Snate@binkert.orgGpuDispatcher *GpuDispatcher::instance = nullptr; 508333Snate@binkert.org 518333Snate@binkert.orgGpuDispatcher::GpuDispatcher(const Params *p) 528333Snate@binkert.org : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")), 538333Snate@binkert.org pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), 548333Snate@binkert.org dispatchCount(0), dispatchActive(false), cpu(p->cpu), 558333Snate@binkert.org shader(p->shader_pointer), driver(p->cl_driver), 568333Snate@binkert.org tickEvent([this]{ exec(); }, "GPU Dispatcher tick", 
578333Snate@binkert.org false, Event::CPU_Tick_Pri) 588333Snate@binkert.org{ 598333Snate@binkert.org shader->handshake(this); 608333Snate@binkert.org driver->handshake(this); 618333Snate@binkert.org 628333Snate@binkert.org ndRange.wg_disp_rem = false; 638333Snate@binkert.org ndRange.globalWgId = 0; 648333Snate@binkert.org 658333Snate@binkert.org schedule(&tickEvent, 0); 668333Snate@binkert.org 678333Snate@binkert.org // translation port for the dispatcher 688333Snate@binkert.org tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); 698333Snate@binkert.org 708333Snate@binkert.org num_kernelLaunched 718333Snate@binkert.org .name(name() + ".num_kernel_launched") 728333Snate@binkert.org .desc("number of kernel launched") 738333Snate@binkert.org ; 748333Snate@binkert.org} 758333Snate@binkert.org 768333Snate@binkert.orgGpuDispatcher *GpuDispatcherParams::create() 778333Snate@binkert.org{ 788333Snate@binkert.org GpuDispatcher *dispatcher = new GpuDispatcher(this); 798333Snate@binkert.org GpuDispatcher::setInstance(dispatcher); 808333Snate@binkert.org 818333Snate@binkert.org return GpuDispatcher::getInstance(); 828333Snate@binkert.org} 838333Snate@binkert.org 848333Snate@binkert.orgvoid 858333Snate@binkert.orgGpuDispatcher::serialize(CheckpointOut &cp) const 868333Snate@binkert.org{ 878333Snate@binkert.org Tick event_tick = 0; 888333Snate@binkert.org 898333Snate@binkert.org if (ndRange.wg_disp_rem) 908333Snate@binkert.org fatal("Checkpointing not supported during active workgroup execution"); 918333Snate@binkert.org 928333Snate@binkert.org if (tickEvent.scheduled()) 938333Snate@binkert.org event_tick = tickEvent.when(); 948333Snate@binkert.org 958333Snate@binkert.org SERIALIZE_SCALAR(event_tick); 968333Snate@binkert.org 978333Snate@binkert.org} 988333Snate@binkert.org 998333Snate@binkert.orgvoid 1008333Snate@binkert.orgGpuDispatcher::unserialize(CheckpointIn &cp) 1018333Snate@binkert.org{ 1028333Snate@binkert.org Tick event_tick; 1038333Snate@binkert.org 
1048333Snate@binkert.org if (tickEvent.scheduled()) 1058333Snate@binkert.org deschedule(&tickEvent); 1068333Snate@binkert.org 1078333Snate@binkert.org UNSERIALIZE_SCALAR(event_tick); 1088333Snate@binkert.org 1098333Snate@binkert.org if (event_tick) 1108333Snate@binkert.org schedule(&tickEvent, event_tick); 1118333Snate@binkert.org} 1128333Snate@binkert.org 1138333Snate@binkert.orgAddrRangeList 1148333Snate@binkert.orgGpuDispatcher::getAddrRanges() const 1158333Snate@binkert.org{ 1168333Snate@binkert.org AddrRangeList ranges; 1178333Snate@binkert.org 1188333Snate@binkert.org DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", 1198333Snate@binkert.org pioAddr, pioSize); 1208333Snate@binkert.org 1218333Snate@binkert.org ranges.push_back(RangeSize(pioAddr, pioSize)); 1228333Snate@binkert.org 1238333Snate@binkert.org return ranges; 1248333Snate@binkert.org} 1258333Snate@binkert.org 1268333Snate@binkert.orgTick 1278333Snate@binkert.orgGpuDispatcher::read(PacketPtr pkt) 1288333Snate@binkert.org{ 1298333Snate@binkert.org assert(pkt->getAddr() >= pioAddr); 1308333Snate@binkert.org assert(pkt->getAddr() < pioAddr + pioSize); 1318333Snate@binkert.org 1328333Snate@binkert.org int offset = pkt->getAddr() - pioAddr; 1338333Snate@binkert.org pkt->allocate(); 1348333Snate@binkert.org 1358333Snate@binkert.org DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); 1368333Snate@binkert.org 1378333Snate@binkert.org if (offset < 8) { 1388333Snate@binkert.org assert(!offset); 1398333Snate@binkert.org assert(pkt->getSize() == 8); 1408333Snate@binkert.org 1418333Snate@binkert.org uint64_t retval = dispatchActive; 1428333Snate@binkert.org pkt->set(retval); 1438333Snate@binkert.org } else { 1448333Snate@binkert.org offset -= 8; 1458333Snate@binkert.org assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); 1468333Snate@binkert.org char *curTaskPtr = (char*)&curTask; 1478333Snate@binkert.org 1488333Snate@binkert.org memcpy(pkt->getPtr<const void*>(), 
curTaskPtr + offset, pkt->getSize()); 1498333Snate@binkert.org } 1508333Snate@binkert.org 1518333Snate@binkert.org pkt->makeAtomicResponse(); 1528333Snate@binkert.org 1538333Snate@binkert.org return pioDelay; 1548333Snate@binkert.org} 1558333Snate@binkert.org 1568333Snate@binkert.orgTick 1578333Snate@binkert.orgGpuDispatcher::write(PacketPtr pkt) 1588333Snate@binkert.org{ 1598333Snate@binkert.org assert(pkt->getAddr() >= pioAddr); 1608333Snate@binkert.org assert(pkt->getAddr() < pioAddr + pioSize); 1618333Snate@binkert.org 1628333Snate@binkert.org int offset = pkt->getAddr() - pioAddr; 1638333Snate@binkert.org 1648333Snate@binkert.org#if TRACING_ON 1658333Snate@binkert.org uint64_t data_val = 0; 1668333Snate@binkert.org 1678333Snate@binkert.org switch (pkt->getSize()) { 1688333Snate@binkert.org case 1: 1698333Snate@binkert.org data_val = pkt->get<uint8_t>(); 1708333Snate@binkert.org break; 1718333Snate@binkert.org case 2: 1728333Snate@binkert.org data_val = pkt->get<uint16_t>(); 1738333Snate@binkert.org break; 1748333Snate@binkert.org case 4: 1758333Snate@binkert.org data_val = pkt->get<uint32_t>(); 1768333Snate@binkert.org break; 1778333Snate@binkert.org case 8: 1788333Snate@binkert.org data_val = pkt->get<uint64_t>(); 1798333Snate@binkert.org break; 1808333Snate@binkert.org default: 1818333Snate@binkert.org DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); 1828333Snate@binkert.org } 1838333Snate@binkert.org 1848333Snate@binkert.org DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, 1858333Snate@binkert.org pkt->getSize()); 1868333Snate@binkert.org#endif 1878333Snate@binkert.org if (!offset) { 1888333Snate@binkert.org static int nextId = 0; 1898333Snate@binkert.org 1908333Snate@binkert.org // The depends field of the qstruct, which was previously unused, is 1918333Snate@binkert.org // used to communicate with simulated application. 
1928333Snate@binkert.org if (curTask.depends) { 1938333Snate@binkert.org HostState hs; 1948333Snate@binkert.org shader->ReadMem((uint64_t)(curTask.depends), &hs, 1958333Snate@binkert.org sizeof(HostState), 0); 1968333Snate@binkert.org 1978333Snate@binkert.org // update event start time (in nano-seconds) 1988333Snate@binkert.org uint64_t start = curTick() / 1000; 1998333Snate@binkert.org 2008333Snate@binkert.org shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), 2018333Snate@binkert.org &start, sizeof(uint64_t), 0); 2028333Snate@binkert.org } 2038333Snate@binkert.org 2048333Snate@binkert.org // launch kernel 2058333Snate@binkert.org ++num_kernelLaunched; 2068333Snate@binkert.org 2078333Snate@binkert.org NDRange *ndr = &(ndRangeMap[nextId]); 2088333Snate@binkert.org // copy dispatch info 2098333Snate@binkert.org ndr->q = curTask; 2108333Snate@binkert.org 2118333Snate@binkert.org // update the numDispTask polled by the runtime 2128333Snate@binkert.org accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); 2138333Snate@binkert.org 2148333Snate@binkert.org ndr->numWgTotal = 1; 2158333Snate@binkert.org 2168333Snate@binkert.org for (int i = 0; i < 3; ++i) { 2178333Snate@binkert.org ndr->wgId[i] = 0; 2188333Snate@binkert.org ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); 2198333Snate@binkert.org ndr->numWgTotal *= ndr->numWg[i]; 2208333Snate@binkert.org } 2218333Snate@binkert.org 2228333Snate@binkert.org ndr->numWgCompleted = 0; 2238333Snate@binkert.org ndr->globalWgId = 0; 2248333Snate@binkert.org ndr->wg_disp_rem = true; 2258333Snate@binkert.org ndr->execDone = false; 2268333Snate@binkert.org ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; 2278333Snate@binkert.org ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; 2288333Snate@binkert.org ndr->dispatchId = nextId; 2298333Snate@binkert.org ndr->curCid = pkt->req->contextId(); 2308333Snate@binkert.org DPRINTF(GPUDisp, "launching kernel %d\n",nextId); 2318333Snate@binkert.org 
execIds.push(nextId); 2328333Snate@binkert.org ++nextId; 2338333Snate@binkert.org 2348333Snate@binkert.org dispatchActive = true; 2358333Snate@binkert.org 2368333Snate@binkert.org if (!tickEvent.scheduled()) { 2378333Snate@binkert.org schedule(&tickEvent, curTick() + shader->ticks(1)); 2388333Snate@binkert.org } 2398333Snate@binkert.org } else { 2408333Snate@binkert.org // populate current task struct 2418333Snate@binkert.org // first 64 bits are launch reg 2428333Snate@binkert.org offset -= 8; 2438333Snate@binkert.org assert(offset < sizeof(HsaQueueEntry)); 2448333Snate@binkert.org char *curTaskPtr = (char*)&curTask; 2458333Snate@binkert.org memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize()); 2468333Snate@binkert.org } 2478333Snate@binkert.org 2488333Snate@binkert.org pkt->makeAtomicResponse(); 2498333Snate@binkert.org 2508333Snate@binkert.org return pioDelay; 2518333Snate@binkert.org} 2528333Snate@binkert.org 2538333Snate@binkert.org 2548333Snate@binkert.orgBaseMasterPort& 2558333Snate@binkert.orgGpuDispatcher::getMasterPort(const std::string &if_name, PortID idx) 2568333Snate@binkert.org{ 2578333Snate@binkert.org if (if_name == "translation_port") { 2588333Snate@binkert.org return *tlbPort; 2598333Snate@binkert.org } 2608333Snate@binkert.org 2618333Snate@binkert.org return DmaDevice::getMasterPort(if_name, idx); 2628333Snate@binkert.org} 2638333Snate@binkert.org 2648333Snate@binkert.orgvoid 2658333Snate@binkert.orgGpuDispatcher::exec() 2668333Snate@binkert.org{ 2678333Snate@binkert.org int fail_count = 0; 2688333Snate@binkert.org 2698333Snate@binkert.org // There are potentially multiple outstanding kernel launches. 
2708333Snate@binkert.org // It is possible that the workgroups in a different kernel 2718333Snate@binkert.org // can fit on the GPU even if another kernel's workgroups cannot 2728333Snate@binkert.org DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); 2738333Snate@binkert.org 274 while (execIds.size() > fail_count) { 275 int execId = execIds.front(); 276 277 while (ndRangeMap[execId].wg_disp_rem) { 278 //update the thread context 279 shader->updateContext(ndRangeMap[execId].curCid); 280 281 // attempt to dispatch_workgroup 282 if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { 283 // if we failed try the next kernel, 284 // it may have smaller workgroups. 285 // put it on the queue to rety latter 286 DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); 287 execIds.push(execId); 288 ++fail_count; 289 break; 290 } 291 } 292 // let's try the next kernel_id 293 execIds.pop(); 294 } 295 296 DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); 297 298 if (doneIds.size() && cpu) { 299 shader->hostWakeUp(cpu); 300 } 301 302 while (doneIds.size()) { 303 // wakeup the CPU if any Kernels completed this cycle 304 DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); 305 doneIds.pop(); 306 } 307} 308 309void 310GpuDispatcher::notifyWgCompl(Wavefront *w) 311{ 312 int kern_id = w->kernId; 313 DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); 314 assert(ndRangeMap[kern_id].dispatchId == kern_id); 315 ndRangeMap[kern_id].numWgCompleted++; 316 317 if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { 318 ndRangeMap[kern_id].execDone = true; 319 doneIds.push(kern_id); 320 321 if (ndRangeMap[kern_id].addrToNotify) { 322 accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, 323 0); 324 } 325 326 accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); 327 328 // update event end time (in nano-seconds) 329 if (ndRangeMap[kern_id].q.depends) { 330 HostState *host_state = 
(HostState*)ndRangeMap[kern_id].q.depends; 331 uint64_t event; 332 shader->ReadMem((uint64_t)(&host_state->event), &event, 333 sizeof(uint64_t), 0); 334 335 uint64_t end = curTick() / 1000; 336 337 shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, 338 sizeof(uint64_t), 0); 339 } 340 } 341 342 if (!tickEvent.scheduled()) { 343 schedule(&tickEvent, curTick() + shader->ticks(1)); 344 } 345} 346 347void 348GpuDispatcher::scheduleDispatch() 349{ 350 if (!tickEvent.scheduled()) 351 schedule(&tickEvent, curTick() + shader->ticks(1)); 352} 353 354void 355GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) 356{ 357 if (cpu) { 358 if (off) { 359 shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, 360 true); 361 val += off; 362 } 363 364 shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); 365 } else { 366 panic("Cannot find host"); 367 } 368} 369 370// helper functions for driver to retrieve GPU attributes 371int 372GpuDispatcher::getNumCUs() 373{ 374 return shader->cuList.size(); 375} 376 377int 378GpuDispatcher::wfSize() const 379{ 380 return shader->cuList[0]->wfSize(); 381} 382 383void 384GpuDispatcher::setFuncargsSize(int funcargs_size) 385{ 386 shader->funcargs_size = funcargs_size; 387} 388 389uint32_t 390GpuDispatcher::getStaticContextSize() const 391{ 392 return shader->cuList[0]->wfList[0][0]->getStaticContextSize(); 393} 394