dispatcher.cc revision 12126
19665Sandreas.hansson@arm.com/* 29520SN/A * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 39520SN/A * All rights reserved. 49520SN/A * 59520SN/A * For use for simulation and test purposes only 69520SN/A * 79520SN/A * Redistribution and use in source and binary forms, with or without 89520SN/A * modification, are permitted provided that the following conditions are met: 99520SN/A * 109520SN/A * 1. Redistributions of source code must retain the above copyright notice, 119520SN/A * this list of conditions and the following disclaimer. 129520SN/A * 139520SN/A * 2. Redistributions in binary form must reproduce the above copyright notice, 149520SN/A * this list of conditions and the following disclaimer in the documentation 159520SN/A * and/or other materials provided with the distribution. 169520SN/A * 179520SN/A * 3. Neither the name of the copyright holder nor the names of its contributors 189520SN/A * may be used to endorse or promote products derived from this software 199520SN/A * without specific prior written permission. 209520SN/A * 219520SN/A * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 229520SN/A * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 239520SN/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 249520SN/A * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 259520SN/A * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 269520SN/A * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 279520SN/A * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 289520SN/A * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 299520SN/A * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 309520SN/A * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 319520SN/A * POSSIBILITY OF SUCH DAMAGE. 329520SN/A * 339520SN/A * Author: Brad Beckmann, Marc Orr 349520SN/A */ 359520SN/A 369520SN/A 379665Sandreas.hansson@arm.com#include "gpu-compute/dispatcher.hh" 389520SN/A 399520SN/A#include "cpu/base.hh" 409520SN/A#include "debug/GPUDisp.hh" 419520SN/A#include "gpu-compute/cl_driver.hh" 429520SN/A#include "gpu-compute/cl_event.hh" 439520SN/A#include "gpu-compute/shader.hh" 449665Sandreas.hansson@arm.com#include "gpu-compute/wavefront.hh" 459665Sandreas.hansson@arm.com#include "mem/packet_access.hh" 469665Sandreas.hansson@arm.com 479520SN/AGpuDispatcher *GpuDispatcher::instance = nullptr; 489665Sandreas.hansson@arm.com 499665Sandreas.hansson@arm.comGpuDispatcher::GpuDispatcher(const Params *p) 509665Sandreas.hansson@arm.com : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")), 519665Sandreas.hansson@arm.com pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), 529665Sandreas.hansson@arm.com dispatchCount(0), dispatchActive(false), cpu(p->cpu), 539728Sandreas.hansson@arm.com shader(p->shader_pointer), driver(p->cl_driver), 549728Sandreas.hansson@arm.com tickEvent([this]{ exec(); }, "GPU Dispatcher tick", 559728Sandreas.hansson@arm.com false, Event::CPU_Tick_Pri) 569728Sandreas.hansson@arm.com{ 579520SN/A shader->handshake(this); 589520SN/A driver->handshake(this); 599665Sandreas.hansson@arm.com 609665Sandreas.hansson@arm.com ndRange.wg_disp_rem = false; 619665Sandreas.hansson@arm.com ndRange.globalWgId = 0; 629520SN/A 639520SN/A schedule(&tickEvent, 0); 649665Sandreas.hansson@arm.com 659665Sandreas.hansson@arm.com // translation port for the dispatcher 669520SN/A tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); 679520SN/A 689520SN/A num_kernelLaunched 699520SN/A .name(name() + ".num_kernel_launched") 709665Sandreas.hansson@arm.com .desc("number of kernel launched") 719665Sandreas.hansson@arm.com ; 729520SN/A} 739520SN/A 749520SN/AGpuDispatcher *GpuDispatcherParams::create() 759520SN/A{ 769665Sandreas.hansson@arm.com GpuDispatcher *dispatcher = new GpuDispatcher(this); 779520SN/A GpuDispatcher::setInstance(dispatcher); 789665Sandreas.hansson@arm.com 799520SN/A return GpuDispatcher::getInstance(); 809520SN/A} 819665Sandreas.hansson@arm.com 829665Sandreas.hansson@arm.comvoid 839520SN/AGpuDispatcher::serialize(CheckpointOut &cp) const 849665Sandreas.hansson@arm.com{ 859520SN/A Tick event_tick = 0; 869520SN/A 879665Sandreas.hansson@arm.com if (ndRange.wg_disp_rem) 889665Sandreas.hansson@arm.com fatal("Checkpointing not supported during active workgroup execution"); 899520SN/A 909665Sandreas.hansson@arm.com if (tickEvent.scheduled()) 919520SN/A event_tick = tickEvent.when(); 929665Sandreas.hansson@arm.com 939520SN/A SERIALIZE_SCALAR(event_tick); 949520SN/A 959520SN/A} 969520SN/A 979520SN/Avoid 989520SN/AGpuDispatcher::unserialize(CheckpointIn &cp) 999520SN/A{ 1009520SN/A Tick event_tick; 1019520SN/A 1029665Sandreas.hansson@arm.com if (tickEvent.scheduled()) 1039665Sandreas.hansson@arm.com deschedule(&tickEvent); 1049665Sandreas.hansson@arm.com 1059520SN/A UNSERIALIZE_SCALAR(event_tick); 1069520SN/A 1079665Sandreas.hansson@arm.com if (event_tick) 1089665Sandreas.hansson@arm.com schedule(&tickEvent, event_tick); 1099665Sandreas.hansson@arm.com} 1109520SN/A 1119665Sandreas.hansson@arm.comAddrRangeList 1129665Sandreas.hansson@arm.comGpuDispatcher::getAddrRanges() const 1139665Sandreas.hansson@arm.com{ 1149520SN/A AddrRangeList ranges; 1159665Sandreas.hansson@arm.com 1169520SN/A DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", 1179665Sandreas.hansson@arm.com pioAddr, pioSize); 1189665Sandreas.hansson@arm.com 1199665Sandreas.hansson@arm.com ranges.push_back(RangeSize(pioAddr, pioSize)); 1209520SN/A 1219665Sandreas.hansson@arm.com return ranges; 1229665Sandreas.hansson@arm.com} 1239520SN/A 1249665Sandreas.hansson@arm.comTick 1259520SN/AGpuDispatcher::read(PacketPtr pkt) 1269665Sandreas.hansson@arm.com{ 1279836Sandreas.hansson@arm.com assert(pkt->getAddr() >= pioAddr); 1289836Sandreas.hansson@arm.com assert(pkt->getAddr() < pioAddr + pioSize); 1299836Sandreas.hansson@arm.com 1309836Sandreas.hansson@arm.com int offset = pkt->getAddr() - pioAddr; 1319836Sandreas.hansson@arm.com pkt->allocate(); 1329836Sandreas.hansson@arm.com 1339836Sandreas.hansson@arm.com DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); 1349836Sandreas.hansson@arm.com 1359836Sandreas.hansson@arm.com if (offset < 8) { 1369836Sandreas.hansson@arm.com assert(!offset); 1379836Sandreas.hansson@arm.com assert(pkt->getSize() == 8); 1389836Sandreas.hansson@arm.com 1399836Sandreas.hansson@arm.com uint64_t retval = dispatchActive; 1409836Sandreas.hansson@arm.com pkt->set(retval); 1419836Sandreas.hansson@arm.com } else { 1429836Sandreas.hansson@arm.com offset -= 8; 1439836Sandreas.hansson@arm.com assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); 1449836Sandreas.hansson@arm.com char *curTaskPtr = (char*)&curTask; 1459836Sandreas.hansson@arm.com 1469836Sandreas.hansson@arm.com memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize()); 1479836Sandreas.hansson@arm.com } 1489836Sandreas.hansson@arm.com 1499836Sandreas.hansson@arm.com pkt->makeAtomicResponse(); 1509836Sandreas.hansson@arm.com 1519836Sandreas.hansson@arm.com return pioDelay; 1529836Sandreas.hansson@arm.com} 1539836Sandreas.hansson@arm.com 1549836Sandreas.hansson@arm.comTick 1559836Sandreas.hansson@arm.comGpuDispatcher::write(PacketPtr pkt) 1569836Sandreas.hansson@arm.com{ 1579836Sandreas.hansson@arm.com assert(pkt->getAddr() >= pioAddr); 1589836Sandreas.hansson@arm.com assert(pkt->getAddr() < pioAddr + pioSize); 1599836Sandreas.hansson@arm.com 1609836Sandreas.hansson@arm.com int offset = pkt->getAddr() - pioAddr; 1619836Sandreas.hansson@arm.com 1629836Sandreas.hansson@arm.com#if TRACING_ON 1639836Sandreas.hansson@arm.com uint64_t data_val = 0; 1649836Sandreas.hansson@arm.com 1659836Sandreas.hansson@arm.com switch (pkt->getSize()) { 1669836Sandreas.hansson@arm.com case 1: 1679836Sandreas.hansson@arm.com data_val = pkt->get<uint8_t>(); 1689836Sandreas.hansson@arm.com break; 1699836Sandreas.hansson@arm.com case 2: 1709836Sandreas.hansson@arm.com data_val = pkt->get<uint16_t>(); 1719836Sandreas.hansson@arm.com break; 1729836Sandreas.hansson@arm.com case 4: 1739836Sandreas.hansson@arm.com data_val = pkt->get<uint32_t>(); 1749836Sandreas.hansson@arm.com break; 1759836Sandreas.hansson@arm.com case 8: 1769836Sandreas.hansson@arm.com data_val = pkt->get<uint64_t>(); 1779836Sandreas.hansson@arm.com break; 1789836Sandreas.hansson@arm.com default: 1799836Sandreas.hansson@arm.com DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); 1809836Sandreas.hansson@arm.com } 1819836Sandreas.hansson@arm.com 1829836Sandreas.hansson@arm.com DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, 1839836Sandreas.hansson@arm.com pkt->getSize()); 1849836Sandreas.hansson@arm.com#endif 1859836Sandreas.hansson@arm.com if (!offset) { 1869836Sandreas.hansson@arm.com static int nextId = 0; 1879836Sandreas.hansson@arm.com 1889836Sandreas.hansson@arm.com // The depends field of the qstruct, which was previously unused, is 1899836Sandreas.hansson@arm.com // used to communicate with simulated application. 1909836Sandreas.hansson@arm.com if (curTask.depends) { 1919836Sandreas.hansson@arm.com HostState hs; 1929836Sandreas.hansson@arm.com shader->ReadMem((uint64_t)(curTask.depends), &hs, 1939836Sandreas.hansson@arm.com sizeof(HostState), 0); 194 195 // update event start time (in nano-seconds) 196 uint64_t start = curTick() / 1000; 197 198 shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), 199 &start, sizeof(uint64_t), 0); 200 } 201 202 // launch kernel 203 ++num_kernelLaunched; 204 205 NDRange *ndr = &(ndRangeMap[nextId]); 206 // copy dispatch info 207 ndr->q = curTask; 208 209 // update the numDispTask polled by the runtime 210 accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); 211 212 ndr->numWgTotal = 1; 213 214 for (int i = 0; i < 3; ++i) { 215 ndr->wgId[i] = 0; 216 ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); 217 ndr->numWgTotal *= ndr->numWg[i]; 218 } 219 220 ndr->numWgCompleted = 0; 221 ndr->globalWgId = 0; 222 ndr->wg_disp_rem = true; 223 ndr->execDone = false; 224 ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; 225 ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; 226 ndr->dispatchId = nextId; 227 ndr->curCid = pkt->req->contextId(); 228 DPRINTF(GPUDisp, "launching kernel %d\n",nextId); 229 execIds.push(nextId); 230 ++nextId; 231 232 dispatchActive = true; 233 234 if (!tickEvent.scheduled()) { 235 schedule(&tickEvent, curTick() + shader->ticks(1)); 236 } 237 } else { 238 // populate current task struct 239 // first 64 bits are launch reg 240 offset -= 8; 241 assert(offset < sizeof(HsaQueueEntry)); 242 char *curTaskPtr = (char*)&curTask; 243 memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize()); 244 } 245 246 pkt->makeAtomicResponse(); 247 248 return pioDelay; 249} 250 251 252BaseMasterPort& 253GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx) 254{ 255 if (if_name == "translation_port") { 256 return *tlbPort; 257 } 258 259 return DmaDevice::getMasterPort(if_name, idx); 260} 261 262void 263GpuDispatcher::exec() 264{ 265 int fail_count = 0; 266 267 // There are potentially multiple outstanding kernel launches. 268 // It is possible that the workgroups in a different kernel 269 // can fit on the GPU even if another kernel's workgroups cannot 270 DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); 271 272 while (execIds.size() > fail_count) { 273 int execId = execIds.front(); 274 275 while (ndRangeMap[execId].wg_disp_rem) { 276 //update the thread context 277 shader->updateContext(ndRangeMap[execId].curCid); 278 279 // attempt to dispatch_workgroup 280 if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { 281 // if we failed try the next kernel, 282 // it may have smaller workgroups. 283 // put it on the queue to rety latter 284 DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); 285 execIds.push(execId); 286 ++fail_count; 287 break; 288 } 289 } 290 // let's try the next kernel_id 291 execIds.pop(); 292 } 293 294 DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); 295 296 if (doneIds.size() && cpu) { 297 shader->hostWakeUp(cpu); 298 } 299 300 while (doneIds.size()) { 301 // wakeup the CPU if any Kernels completed this cycle 302 DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); 303 doneIds.pop(); 304 } 305} 306 307void 308GpuDispatcher::notifyWgCompl(Wavefront *w) 309{ 310 int kern_id = w->kernId; 311 DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); 312 assert(ndRangeMap[kern_id].dispatchId == kern_id); 313 ndRangeMap[kern_id].numWgCompleted++; 314 315 if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { 316 ndRangeMap[kern_id].execDone = true; 317 doneIds.push(kern_id); 318 319 if (ndRangeMap[kern_id].addrToNotify) { 320 accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, 321 0); 322 } 323 324 accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); 325 326 // update event end time (in nano-seconds) 327 if (ndRangeMap[kern_id].q.depends) { 328 HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; 329 uint64_t event; 330 shader->ReadMem((uint64_t)(&host_state->event), &event, 331 sizeof(uint64_t), 0); 332 333 uint64_t end = curTick() / 1000; 334 335 shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, 336 sizeof(uint64_t), 0); 337 } 338 } 339 340 if (!tickEvent.scheduled()) { 341 schedule(&tickEvent, curTick() + shader->ticks(1)); 342 } 343} 344 345void 346GpuDispatcher::scheduleDispatch() 347{ 348 if (!tickEvent.scheduled()) 349 schedule(&tickEvent, curTick() + shader->ticks(1)); 350} 351 352void 353GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) 354{ 355 if (cpu) { 356 if (off) { 357 shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, 358 true); 359 val += off; 360 } 361 362 shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); 363 } else { 364 panic("Cannot find host"); 365 } 366} 367 368// helper functions for driver to retrieve GPU attributes 369int 370GpuDispatcher::getNumCUs() 371{ 372 return shader->cuList.size(); 373} 374 375int 376GpuDispatcher::wfSize() const 377{ 378 return shader->cuList[0]->wfSize(); 379} 380 381void 382GpuDispatcher::setFuncargsSize(int funcargs_size) 383{ 384 shader->funcargs_size = funcargs_size; 385} 386 387uint32_t 388GpuDispatcher::getStaticContextSize() const 389{ 390 return shader->cuList[0]->wfList[0][0]->getStaticContextSize(); 391} 392