/*
 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Brad Beckmann,
 *          Marc Orr,
 *          Anthony Gutierrez
 */

#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"

GpuDispatcher *GpuDispatcher::instance = nullptr;

GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver),
      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
                false, Event::CPU_Tick_Pri)
{
    shader->handshake(this);
    driver->handshake(this);

    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);

    num_kernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernels launched")
        ;
}

GpuDispatcher *GpuDispatcherParams::create()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}

void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);
}

void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

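    // serialize() stored the tick at which the tick event was pending,
    // or 0 if no event was scheduled; restore and re-schedule it here.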
    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
        schedule(&tickEvent, event_tick);
}

AddrRangeList
GpuDispatcher::getAddrRanges() const
{
    AddrRangeList ranges;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);

    ranges.push_back(RangeSize(pioAddr, pioSize));

    return ranges;
}

// Memory-mapped reads: offset 0 holds the 64-bit dispatch-active flag;
// offsets past the first 8 bytes read back the current HsaQueueEntry.
Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;
    pkt->allocate();

    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        assert(!offset);
        assert(pkt->getSize() == 8);

        uint64_t retval = dispatchActive;
        pkt->setLE(retval);
    } else {
        offset -= 8;
        assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<uint8_t>(), curTaskPtr + offset, pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

// Memory-mapped writes: writes past the first 8 bytes populate curTask;
// a write to offset 0 launches the kernel that curTask describes.
Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->getLE<uint8_t>();
        break;
      case 2:
        data_val = pkt->getLE<uint16_t>();
        break;
      case 4:
        data_val = pkt->getLE<uint32_t>();
        break;
      case 8:
        data_val = pkt->getLE<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset,
            data_val, pkt->getSize());
#endif

    if (!offset) {
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused, is
        // used to communicate with the simulated application.
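        // When set, it holds a simulated-memory pointer to a HostState
        // struct; the dispatcher reads it to locate the associated
        // _cl_event and stamps the event's start time here and its end
        // time in notifyWgCompl().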
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update the event start time (in nanoseconds)
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        // launch the kernel
        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy dispatch info
        ndr->q = curTask;

        // increment the dispatch counter (numDispLeft) polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;

        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curCid = pkt->req->contextId();
        DPRINTF(GPUDisp, "launching kernel %d\n", nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate the current task struct;
        // the first 64 bits are the launch register
        offset -= 8;
        assert(offset < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getPtr<uint8_t>(), pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

Port &
GpuDispatcher::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getPort(if_name, idx);
}

void
GpuDispatcher::exec()
{
    int fail_count = 0;

    // There are potentially multiple outstanding kernel launches.
    // It is possible that the workgroups of a different kernel
    // can fit on the GPU even if another kernel's workgroups cannot.
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    while (execIds.size() > fail_count) {
        int execId = execIds.front();

        while (ndRangeMap[execId].wg_disp_rem) {
            // update the thread context
            shader->updateContext(ndRangeMap[execId].curCid);

            // attempt to dispatch workgroups
            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if we failed, try the next kernel;
                // it may have smaller workgroups.
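                // Re-queuing at the back gives every outstanding kernel
                // one dispatch attempt per exec() pass, and fail_count
                // bounds the loop so that a full pass terminates.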
                // put it on the queue to retry later
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
                execIds.push(execId);
                ++fail_count;
                break;
            }
        }
        // let's try the next kernel_id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    // wake up the CPU if any kernels completed this cycle
    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }

    while (doneIds.size()) {
        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
        doneIds.pop();
    }
}

void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);

        if (ndRangeMap[kern_id].addrToNotify) {
            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify),
                          1, 0);
        }

        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update the event end time (in nanoseconds)
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;
            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
    }
}

void
GpuDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled())
        schedule(&tickEvent, curTick() + shader->ticks(1));
}

// Write val to a user-level variable in simulated memory; when off is
// non-zero, read-modify-write the variable by off instead.
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    }
}

// helper functions for the driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
{
    return shader->cuList.size();
}

int
GpuDispatcher::wfSize() const
{
    return shader->cuList[0]->wfSize();
}

void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
    shader->funcargs_size = funcargs_size;
}

uint32_t
GpuDispatcher::getStaticContextSize() const
{
    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
}