dispatcher.cc revision 11639
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
 *
 * Author: Brad Beckmann, Marc Orr
 */

#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"

// Singleton instance; set by GpuDispatcherParams::create() so the driver
// and shader can share the one dispatcher via getInstance().
GpuDispatcher *GpuDispatcher::instance = nullptr;

// Construct the dispatcher as a PIO device: perform handshakes with the
// shader and the CL driver, reset the ND-range bookkeeping, schedule the
// first tick, create the TLB translation port, and register stats.
GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
{
    // let the shader and the driver know how to reach this dispatcher
    shader->handshake(this);
    driver->handshake(this);

    // no workgroups are pending dispatch at construction time
    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);

    num_kernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernel launched")
        ;
}

// Python-config factory: build the dispatcher, record it as the
// process-wide singleton, and return it.
GpuDispatcher *GpuDispatcherParams::create()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}

// Checkpoint support: only the pending tick time is saved; checkpointing
// while workgroups are still awaiting dispatch is a fatal error.
void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);

}

// Restore from checkpoint: re-schedule the tick event if one was pending
// (a saved tick of 0 means no event had been scheduled).
void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
        schedule(&tickEvent, event_tick);
}

// Report the single PIO region this device responds to.
AddrRangeList
GpuDispatcher::getAddrRanges() const
{
    AddrRangeList ranges;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);

    ranges.push_back(RangeSize(pioAddr, pioSize));

    return ranges;
}

// PIO read handler. A read of the first quadword (offset 0) returns the
// dispatchActive flag; larger offsets read raw bytes out of the current
// task (HsaQueueEntry) image. Returns the fixed PIO latency.
Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;
    pkt->allocate();

    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        // only a full, aligned 8-byte read of the status register is
        // supported
        assert(!offset);
        assert(pkt->getSize() == 8);

        uint64_t retval = dispatchActive;
        pkt->set(retval);
    } else {
        // reads beyond the first quadword index into the task struct
        offset -= 8;
        // NOTE(review): '<' forbids a read that touches the struct's last
        // byte; '<=' may have been intended — confirm against the register
        // map used by the driver.
        assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

// PIO write handler. A write to offset 0 launches the kernel described by
// the previously-populated curTask; writes to higher offsets fill in the
// curTask (HsaQueueEntry) image byte-by-byte.
Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    // decode the written value purely for the debug trace below
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->get<uint8_t>();
        break;
      case 2:
        data_val = pkt->get<uint16_t>();
        break;
      case 4:
        data_val = pkt->get<uint32_t>();
        break;
      case 8:
        data_val = pkt->get<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
            pkt->getSize());
#endif
    if (!offset) {
        // each launch gets a monotonically increasing dispatch id
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused, is
        // used to communicate with simulated application.
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update event start time (in nano-seconds)
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        // launch kernel
        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy dispatch info
        ndr->q = curTask;

        // update the numDispTask polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;

        // derive the per-dimension workgroup counts from the grid and
        // workgroup sizes supplied in the task
        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curCid = pkt->req->contextId();
        DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        // kick the dispatch loop if it is not already running
        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate current task struct
        // first 64 bits are launch reg
        offset -= 8;
        assert(offset < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}


// Expose the TLB translation port; all other port requests are delegated
// to the base DmaDevice.
BaseMasterPort&
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getMasterPort(if_name, idx);
}

// Tick handler: attempt to dispatch the remaining workgroups of every
// queued kernel onto the shader, then wake the host CPU for any kernels
// that completed this cycle.
void
GpuDispatcher::exec()
{
    int fail_count = 0;

    // There are potentially multiple outstanding kernel launches.
    // It is possible that the workgroups in a different kernel
    // can fit on the GPU even if another kernel's workgroups cannot
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    // stop once every kernel still in the queue has failed to dispatch
    // during this pass (failed kernels are re-queued at the back)
    while (execIds.size() > fail_count) {
        int execId = execIds.front();

        while (ndRangeMap[execId].wg_disp_rem) {
            //update the thread context
            shader->updateContext(ndRangeMap[execId].curCid);

            // attempt to dispatch_workgroup
            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if we failed try the next kernel,
                // it may have smaller workgroups.
                // put it on the queue to retry later
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
                execIds.push(execId);
                ++fail_count;
                break;
            }
        }
        // let's try the next kernel_id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }

    while (doneIds.size()) {
        // wakeup the CPU if any Kernels completed this cycle
        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
        doneIds.pop();
    }
}

// Called by a wavefront when its workgroup completes. When the last
// workgroup of a kernel finishes: mark the kernel done, notify the host
// (completion flag, outstanding-dispatch counter, CL event end
// timestamp), and ensure the dispatch loop runs again.
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);

        // set the host-visible completion flag, if one was requested
        if (ndRangeMap[kern_id].addrToNotify) {
            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
                          0);
        }

        // decrement the runtime's outstanding-dispatch counter
        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update event end time (in nano-seconds)
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;
            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
    }
}

// Request another dispatch pass on the next shader tick (no-op if one is
// already scheduled).
void
GpuDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled())
        schedule(&tickEvent, curTick() + shader->ticks(1));
}

// Access an int-sized user variable in simulated host memory. When off is
// nonzero, the current value is read, off is added, and the result is
// written back (the incoming val is overwritten); otherwise val is written
// directly. Panics if no host CPU is attached.
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    }
}

GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
    : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
{
}

// Each tick simply runs one dispatch pass.
void
GpuDispatcher::TickEvent::process()
{
    dispatcher->exec();
}

const char*
GpuDispatcher::TickEvent::description() const
{
    return "GPU Dispatcher tick";
}

// helper functions for driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
{
    return shader->cuList.size();
}

int
GpuDispatcher::wfSize() const
{
    return shader->cuList[0]->wfSize();
}

void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
    shader->funcargs_size = funcargs_size;
}