// dispatcher.cc — gem5 revision 12680
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
32 * 33 * Author: Brad Beckmann, Marc Orr 34 */ 35 36 37#include "gpu-compute/dispatcher.hh" 38 39#include "cpu/base.hh" 40#include "debug/GPUDisp.hh" 41#include "gpu-compute/cl_driver.hh" 42#include "gpu-compute/cl_event.hh" 43#include "gpu-compute/shader.hh" 44#include "gpu-compute/wavefront.hh" 45#include "mem/packet_access.hh" 46 47GpuDispatcher *GpuDispatcher::instance = nullptr; 48 49GpuDispatcher::GpuDispatcher(const Params *p) 50 : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")), 51 pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency), 52 dispatchCount(0), dispatchActive(false), cpu(p->cpu), 53 shader(p->shader_pointer), driver(p->cl_driver), 54 tickEvent([this]{ exec(); }, "GPU Dispatcher tick", 55 false, Event::CPU_Tick_Pri) 56{ 57 shader->handshake(this); 58 driver->handshake(this); 59 60 ndRange.wg_disp_rem = false; 61 ndRange.globalWgId = 0; 62 63 schedule(&tickEvent, 0); 64 65 // translation port for the dispatcher 66 tlbPort = new TLBPort(csprintf("%s-port%d", name()), this); 67 68 num_kernelLaunched 69 .name(name() + ".num_kernel_launched") 70 .desc("number of kernel launched") 71 ; 72} 73 74GpuDispatcher *GpuDispatcherParams::create() 75{ 76 GpuDispatcher *dispatcher = new GpuDispatcher(this); 77 GpuDispatcher::setInstance(dispatcher); 78 79 return GpuDispatcher::getInstance(); 80} 81 82void 83GpuDispatcher::serialize(CheckpointOut &cp) const 84{ 85 Tick event_tick = 0; 86 87 if (ndRange.wg_disp_rem) 88 fatal("Checkpointing not supported during active workgroup execution"); 89 90 if (tickEvent.scheduled()) 91 event_tick = tickEvent.when(); 92 93 SERIALIZE_SCALAR(event_tick); 94 95} 96 97void 98GpuDispatcher::unserialize(CheckpointIn &cp) 99{ 100 Tick event_tick; 101 102 if (tickEvent.scheduled()) 103 deschedule(&tickEvent); 104 105 UNSERIALIZE_SCALAR(event_tick); 106 107 if (event_tick) 108 schedule(&tickEvent, event_tick); 109} 110 111AddrRangeList 112GpuDispatcher::getAddrRanges() const 113{ 114 AddrRangeList ranges; 115 
116 DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n", 117 pioAddr, pioSize); 118 119 ranges.push_back(RangeSize(pioAddr, pioSize)); 120 121 return ranges; 122} 123 124Tick 125GpuDispatcher::read(PacketPtr pkt) 126{ 127 assert(pkt->getAddr() >= pioAddr); 128 assert(pkt->getAddr() < pioAddr + pioSize); 129 130 int offset = pkt->getAddr() - pioAddr; 131 pkt->allocate(); 132 133 DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize()); 134 135 if (offset < 8) { 136 assert(!offset); 137 assert(pkt->getSize() == 8); 138 139 uint64_t retval = dispatchActive; 140 pkt->set(retval); 141 } else { 142 offset -= 8; 143 assert(offset + pkt->getSize() < sizeof(HsaQueueEntry)); 144 char *curTaskPtr = (char*)&curTask; 145 146 memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize()); 147 } 148 149 pkt->makeAtomicResponse(); 150 151 return pioDelay; 152} 153 154Tick 155GpuDispatcher::write(PacketPtr pkt) 156{ 157 assert(pkt->getAddr() >= pioAddr); 158 assert(pkt->getAddr() < pioAddr + pioSize); 159 160 int offset = pkt->getAddr() - pioAddr; 161 162#if TRACING_ON 163 uint64_t data_val = 0; 164 165 switch (pkt->getSize()) { 166 case 1: 167 data_val = pkt->get<uint8_t>(); 168 break; 169 case 2: 170 data_val = pkt->get<uint16_t>(); 171 break; 172 case 4: 173 data_val = pkt->get<uint32_t>(); 174 break; 175 case 8: 176 data_val = pkt->get<uint64_t>(); 177 break; 178 default: 179 DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize()); 180 } 181 182 DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val, 183 pkt->getSize()); 184#endif 185 if (!offset) { 186 static int nextId = 0; 187 188 // The depends field of the qstruct, which was previously unused, is 189 // used to communicate with simulated application. 
190 if (curTask.depends) { 191 HostState hs; 192 shader->ReadMem((uint64_t)(curTask.depends), &hs, 193 sizeof(HostState), 0); 194 195 // update event start time (in nano-seconds) 196 uint64_t start = curTick() / 1000; 197 198 shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start), 199 &start, sizeof(uint64_t), 0); 200 } 201 202 // launch kernel 203 ++num_kernelLaunched; 204 205 NDRange *ndr = &(ndRangeMap[nextId]); 206 // copy dispatch info 207 ndr->q = curTask; 208 209 // update the numDispTask polled by the runtime 210 accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1); 211 212 ndr->numWgTotal = 1; 213 214 for (int i = 0; i < 3; ++i) { 215 ndr->wgId[i] = 0; 216 ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]); 217 ndr->numWgTotal *= ndr->numWg[i]; 218 } 219 220 ndr->numWgCompleted = 0; 221 ndr->globalWgId = 0; 222 ndr->wg_disp_rem = true; 223 ndr->execDone = false; 224 ndr->addrToNotify = (volatile bool*)curTask.addrToNotify; 225 ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft; 226 ndr->dispatchId = nextId; 227 ndr->curCid = pkt->req->contextId(); 228 DPRINTF(GPUDisp, "launching kernel %d\n",nextId); 229 execIds.push(nextId); 230 ++nextId; 231 232 dispatchActive = true; 233 234 if (!tickEvent.scheduled()) { 235 schedule(&tickEvent, curTick() + shader->ticks(1)); 236 } 237 } else { 238 // populate current task struct 239 // first 64 bits are launch reg 240 offset -= 8; 241 assert(offset < sizeof(HsaQueueEntry)); 242 char *curTaskPtr = (char*)&curTask; 243 memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize()); 244 } 245 246 pkt->makeAtomicResponse(); 247 248 return pioDelay; 249} 250 251 252BaseMasterPort& 253GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx) 254{ 255 if (if_name == "translation_port") { 256 return *tlbPort; 257 } 258 259 return DmaDevice::getMasterPort(if_name, idx); 260} 261 262void 263GpuDispatcher::exec() 264{ 265 int fail_count = 0; 266 267 // There are potentially 
multiple outstanding kernel launches. 268 // It is possible that the workgroups in a different kernel 269 // can fit on the GPU even if another kernel's workgroups cannot 270 DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size()); 271 272 while (execIds.size() > fail_count) { 273 int execId = execIds.front(); 274 275 while (ndRangeMap[execId].wg_disp_rem) { 276 //update the thread context 277 shader->updateContext(ndRangeMap[execId].curCid); 278 279 // attempt to dispatch_workgroup 280 if (!shader->dispatch_workgroups(&ndRangeMap[execId])) { 281 // if we failed try the next kernel, 282 // it may have smaller workgroups. 283 // put it on the queue to rety latter 284 DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId); 285 execIds.push(execId); 286 ++fail_count; 287 break; 288 } 289 } 290 // let's try the next kernel_id 291 execIds.pop(); 292 } 293 294 DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size()); 295 296 if (doneIds.size() && cpu) { 297 shader->hostWakeUp(cpu); 298 } 299 300 while (doneIds.size()) { 301 // wakeup the CPU if any Kernels completed this cycle 302 DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front()); 303 doneIds.pop(); 304 } 305} 306 307void 308GpuDispatcher::notifyWgCompl(Wavefront *w) 309{ 310 int kern_id = w->kernId; 311 DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id); 312 assert(ndRangeMap[kern_id].dispatchId == kern_id); 313 ndRangeMap[kern_id].numWgCompleted++; 314 315 if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) { 316 ndRangeMap[kern_id].execDone = true; 317 doneIds.push(kern_id); 318 319 if (ndRangeMap[kern_id].addrToNotify) { 320 accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1, 321 0); 322 } 323 324 accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1); 325 326 // update event end time (in nano-seconds) 327 if (ndRangeMap[kern_id].q.depends) { 328 HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends; 329 uint64_t event; 330 
shader->ReadMem((uint64_t)(&host_state->event), &event, 331 sizeof(uint64_t), 0); 332 333 uint64_t end = curTick() / 1000; 334 335 shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end, 336 sizeof(uint64_t), 0); 337 } 338 } 339 340 if (!tickEvent.scheduled()) { 341 schedule(&tickEvent, curTick() + shader->ticks(1)); 342 } 343} 344 345void 346GpuDispatcher::scheduleDispatch() 347{ 348 if (!tickEvent.scheduled()) 349 schedule(&tickEvent, curTick() + shader->ticks(1)); 350} 351 352void 353GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) 354{ 355 if (cpu) { 356 if (off) { 357 shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq, 358 true); 359 val += off; 360 } 361 362 shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true); 363 } else { 364 panic("Cannot find host"); 365 } 366} 367 368// helper functions for driver to retrieve GPU attributes 369int 370GpuDispatcher::getNumCUs() 371{ 372 return shader->cuList.size(); 373} 374 375int 376GpuDispatcher::wfSize() const 377{ 378 return shader->cuList[0]->wfSize(); 379} 380 381void 382GpuDispatcher::setFuncargsSize(int funcargs_size) 383{ 384 shader->funcargs_size = funcargs_size; 385} 386 387uint32_t 388GpuDispatcher::getStaticContextSize() const 389{ 390 return shader->cuList[0]->wfList[0][0]->getStaticContextSize(); 391} 392