dispatcher.cc (12680:91f4d6668b4f → 12697:cd71b966be1e)
/*
 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Brad Beckmann,
 *          Marc Orr,
 *          Anthony Gutierrez
 */

#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"

GpuDispatcher *GpuDispatcher::instance = nullptr;

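// The dispatcher is a memory-mapped (PIO) device: the simulated CL runtime
// writes kernel launch descriptors into its address range, and the
// dispatcher carves them into workgroups that it hands to the shader.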
GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver),
      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
                false, Event::CPU_Tick_Pri)
{
    shader->handshake(this);
    driver->handshake(this);

    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    tlbPort = new TLBPort(csprintf("%s-port", name()), this);

    num_kernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernels launched")
        ;
}

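// Factory method: the dispatcher is a singleton, so the newly constructed
// instance is registered globally before being returned.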
GpuDispatcher *GpuDispatcherParams::create()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}

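// Checkpointing: only the pending tick event needs to be saved; taking a
// checkpoint while a kernel still has workgroups to dispatch is a fatal
// error.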
void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);
}

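// Restore the tick event recorded at checkpoint time, replacing whatever
// is currently scheduled.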
void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
        schedule(&tickEvent, event_tick);
}

AddrRangeList
GpuDispatcher::getAddrRanges() const
{
    AddrRangeList ranges;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);

    ranges.push_back(RangeSize(pioAddr, pioSize));

    return ranges;
}

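// Handle PIO reads. Offsets 0-7 hold the 64-bit dispatch-active flag;
// offsets 8 and up map directly onto the bytes of the staged HsaQueueEntry.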
Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;
    pkt->allocate();

    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        assert(!offset);
        assert(pkt->getSize() == 8);

        uint64_t retval = dispatchActive;
        pkt->set(retval);
    } else {
        offset -= 8;
        assert(offset + pkt->getSize() <= sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<uint8_t>(), curTaskPtr + offset, pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

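// Handle PIO writes. A write at offset 0 is the launch trigger: the
// HsaQueueEntry staged in curTask is turned into an NDRange and queued for
// dispatch. Writes at offsets 8 and up fill in curTask itself.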
Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->get<uint8_t>();
        break;
      case 2:
        data_val = pkt->get<uint16_t>();
        break;
      case 4:
        data_val = pkt->get<uint32_t>();
        break;
      case 8:
        data_val = pkt->get<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset,
            data_val, pkt->getSize());
#endif

    if (!offset) {
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused,
        // is used to communicate with the simulated application.
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update the event start time (in nanoseconds)
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        // launch the kernel
        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy dispatch info
        ndr->q = curTask;

        // bump the numDispLeft counter polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;

        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curCid = pkt->req->contextId();
        DPRINTF(GPUDisp, "launching kernel %d\n", nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate the current task struct; the first 64 bits are the
        // launch register
        offset -= 8;
        assert(offset + pkt->getSize() <= sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getConstPtr<uint8_t>(),
               pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

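// Expose the TLB translation port created in the constructor; all other
// port requests are deferred to DmaDevice.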
BaseMasterPort&
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getMasterPort(if_name, idx);
}

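// Main dispatch loop, driven by tickEvent: cycle through the pending
// kernels, handing workgroups to the shader until no more fit, then drain
// the queue of completed kernels.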
void
GpuDispatcher::exec()
{
    int fail_count = 0;

    // There are potentially multiple outstanding kernel launches.
    // It is possible that the workgroups in a different kernel
    // can fit on the GPU even if another kernel's workgroups cannot.
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    while (execIds.size() > fail_count) {
        int execId = execIds.front();

        while (ndRangeMap[execId].wg_disp_rem) {
            // update the thread context
            shader->updateContext(ndRangeMap[execId].curCid);

            // attempt to dispatch the workgroups
            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if we failed, try the next kernel; it may have smaller
                // workgroups. Push this one back on the queue to retry later.
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
                execIds.push(execId);
                ++fail_count;
                break;
            }
        }
        // move on to the next kernel id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    // wake up the CPU if any kernels completed this cycle
    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }

    while (doneIds.size()) {
        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
        doneIds.pop();
    }
}

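// Called by a wavefront when its workgroup completes. Once every workgroup
// of a kernel has completed, the kernel is marked done, the runtime's
// counters are updated, and the completion time is written into the
// associated CL event (if any).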
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);

        if (ndRangeMap[kern_id].addrToNotify) {
            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify),
                          1, 0);
        }

        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update the event end time (in nanoseconds)
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;
            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
    }
}

void
GpuDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled())
        schedule(&tickEvent, curTick() + shader->ticks(1));
}

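// Read-modify-write an int-sized variable in the host application's
// memory: with a non-zero off the current value is read and incremented by
// off; otherwise val is written as-is. Requires a host CPU to be present.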
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    }
}

// helper functions for the driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
{
    return shader->cuList.size();
}

int
GpuDispatcher::wfSize() const
{
    return shader->cuList[0]->wfSize();
}

void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
    shader->funcargs_size = funcargs_size;
}

uint32_t
GpuDispatcher::getStaticContextSize() const
{
    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
}