dispatcher.cc revision 12126
19665Sandreas.hansson@arm.com/*
29520SN/A * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
39520SN/A * All rights reserved.
49520SN/A *
59520SN/A * For use for simulation and test purposes only
69520SN/A *
79520SN/A * Redistribution and use in source and binary forms, with or without
89520SN/A * modification, are permitted provided that the following conditions are met:
99520SN/A *
109520SN/A * 1. Redistributions of source code must retain the above copyright notice,
119520SN/A * this list of conditions and the following disclaimer.
129520SN/A *
139520SN/A * 2. Redistributions in binary form must reproduce the above copyright notice,
149520SN/A * this list of conditions and the following disclaimer in the documentation
159520SN/A * and/or other materials provided with the distribution.
169520SN/A *
179520SN/A * 3. Neither the name of the copyright holder nor the names of its contributors
189520SN/A * may be used to endorse or promote products derived from this software
199520SN/A * without specific prior written permission.
209520SN/A *
219520SN/A * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
229520SN/A * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
239520SN/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
249520SN/A * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
259520SN/A * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
269520SN/A * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
279520SN/A * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
289520SN/A * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
299520SN/A * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
309520SN/A * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
319520SN/A * POSSIBILITY OF SUCH DAMAGE.
329520SN/A *
339520SN/A * Author: Brad Beckmann, Marc Orr
349520SN/A */
359520SN/A
369520SN/A
379665Sandreas.hansson@arm.com#include "gpu-compute/dispatcher.hh"
389520SN/A
399520SN/A#include "cpu/base.hh"
409520SN/A#include "debug/GPUDisp.hh"
419520SN/A#include "gpu-compute/cl_driver.hh"
429520SN/A#include "gpu-compute/cl_event.hh"
439520SN/A#include "gpu-compute/shader.hh"
449665Sandreas.hansson@arm.com#include "gpu-compute/wavefront.hh"
459665Sandreas.hansson@arm.com#include "mem/packet_access.hh"
469665Sandreas.hansson@arm.com
479520SN/AGpuDispatcher *GpuDispatcher::instance = nullptr;
489665Sandreas.hansson@arm.com
499665Sandreas.hansson@arm.comGpuDispatcher::GpuDispatcher(const Params *p)
509665Sandreas.hansson@arm.com    : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
519665Sandreas.hansson@arm.com      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
529665Sandreas.hansson@arm.com      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
539728Sandreas.hansson@arm.com      shader(p->shader_pointer), driver(p->cl_driver),
549728Sandreas.hansson@arm.com      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
559728Sandreas.hansson@arm.com                false, Event::CPU_Tick_Pri)
569728Sandreas.hansson@arm.com{
579520SN/A    shader->handshake(this);
589520SN/A    driver->handshake(this);
599665Sandreas.hansson@arm.com
609665Sandreas.hansson@arm.com    ndRange.wg_disp_rem = false;
619665Sandreas.hansson@arm.com    ndRange.globalWgId = 0;
629520SN/A
639520SN/A    schedule(&tickEvent, 0);
649665Sandreas.hansson@arm.com
659665Sandreas.hansson@arm.com    // translation port for the dispatcher
669520SN/A    tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
679520SN/A
689520SN/A    num_kernelLaunched
699520SN/A    .name(name() + ".num_kernel_launched")
709665Sandreas.hansson@arm.com    .desc("number of kernel launched")
719665Sandreas.hansson@arm.com    ;
729520SN/A}
739520SN/A
749520SN/AGpuDispatcher *GpuDispatcherParams::create()
759520SN/A{
769665Sandreas.hansson@arm.com    GpuDispatcher *dispatcher = new GpuDispatcher(this);
779520SN/A    GpuDispatcher::setInstance(dispatcher);
789665Sandreas.hansson@arm.com
799520SN/A    return GpuDispatcher::getInstance();
809520SN/A}
819665Sandreas.hansson@arm.com
829665Sandreas.hansson@arm.comvoid
839520SN/AGpuDispatcher::serialize(CheckpointOut &cp) const
849665Sandreas.hansson@arm.com{
859520SN/A    Tick event_tick = 0;
869520SN/A
879665Sandreas.hansson@arm.com    if (ndRange.wg_disp_rem)
889665Sandreas.hansson@arm.com        fatal("Checkpointing not supported during active workgroup execution");
899520SN/A
909665Sandreas.hansson@arm.com    if (tickEvent.scheduled())
919520SN/A        event_tick = tickEvent.when();
929665Sandreas.hansson@arm.com
939520SN/A    SERIALIZE_SCALAR(event_tick);
949520SN/A
959520SN/A}
969520SN/A
979520SN/Avoid
989520SN/AGpuDispatcher::unserialize(CheckpointIn &cp)
999520SN/A{
1009520SN/A    Tick event_tick;
1019520SN/A
1029665Sandreas.hansson@arm.com    if (tickEvent.scheduled())
1039665Sandreas.hansson@arm.com        deschedule(&tickEvent);
1049665Sandreas.hansson@arm.com
1059520SN/A    UNSERIALIZE_SCALAR(event_tick);
1069520SN/A
1079665Sandreas.hansson@arm.com    if (event_tick)
1089665Sandreas.hansson@arm.com        schedule(&tickEvent, event_tick);
1099665Sandreas.hansson@arm.com}
1109520SN/A
1119665Sandreas.hansson@arm.comAddrRangeList
1129665Sandreas.hansson@arm.comGpuDispatcher::getAddrRanges() const
1139665Sandreas.hansson@arm.com{
1149520SN/A    AddrRangeList ranges;
1159665Sandreas.hansson@arm.com
1169520SN/A    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
1179665Sandreas.hansson@arm.com            pioAddr, pioSize);
1189665Sandreas.hansson@arm.com
1199665Sandreas.hansson@arm.com    ranges.push_back(RangeSize(pioAddr, pioSize));
1209520SN/A
1219665Sandreas.hansson@arm.com    return ranges;
1229665Sandreas.hansson@arm.com}
1239520SN/A
1249665Sandreas.hansson@arm.comTick
1259520SN/AGpuDispatcher::read(PacketPtr pkt)
1269665Sandreas.hansson@arm.com{
1279836Sandreas.hansson@arm.com    assert(pkt->getAddr() >= pioAddr);
1289836Sandreas.hansson@arm.com    assert(pkt->getAddr() < pioAddr + pioSize);
1299836Sandreas.hansson@arm.com
1309836Sandreas.hansson@arm.com    int offset = pkt->getAddr() - pioAddr;
1319836Sandreas.hansson@arm.com    pkt->allocate();
1329836Sandreas.hansson@arm.com
1339836Sandreas.hansson@arm.com    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
1349836Sandreas.hansson@arm.com
1359836Sandreas.hansson@arm.com    if (offset < 8) {
1369836Sandreas.hansson@arm.com        assert(!offset);
1379836Sandreas.hansson@arm.com        assert(pkt->getSize() == 8);
1389836Sandreas.hansson@arm.com
1399836Sandreas.hansson@arm.com        uint64_t retval = dispatchActive;
1409836Sandreas.hansson@arm.com        pkt->set(retval);
1419836Sandreas.hansson@arm.com    } else {
1429836Sandreas.hansson@arm.com        offset -= 8;
1439836Sandreas.hansson@arm.com        assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
1449836Sandreas.hansson@arm.com        char *curTaskPtr = (char*)&curTask;
1459836Sandreas.hansson@arm.com
1469836Sandreas.hansson@arm.com        memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
1479836Sandreas.hansson@arm.com    }
1489836Sandreas.hansson@arm.com
1499836Sandreas.hansson@arm.com    pkt->makeAtomicResponse();
1509836Sandreas.hansson@arm.com
1519836Sandreas.hansson@arm.com    return pioDelay;
1529836Sandreas.hansson@arm.com}
1539836Sandreas.hansson@arm.com
1549836Sandreas.hansson@arm.comTick
1559836Sandreas.hansson@arm.comGpuDispatcher::write(PacketPtr pkt)
1569836Sandreas.hansson@arm.com{
1579836Sandreas.hansson@arm.com    assert(pkt->getAddr() >= pioAddr);
1589836Sandreas.hansson@arm.com    assert(pkt->getAddr() < pioAddr + pioSize);
1599836Sandreas.hansson@arm.com
1609836Sandreas.hansson@arm.com    int offset = pkt->getAddr() - pioAddr;
1619836Sandreas.hansson@arm.com
1629836Sandreas.hansson@arm.com#if TRACING_ON
1639836Sandreas.hansson@arm.com    uint64_t data_val = 0;
1649836Sandreas.hansson@arm.com
1659836Sandreas.hansson@arm.com    switch (pkt->getSize()) {
1669836Sandreas.hansson@arm.com      case 1:
1679836Sandreas.hansson@arm.com        data_val = pkt->get<uint8_t>();
1689836Sandreas.hansson@arm.com        break;
1699836Sandreas.hansson@arm.com      case 2:
1709836Sandreas.hansson@arm.com        data_val = pkt->get<uint16_t>();
1719836Sandreas.hansson@arm.com        break;
1729836Sandreas.hansson@arm.com      case 4:
1739836Sandreas.hansson@arm.com        data_val = pkt->get<uint32_t>();
1749836Sandreas.hansson@arm.com        break;
1759836Sandreas.hansson@arm.com      case 8:
1769836Sandreas.hansson@arm.com        data_val = pkt->get<uint64_t>();
1779836Sandreas.hansson@arm.com        break;
1789836Sandreas.hansson@arm.com      default:
1799836Sandreas.hansson@arm.com        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
1809836Sandreas.hansson@arm.com    }
1819836Sandreas.hansson@arm.com
1829836Sandreas.hansson@arm.com    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
1839836Sandreas.hansson@arm.com            pkt->getSize());
1849836Sandreas.hansson@arm.com#endif
1859836Sandreas.hansson@arm.com    if (!offset) {
1869836Sandreas.hansson@arm.com        static int nextId = 0;
1879836Sandreas.hansson@arm.com
1889836Sandreas.hansson@arm.com        // The depends field of the qstruct, which was previously unused, is
1899836Sandreas.hansson@arm.com        // used to communicate with simulated application.
1909836Sandreas.hansson@arm.com        if (curTask.depends) {
1919836Sandreas.hansson@arm.com            HostState hs;
1929836Sandreas.hansson@arm.com            shader->ReadMem((uint64_t)(curTask.depends), &hs,
1939836Sandreas.hansson@arm.com                            sizeof(HostState), 0);
194
195            // update event start time (in nano-seconds)
196            uint64_t start = curTick() / 1000;
197
198            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
199                             &start, sizeof(uint64_t), 0);
200        }
201
202        // launch kernel
203        ++num_kernelLaunched;
204
205        NDRange *ndr = &(ndRangeMap[nextId]);
206        // copy dispatch info
207        ndr->q = curTask;
208
209        // update the numDispTask polled by the runtime
210        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
211
212        ndr->numWgTotal = 1;
213
214        for (int i = 0; i < 3; ++i) {
215            ndr->wgId[i] = 0;
216            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
217            ndr->numWgTotal *= ndr->numWg[i];
218        }
219
220        ndr->numWgCompleted = 0;
221        ndr->globalWgId = 0;
222        ndr->wg_disp_rem = true;
223        ndr->execDone = false;
224        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
225        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
226        ndr->dispatchId = nextId;
227        ndr->curCid = pkt->req->contextId();
228        DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
229        execIds.push(nextId);
230        ++nextId;
231
232        dispatchActive = true;
233
234        if (!tickEvent.scheduled()) {
235            schedule(&tickEvent, curTick() + shader->ticks(1));
236        }
237    } else {
238        // populate current task struct
239        // first 64 bits are launch reg
240        offset -= 8;
241        assert(offset < sizeof(HsaQueueEntry));
242        char *curTaskPtr = (char*)&curTask;
243        memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
244    }
245
246    pkt->makeAtomicResponse();
247
248    return pioDelay;
249}
250
251
252BaseMasterPort&
253GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
254{
255    if (if_name == "translation_port") {
256        return *tlbPort;
257    }
258
259    return DmaDevice::getMasterPort(if_name, idx);
260}
261
262void
263GpuDispatcher::exec()
264{
265    int fail_count = 0;
266
267    // There are potentially multiple outstanding kernel launches.
268    // It is possible that the workgroups in a different kernel
269    // can fit on the GPU even if another kernel's workgroups cannot
270    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
271
272    while (execIds.size() > fail_count) {
273        int execId = execIds.front();
274
275        while (ndRangeMap[execId].wg_disp_rem) {
276            //update the thread context
277            shader->updateContext(ndRangeMap[execId].curCid);
278
279            // attempt to dispatch_workgroup
280            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
281                // if we failed try the next kernel,
282                // it may have smaller workgroups.
283                // put it on the queue to rety latter
284                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
285                execIds.push(execId);
286                ++fail_count;
287                break;
288            }
289        }
290        // let's try the next kernel_id
291        execIds.pop();
292    }
293
294    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
295
296    if (doneIds.size() && cpu) {
297        shader->hostWakeUp(cpu);
298    }
299
300    while (doneIds.size()) {
301        // wakeup the CPU if any Kernels completed this cycle
302        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
303        doneIds.pop();
304    }
305}
306
307void
308GpuDispatcher::notifyWgCompl(Wavefront *w)
309{
310    int kern_id = w->kernId;
311    DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
312    assert(ndRangeMap[kern_id].dispatchId == kern_id);
313    ndRangeMap[kern_id].numWgCompleted++;
314
315    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
316        ndRangeMap[kern_id].execDone = true;
317        doneIds.push(kern_id);
318
319        if (ndRangeMap[kern_id].addrToNotify) {
320            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
321                          0);
322        }
323
324        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
325
326        // update event end time (in nano-seconds)
327        if (ndRangeMap[kern_id].q.depends) {
328            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
329            uint64_t event;
330            shader->ReadMem((uint64_t)(&host_state->event), &event,
331                            sizeof(uint64_t), 0);
332
333            uint64_t end = curTick() / 1000;
334
335            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
336                             sizeof(uint64_t), 0);
337        }
338    }
339
340    if (!tickEvent.scheduled()) {
341        schedule(&tickEvent, curTick() + shader->ticks(1));
342    }
343}
344
345void
346GpuDispatcher::scheduleDispatch()
347{
348    if (!tickEvent.scheduled())
349        schedule(&tickEvent, curTick() + shader->ticks(1));
350}
351
352void
353GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
354{
355    if (cpu) {
356        if (off) {
357            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
358                              true);
359            val += off;
360        }
361
362        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
363    } else {
364        panic("Cannot find host");
365    }
366}
367
368// helper functions for driver to retrieve GPU attributes
369int
370GpuDispatcher::getNumCUs()
371{
372    return shader->cuList.size();
373}
374
375int
376GpuDispatcher::wfSize() const
377{
378    return shader->cuList[0]->wfSize();
379}
380
381void
382GpuDispatcher::setFuncargsSize(int funcargs_size)
383{
384    shader->funcargs_size = funcargs_size;
385}
386
387uint32_t
388GpuDispatcher::getStaticContextSize() const
389{
390    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
391}
392