dispatcher.cc revision 12680:91f4d6668b4f
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Brad Beckmann, Marc Orr
 */

#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"

GpuDispatcher *GpuDispatcher::instance = nullptr;

GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver),
      tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
                false, Event::CPU_Tick_Pri)
{
    shader->handshake(this);
    driver->handshake(this);

    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher; no port index is appended to
    // the name since the dispatcher has a single translation port
    tlbPort = new TLBPort(csprintf("%s-port", name()), this);

    num_kernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernels launched")
        ;
}

GpuDispatcher *GpuDispatcherParams::create()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}
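
// Checkpointing support: only the pending tick event is saved and
// restored. Taking a checkpoint while a kernel still has workgroups
// left to dispatch is unsupported and aborts with fatal().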

void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);
}

void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
        schedule(&tickEvent, event_tick);
}

AddrRangeList
GpuDispatcher::getAddrRanges() const
{
    AddrRangeList ranges;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);

    ranges.push_back(RangeSize(pioAddr, pioSize));

    return ranges;
}
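
// Memory-mapped register layout, as implemented by read() and write()
// below: offset 0 holds the 64-bit launch/status register (a write to
// offset 0 launches the currently staged task; a read returns
// dispatchActive), and offsets 8 and up map byte-for-byte onto the
// staged HsaQueueEntry (curTask). A driver-side sketch of the protocol
// (illustrative only; writeMmio/readMmio are hypothetical helpers, not
// part of this class):
//
//     HsaQueueEntry task = /* kernel parameters */;
//     writeMmio(pioAddr + 8, &task, sizeof(task)); // stage curTask
//     uint64_t go = 1;
//     writeMmio(pioAddr, &go, sizeof(go));         // launch the kernel
//     uint64_t busy;
//     do {                                         // poll for completion
//         readMmio(pioAddr, &busy, sizeof(busy));
//     } while (busy);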

Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;
    pkt->allocate();

    DPRINTF(GPUDisp, "read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        assert(!offset);
        assert(pkt->getSize() == 8);

        uint64_t retval = dispatchActive;
        pkt->set(retval);
    } else {
        offset -= 8;
        // the read must fit entirely within the task struct
        assert(offset + pkt->getSize() <= sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<uint8_t>(), curTaskPtr + offset, pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->get<uint8_t>();
        break;
      case 2:
        data_val = pkt->get<uint16_t>();
        break;
      case 4:
        data_val = pkt->get<uint32_t>();
        break;
      case 8:
        data_val = pkt->get<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
            pkt->getSize());
#endif
    if (!offset) {
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused,
        // is used to communicate with the simulated application.
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update the event start time; curTick() is in ticks (1 ps
            // at gem5's default 1 THz resolution), so dividing by 1000
            // yields nanoseconds
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        // launch kernel
        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy dispatch info
        ndr->q = curTask;

        // increment the numDispLeft counter polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;
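
        // The per-dimension workgroup count is a ceiling division, so a
        // partial trailing workgroup is still dispatched; e.g., with
        // gdSize[i] = 100 and wgSize[i] = 32 (illustrative values),
        // divCeil(100, 32) = 4.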
        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curCid = pkt->req->contextId();
        DPRINTF(GPUDisp, "launching kernel %d\n", nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate the current task struct; the first 64 bits are the
        // launch register
        offset -= 8;
        // the write must fit entirely within the task struct
        assert(offset + pkt->getSize() <= sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getConstPtr<uint8_t>(),
               pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

BaseMasterPort&
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getMasterPort(if_name, idx);
}

void
GpuDispatcher::exec()
{
    int fail_count = 0;

    // There are potentially multiple outstanding kernel launches.
    // The workgroups of one kernel may fit on the GPU even when
    // another kernel's workgroups cannot.
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    while (execIds.size() > fail_count) {
        int execId = execIds.front();

        while (ndRangeMap[execId].wg_disp_rem) {
            // update the thread context
            shader->updateContext(ndRangeMap[execId].curCid);

            // attempt to dispatch the next workgroup of this kernel
            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if the dispatch failed, try the next kernel; it may
                // have smaller workgroups. Re-queue this one to retry
                // later.
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
                execIds.push(execId);
                ++fail_count;
                break;
            }
        }
        // move on to the next kernel id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    // wake up the CPU if any kernels completed this cycle
    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }

    while (doneIds.size()) {
        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
        doneIds.pop();
    }
}
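
// Called by a wavefront when its workgroup completes. Once every
// workgroup of the kernel has finished, the dispatcher marks the
// NDRange done, raises the application's notification flag (if one was
// registered), decrements the runtime's numDispLeft counter, and
// records the kernel end timestamp in the associated cl_event.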
void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);

        if (ndRangeMap[kern_id].addrToNotify) {
            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
                          0);
        }

        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update the event end time; curTick() is in ticks (1 ps at
        // gem5's default 1 THz resolution), so dividing by 1000 yields
        // nanoseconds
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;
            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
    }
}

void
GpuDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled())
        schedule(&tickEvent, curTick() + shader->ticks(1));
}
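
// accessUserVar updates a 32-bit variable in simulated user memory. With
// off == 0 it simply stores val; with a non-zero off it performs a
// read-modify-write: the current value is read back into val, off is
// added, and the sum is written out. For example, write() above uses
// accessUserVar(cpu, (uint64_t)curTask.numDispLeft, 0, 1) to increment
// the runtime's dispatch counter, and notifyWgCompl() uses an off of -1
// to decrement it when a kernel completes.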
void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    }
}
367
368// helper functions for driver to retrieve GPU attributes
369int
370GpuDispatcher::getNumCUs()
371{
372    return shader->cuList.size();
373}
374
375int
376GpuDispatcher::wfSize() const
377{
378    return shader->cuList[0]->wfSize();
379}
380
381void
382GpuDispatcher::setFuncargsSize(int funcargs_size)
383{
384    shader->funcargs_size = funcargs_size;
385}
386
387uint32_t
388GpuDispatcher::getStaticContextSize() const
389{
390    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
391}
392