shader.cc revision 11905
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

Shader::Shader(const Params *p) : ClockedObject(p),
    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
    box_tick_cnt(0), start_tick_cnt(0)
{
    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}

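// Reserve a page-rounded region of 'length' bytes in the host process's mmap
// region (growing it up or down according to the process's mmap policy),
// allocate backing memory for it, and return the start address of the mapping.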
Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(HSAIL, "Shader::mmap start = %#x, length = %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu) {
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure both the dispatcher and the shader are trying to wake up
        // the same host. This is a hack to enable kernel launch from
        // multiple CPUs.
        panic("Dispatcher wants to wakeup a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

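// Advance the shader by one cycle: apply any scheduled adds whose time has
// arrived (see ScheduleAdd()), then clock every compute unit.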
void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds whose time has been reached
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the CUs
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}

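// Walk the compute units round-robin, starting from nextSchedCu, and start as
// many of the NDRange's remaining workgroups as the CUs can accept. Returns
// true if at least one workgroup was dispatched.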
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // Every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch a workgroup only if the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cuList[curCu] for this workgroup
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}

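// Issue a single functional memory access for 'req'. The virtual address is
// translated through the functional TLB port, and an access that crosses a
// cache-line boundary is split into two packets. All packets are currently
// sent through CU 0's memory port (see the FIXME notes below).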
void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // The access crosses a cache-line boundary and must be split in two
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // FIXME: this should use cuList[cu_id] when cu_id != n_cu; handling
        // the cu_id == n_cu (dispatcher) case requires a memPort in the
        // dispatcher.
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // FIXME: this should use cuList[cu_id] when cu_id != n_cu; handling
        // the cu_id == n_cu (dispatcher) case requires a memPort in the
        // dispatcher.
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

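// Returns true while any compute unit still has work in flight.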
bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}

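// Schedule a deferred counter update: 'when' ticks after the current shader
// tick count, exec() will add 'x' to the value pointed to by 'val'.
//
// Hypothetical usage sketch (the counter below is illustrative only, not an
// existing field in this codebase):
//
//     uint32_t outstanding = 4;
//     // decrement the counter two shader cycles from now
//     shader->ScheduleAdd(&outstanding, shader->ticks(2), -1);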
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}

Shader::TickEvent::TickEvent(Shader *_shader)
    : Event(CPU_Tick_Pri), shader(_shader)
{
}

void
Shader::TickEvent::process()
{
    if (shader->busy()) {
        shader->exec();
        shader->schedule(this, curTick() + shader->ticks(1));
    }
}

const char*
Shader::TickEvent::description() const
{
    return "Shader tick";
}

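// Break a functional access of 'size' bytes at 'address' into cache-line
// sized chunks and issue each chunk through doFunctionalAccess(), advancing
// the data pointer chunk by chunk.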
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}

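// Convenience wrappers around AccessMem() for functional reads and writes.
//
// Hypothetical usage sketch (the address, buffer, and cu_id names are
// illustrative only):
//
//     uint32_t args[4];
//     shader->ReadMem(kernarg_addr, args, sizeof(args), cu_id);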
void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // Even when the perLaneTLB flag is turned on, it is okay to send all
        // accesses through lane 0, since the lane number is not known here.
        // This does not matter because these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}
414