compute_unit.cc revision 11364:1bd9f1b27438
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: John Kalamatianos, Anthony Gutierrez
34 */
35
36#include "gpu-compute/compute_unit.hh"
37
38#include "base/output.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUExec.hh"
41#include "debug/GPUFetch.hh"
42#include "debug/GPUMem.hh"
43#include "debug/GPUPort.hh"
44#include "debug/GPUPrefetch.hh"
45#include "debug/GPUSync.hh"
46#include "debug/GPUTLB.hh"
47#include "gpu-compute/dispatcher.hh"
48#include "gpu-compute/gpu_dyn_inst.hh"
49#include "gpu-compute/gpu_static_inst.hh"
50#include "gpu-compute/ndrange.hh"
51#include "gpu-compute/shader.hh"
52#include "gpu-compute/simple_pool_manager.hh"
53#include "gpu-compute/vector_register_file.hh"
54#include "gpu-compute/wavefront.hh"
55#include "mem/page_table.hh"
56#include "sim/process.hh"
57
58ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
59    scoreboardCheckStage(p), scheduleStage(p), execStage(p),
60    globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
61    cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
62    spBypassPipeLength(p->spbypass_pipe_length),
63    dpBypassPipeLength(p->dpbypass_pipe_length),
64    issuePeriod(p->issue_period),
65    numGlbMemUnits(p->num_global_mem_pipes),
66    numLocMemUnits(p->num_shared_mem_pipes),
67    perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
68    prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
69    xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
70    functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
71    countPages(p->countPages), barrier_id(0),
72    vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
73    coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
74    req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
75    resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
76    _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
77    lds(*p->localDataStore), globalSeqNum(0),  wavefrontSize(p->wfSize)
78{
79    // this check will be eliminated once we have wavefront size support added
80    fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
81    // calculate how many cycles a vector load or store will need to transfer
82    // its data over the corresponding buses
83    numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
84                                / (double)vrfToCoalescerBusWidth);
85
86    numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
87                               / coalescerToVrfBusWidth;
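    // Illustrative example (assumed values, not a configured default): with
    // VSZ = 64 lanes and a 32-byte coalescer-to-VRF bus, a full vector of
    // 32-bit values is 64 * 4 = 256 bytes, so numCyclesPerLoadTransfer is
    // 256 / 32 = 8 cycles. The store path uses ceil() because
    // vrf_to_coalescer_bus_width need not divide the vector size evenly.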
88
89    lastVaddrWF.resize(numSIMDs);
90    wfList.resize(numSIMDs);
91
92    for (int j = 0; j < numSIMDs; ++j) {
93        lastVaddrWF[j].resize(p->n_wf);
94
95        for (int i = 0; i < p->n_wf; ++i) {
96            lastVaddrWF[j][i].resize(VSZ);
97
98            wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
99            wfList[j][i]->setParent(this);
100
101            for (int k = 0; k < VSZ; ++k) {
102                lastVaddrWF[j][i][k] = 0;
103            }
104        }
105    }
106
107    lastVaddrPhase.resize(numSIMDs);
108
109    for (int i = 0; i < numSIMDs; ++i) {
110        lastVaddrPhase[i] = LastVaddrWave();
111    }
112
113    lastVaddrCU = LastVaddrWave();
114
115    lds.setParent(this);
116
117    if (p->execPolicy == "OLDEST-FIRST") {
118        exec_policy = EXEC_POLICY::OLDEST;
119    } else if (p->execPolicy == "ROUND-ROBIN") {
120        exec_policy = EXEC_POLICY::RR;
121    } else {
122        fatal("Invalid WF execution policy (CU)\n");
123    }
124
125    memPort.resize(VSZ);
126
127    // resize the tlbPort vectorArray
128    int tlbPort_width = perLaneTLB ? VSZ : 1;
129    tlbPort.resize(tlbPort_width);
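    // With per-lane TLB ports (perLaneTLB) every lane sends translations on
    // its own port; otherwise all lanes share tlbPort[0] (see the
    // tlbPort_index selection in sendRequest()).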
130
131    cuExitCallback = new CUExitCallback(this);
132    registerExitCallback(cuExitCallback);
133
134    xactCasLoadMap.clear();
135    lastExecCycle.resize(numSIMDs, 0);
136
137    for (int i = 0; i < vrf.size(); ++i) {
138        vrf[i]->setParent(this);
139    }
140
141    numVecRegsPerSimd = vrf[0]->numRegs();
142}
143
144ComputeUnit::~ComputeUnit()
145{
146    // Delete wavefront slots
147
148    for (int j = 0; j < numSIMDs; ++j)
149        for (int i = 0; i < shader->n_wf; ++i) {
150            delete wfList[j][i];
151        }
152
153    readyList.clear();
154    waveStatusList.clear();
155    dispatchList.clear();
156    vectorAluInstAvail.clear();
157    delete cuExitCallback;
158    delete ldsPort;
159}
160
161void
162ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
163{
164    w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
165
166    w->workgroupsz[0] = ndr->q.wgSize[0];
167    w->workgroupsz[1] = ndr->q.wgSize[1];
168    w->workgroupsz[2] = ndr->q.wgSize[2];
169    w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2];
170    w->gridsz[0] = ndr->q.gdSize[0];
171    w->gridsz[1] = ndr->q.gdSize[1];
172    w->gridsz[2] = ndr->q.gdSize[2];
173    w->kernelArgs = ndr->q.args;
174    w->privSizePerItem = ndr->q.privMemPerItem;
175    w->spillSizePerItem = ndr->q.spillMemPerItem;
176    w->roBase = ndr->q.roMemStart;
177    w->roSize = ndr->q.roMemTotal;
178}
179
180void
181ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
182                        int trueWgSize[], int trueWgSizeTotal,
183                        LdsChunk *ldsChunk, uint64_t origSpillMemStart)
184{
185    wfCtx->cnt = cnt;
186
187    VectorMask init_mask;
188    init_mask.reset();
189
190    for (int k = 0; k < VSZ; ++k) {
191        if (k + cnt * VSZ < trueWgSizeTotal)
192            init_mask[k] = 1;
193    }
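    // Worked example (illustrative sizes): a 100 work-item work-group with
    // VSZ = 64 yields two wavefronts; the first (cnt = 0) has all 64 lanes
    // set in init_mask, the second (cnt = 1) only lanes 0-35.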
194
195    wfCtx->init_mask = init_mask.to_ullong();
196    wfCtx->exec_mask = init_mask.to_ullong();
197
198    for (int i = 0; i < VSZ; ++i) {
199        wfCtx->bar_cnt[i] = 0;
200    }
201
202    wfCtx->max_bar_cnt = 0;
203    wfCtx->old_barrier_cnt = 0;
204    wfCtx->barrier_cnt = 0;
205
206    wfCtx->privBase = ndr->q.privMemStart;
207    ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
208
209    wfCtx->spillBase = ndr->q.spillMemStart;
210    ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
211
212    wfCtx->pc = 0;
213    wfCtx->rpc = UINT32_MAX;
214
215    // set the wavefront context to have a pointer to this section of the LDS
216    wfCtx->ldsChunk = ldsChunk;
217
218    // WG state
219    wfCtx->wg_id = ndr->globalWgId;
220    wfCtx->barrier_id = barrier_id;
221
222    // Kernel wide state
223    wfCtx->ndr = ndr;
224}
225
void
ComputeUnit::updateEvents()
{
229    if (!timestampVec.empty()) {
230        uint32_t vecSize = timestampVec.size();
231        uint32_t i = 0;
232        while (i < vecSize) {
233            if (timestampVec[i] <= shader->tick_cnt) {
234                std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
235                vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
236                                            statusVec[i]);
237                timestampVec.erase(timestampVec.begin() + i);
238                regIdxVec.erase(regIdxVec.begin() + i);
239                statusVec.erase(statusVec.begin() + i);
240                --vecSize;
241                --i;
242            }
243            ++i;
244        }
245    }
246
247    for (int i = 0; i< numSIMDs; ++i) {
248        vrf[i]->updateEvents();
249    }
250}
251
252
253void
254ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
255                     int trueWgSizeTotal)
256{
257    static int _n_wave = 0;
258    int cnt = wfCtx->cnt;
259    NDRange *ndr = wfCtx->ndr;
260
261    // Fill in Kernel state
262    FillKernelState(w, ndr);
263
264    w->kern_id = ndr->dispatchId;
265    w->dynwaveid = cnt;
266    w->init_mask = wfCtx->init_mask;
267
268    for (int k = 0; k < VSZ; ++k) {
269        w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
270        w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
271        w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
272
273        w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
274            trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
275            w->workitemid[0][k];
276    }
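    // Example (illustrative 4x4x2 work-group): lane k of wavefront cnt maps
    // to flat work-item k + cnt * VSZ; flat id 21 decodes to
    // workitemid = (21 % 4, (21 / 4) % 4, 21 / 16) = (1, 1, 1), and
    // workitemFlatId recomputes 1 * 16 + 1 * 4 + 1 = 21.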
277
278    w->old_barrier_cnt = wfCtx->old_barrier_cnt;
279    w->barrier_cnt = wfCtx->barrier_cnt;
280    w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
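    // barrier_slots is the number of wavefronts making up this work-group
    // (work-items divided by VSZ, rounded up); cf. AllAtBarrier(), which
    // compares how many WFs have reached a barrier against a bslots count.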
281
282    for (int i = 0; i < VSZ; ++i) {
283        w->bar_cnt[i] = wfCtx->bar_cnt[i];
284    }
285
286    w->max_bar_cnt = wfCtx->max_bar_cnt;
287    w->privBase = wfCtx->privBase;
288    w->spillBase = wfCtx->spillBase;
289
290    w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
291
292    // WG state
293    w->wg_id = wfCtx->wg_id;
294    w->dispatchid = wfCtx->ndr->dispatchId;
295    w->workgroupid[0] = w->wg_id % ndr->numWg[0];
296    w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
297    w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
298
299    w->barrier_id = wfCtx->barrier_id;
300    w->stalledAtBarrier = false;
301
302    // move this from the context into the actual wavefront
303    w->ldsChunk = wfCtx->ldsChunk;
304
305    int32_t refCount M5_VAR_USED =
306                    lds.increaseRefCounter(w->dispatchid, w->wg_id);
307    DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
308                    cu_id, w->wg_id, refCount);
309
310    w->instructionBuffer.clear();
311
312    if (w->pendingFetch)
313        w->dropFetch = true;
314
    // if this is the last wavefront in the workgroup, set spillWidth to
    // the number of remaining work-items so that the vector access is
    // correct
318    if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
319        w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
320    } else {
321        w->spillWidth = VSZ;
322    }
323
324    DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
325            "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
326
327    w->start(++_n_wave, ndr->q.code_ptr);
328}
329
330void
331ComputeUnit::StartWorkgroup(NDRange *ndr)
332{
333    // reserve the LDS capacity allocated to the work group
334    // disambiguated by the dispatch ID and workgroup ID, which should be
335    // globally unique
336    LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
337                                          ndr->q.ldsSize);
338
339    // Send L1 cache acquire
340    // isKernel + isAcquire = Kernel Begin
341    if (shader->impl_kern_boundary_sync) {
342        GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
343                                                                nullptr,
344                                                                nullptr, 0);
345
346        gpuDynInst->useContinuation = false;
347        gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE;
348        gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM;
349        injectGlobalMemFence(gpuDynInst, true);
350    }
351
352    // Get true size of workgroup (after clamping to grid size)
353    int trueWgSize[3];
354    int trueWgSizeTotal = 1;
355
356    for (int d = 0; d < 3; ++d) {
357        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
358                                 ndr->wgId[d] * ndr->q.wgSize[d]);
359
360        trueWgSizeTotal *= trueWgSize[d];
361    }
362
363    uint64_t origSpillMemStart = ndr->q.spillMemStart;
364    // calculate the number of 32-bit vector registers required by wavefront
365    int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
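    // The factor of two accounts for double (64-bit) registers, each
    // spanning two 32-bit VGPRs. Illustrative example: 10 single + 4 double
    // registers demand 10 + 2 * 4 = 18 VGPRs per work-item.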
366    int cnt = 0;
367
368    // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
369    for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
370        Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
        // Check if this wavefront slot is available: it must be stopped
        // and not waiting for a kernel-end release to complete
        // (i.e., not in S_RETURNING)
374        if (w->status == Wavefront::S_STOPPED) {
375            // if we have scheduled all work items then stop
376            // scheduling wavefronts
377            if (cnt * VSZ >= trueWgSizeTotal)
378                break;
379
380            // reserve vector registers for the scheduled wavefront
381            assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
382            uint32_t normSize = 0;
383
384            w->startVgprIndex = vrf[m % numSIMDs]->manager->
385                                    allocateRegion(vregDemand, &normSize);
386
387            w->reservedVectorRegs = normSize;
388            vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
389
390            WFContext wfCtx;
391
392            InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
393                                ldsChunk, origSpillMemStart);
394
395            StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
396            ++cnt;
397        }
398    }
399    ++barrier_id;
400}
401
402int
403ComputeUnit::ReadyWorkgroup(NDRange *ndr)
404{
405    // Get true size of workgroup (after clamping to grid size)
406    int trueWgSize[3];
407    int trueWgSizeTotal = 1;
408
409    for (int d = 0; d < 3; ++d) {
410        trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
411                                 ndr->wgId[d] * ndr->q.wgSize[d]);
412
413        trueWgSizeTotal *= trueWgSize[d];
414        DPRINTF(GPUDisp, "trueWgSize[%d] =  %d\n", d, trueWgSize[d]);
415    }
416
417    DPRINTF(GPUDisp, "trueWgSizeTotal =  %d\n", trueWgSizeTotal);
418
419    // calculate the number of 32-bit vector registers required by each
420    // work item of the work group
421    int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
422    bool vregAvail = true;
423    int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
424    int freeWfSlots = 0;
425    // check if the total number of VGPRs required by all WFs of the WG
426    // fit in the VRFs of all SIMD units
427    assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
428    int numMappedWfs = 0;
429    std::vector<int> numWfsPerSimd;
430    numWfsPerSimd.resize(numSIMDs, 0);
431    // find how many free WF slots we have across all SIMDs
432    for (int j = 0; j < shader->n_wf; ++j) {
433        for (int i = 0; i < numSIMDs; ++i) {
434            if (wfList[i][j]->status == Wavefront::S_STOPPED) {
435                // count the number of free WF slots
436                ++freeWfSlots;
437                if (numMappedWfs < numWfs) {
438                    // count the WFs to be assigned per SIMD
439                    numWfsPerSimd[i]++;
440                }
441                numMappedWfs++;
442            }
443        }
444    }
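    // numWfsPerSimd follows the same SIMD round-robin traversal that
    // StartWorkgroup() uses when it actually places wavefronts, so the
    // per-SIMD VGPR check below should reflect the eventual WF placement.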
445
446    // if there are enough free WF slots then find if there are enough
447    // free VGPRs per SIMD based on the WF->SIMD mapping
448    if (freeWfSlots >= numWfs) {
449        for (int j = 0; j < numSIMDs; ++j) {
450            // find if there are enough free VGPR regions in the SIMD's VRF
451            // to accommodate the WFs of the new WG that would be mapped to
452            // this SIMD unit
453            vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
454                                                     vregDemandPerWI);
455
456            // stop searching if there is at least one SIMD
457            // whose VRF does not have enough free VGPR pools.
458            // This is because a WG is scheduled only if ALL
459            // of its WFs can be scheduled
460            if (!vregAvail)
461                break;
462        }
463    }
464
465    DPRINTF(GPUDisp, "Free WF slots =  %d, VGPR Availability = %d\n",
466            freeWfSlots, vregAvail);
467
468    if (!vregAvail) {
469        ++numTimesWgBlockedDueVgprAlloc;
470    }
471
    // count the number of times a workgroup could not be dispatched
    // because there is not enough space left in the LDS
474    if (!lds.canReserve(ndr->q.ldsSize)) {
475        wgBlockedDueLdsAllocation++;
476    }
477
    // Return true if (a) there are enough free WF slots to submit the
    // workgroup, (b) there are enough VGPRs to schedule all WFs to their
    // SIMD units, and (c) there is enough space in the LDS
481    return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
482}
483
484int
485ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
486{
487    DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
488    int ccnt = 0;
489
490    for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
491        for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
492            Wavefront *w = wfList[i_simd][i_wf];
493
494            if (w->status == Wavefront::S_RUNNING) {
495                DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
496
497                DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
498                        w->barrier_id, _barrier_id);
499
500                DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
501                        w->barrier_cnt, bcnt);
502            }
503
504            if (w->status == Wavefront::S_RUNNING &&
505                w->barrier_id == _barrier_id && w->barrier_cnt == bcnt &&
506                !w->outstanding_reqs) {
507                ++ccnt;
508
509                DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
510                        "%d\n", i_simd, i_wf, ccnt);
511            }
512        }
513    }
514
515    DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
516            cu_id, ccnt, bslots);
517
518    return ccnt == bslots;
519}
520
521//  Check if the current wavefront is blocked on additional resources.
522bool
523ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
524{
525    bool cede = false;
526
527    // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
528    // magic instructions will impact the scheduling of wavefronts
529    if (xact_cas_mode) {
530        /*
531         * When a wavefront calls xact_cas_ld, it adds itself to a per address
532         * queue. All per address queues are managed by the xactCasLoadMap.
533         *
534         * A wavefront is not blocked if: it is not in ANY per address queue or
535         * if it is at the head of a per address queue.
536         */
537        for (auto itMap : xactCasLoadMap) {
538            std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
539
540            if (!curWaveIDQueue.empty()) {
541                for (auto it : curWaveIDQueue) {
542                    waveIdentifier cur_wave = it;
543
544                    if (cur_wave.simdId == simdId &&
545                        cur_wave.wfSlotId == wfSlotId) {
546                        // 2 possibilities
547                        // 1: this WF has a green light
548                        // 2: another WF has a green light
549                        waveIdentifier owner_wave = curWaveIDQueue.front();
550
551                        if (owner_wave.simdId != cur_wave.simdId ||
552                            owner_wave.wfSlotId != cur_wave.wfSlotId) {
553                            // possibility 2
554                            cede = true;
555                            break;
556                        } else {
557                            // possibility 1
558                            break;
559                        }
560                    }
561                }
562            }
563        }
564    }
565
566    return cede;
567}
568
569// Execute one clock worth of work on the ComputeUnit.
570void
571ComputeUnit::exec()
572{
573    updateEvents();
574    // Execute pipeline stages in reverse order to simulate
575    // the pipeline latency
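    // (Running the stages back-to-front within a single cycle means each
    // stage sees the state its downstream neighbor produced in the
    // previous cycle, modeling per-stage latency without extra buffering.)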
576    globalMemoryPipe.exec();
577    localMemoryPipe.exec();
578    execStage.exec();
579    scheduleStage.exec();
580    scoreboardCheckStage.exec();
581    fetchStage.exec();
582
583    totalCycles++;
584}
585
586void
587ComputeUnit::init()
588{
589    // Initialize CU Bus models
590    glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
591    locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
592    nextGlbMemBus = 0;
593    nextLocMemBus = 0;
594    fatal_if(numGlbMemUnits > 1,
595             "No support for multiple Global Memory Pipelines exists!!!");
596    vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
597    for (int j = 0; j < numGlbMemUnits; ++j) {
598        vrfToGlobalMemPipeBus[j] = WaitClass();
599        vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
600    }
601
602    fatal_if(numLocMemUnits > 1,
603             "No support for multiple Local Memory Pipelines exists!!!");
604    vrfToLocalMemPipeBus.resize(numLocMemUnits);
605    for (int j = 0; j < numLocMemUnits; ++j) {
606        vrfToLocalMemPipeBus[j] = WaitClass();
607        vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
608    }
609    vectorRegsReserved.resize(numSIMDs, 0);
610    aluPipe.resize(numSIMDs);
611    wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
612
613    for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
614        wfWait[i] = WaitClass();
615        wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
616    }
617
618    for (int i = 0; i < numSIMDs; ++i) {
619        aluPipe[i] = WaitClass();
620        aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
621    }
622
623    // Setup space for call args
624    for (int j = 0; j < numSIMDs; ++j) {
625        for (int i = 0; i < shader->n_wf; ++i) {
626            wfList[j][i]->initCallArgMem(shader->funcargs_size);
627        }
628    }
629
630    // Initializing pipeline resources
631    readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
632    waveStatusList.resize(numSIMDs);
633
634    for (int j = 0; j < numSIMDs; ++j) {
635        for (int i = 0; i < shader->n_wf; ++i) {
636            waveStatusList[j].push_back(
637                std::make_pair(wfList[j][i], BLOCKED));
638        }
639    }
640
641    for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
642        dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
643    }
644
645    fetchStage.init(this);
646    scoreboardCheckStage.init(this);
647    scheduleStage.init(this);
648    execStage.init(this);
649    globalMemoryPipe.init(this);
650    localMemoryPipe.init(this);
651    // initialize state for statistics calculation
652    vectorAluInstAvail.resize(numSIMDs, false);
653    shrMemInstAvail = 0;
654    glbMemInstAvail = 0;
655}
656
657bool
658ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
659{
660    // Ruby has completed the memory op. Schedule the mem_resp_event at the
661    // appropriate cycle to process the timing memory response
662    // This delay represents the pipeline delay
663    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
664    int index = sender_state->port_index;
665    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
666
667    // Is the packet returned a Kernel End or Barrier
668    if (pkt->req->isKernel() && pkt->req->isRelease()) {
669        Wavefront *w =
670            computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
671
672        // Check if we are waiting on Kernel End Release
673        if (w->status == Wavefront::S_RETURNING) {
674            DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
675                    computeUnit->cu_id, w->simdId, w->wfSlotId,
676                    w->wfDynId, w->kern_id);
677
678            computeUnit->shader->dispatcher->notifyWgCompl(w);
679            w->status = Wavefront::S_STOPPED;
680        } else {
681            w->outstanding_reqs--;
682        }
683
684        DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
685                computeUnit->cu_id, gpuDynInst->simdId,
686                gpuDynInst->wfSlotId, w->barrier_cnt);
687
688        if (gpuDynInst->useContinuation) {
689            assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
690            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
691                                           gpuDynInst);
692        }
693
694        delete pkt->senderState;
695        delete pkt->req;
696        delete pkt;
697        return true;
698    } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
699        if (gpuDynInst->useContinuation) {
700            assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
701            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
702                                           gpuDynInst);
703        }
704
705        delete pkt->senderState;
706        delete pkt->req;
707        delete pkt;
708        return true;
709    }
710
711    ComputeUnit::DataPort::MemRespEvent *mem_resp_event =
712        new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index],
713                                                pkt);
714
715    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
716            computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
717            index, pkt->req->getPaddr());
718
719    computeUnit->schedule(mem_resp_event,
720                          curTick() + computeUnit->resp_tick_latency);
721    return true;
722}
723
724void
725ComputeUnit::DataPort::recvReqRetry()
726{
727    int len = retries.size();
728
729    assert(len > 0);
730
731    for (int i = 0; i < len; ++i) {
732        PacketPtr pkt = retries.front().first;
733        GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
734        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
735                computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
736                pkt->req->getPaddr());
737
738        /** Currently Ruby can return false due to conflicts for the particular
739         *  cache block or address.  Thus other requests should be allowed to
740         *  pass and the data port should expect multiple retries. */
741        if (!sendTimingReq(pkt)) {
742            DPRINTF(GPUMem, "failed again!\n");
743            break;
744        } else {
745            DPRINTF(GPUMem, "successful!\n");
746            retries.pop_front();
747        }
748    }
749}
750
751bool
752ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
753{
754    computeUnit->fetchStage.processFetchReturn(pkt);
755
756    return true;
757}
758
759void
760ComputeUnit::SQCPort::recvReqRetry()
761{
762    int len = retries.size();
763
764    assert(len > 0);
765
766    for (int i = 0; i < len; ++i) {
767        PacketPtr pkt = retries.front().first;
768        Wavefront *wavefront M5_VAR_USED = retries.front().second;
769        DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
770                computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
771                pkt->req->getPaddr());
772        if (!sendTimingReq(pkt)) {
773            DPRINTF(GPUFetch, "failed again!\n");
774            break;
775        } else {
776            DPRINTF(GPUFetch, "successful!\n");
777            retries.pop_front();
778        }
779    }
780}
781
782void
783ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
784{
785    // There must be a way around this check to do the globalMemStart...
786    Addr tmp_vaddr = pkt->req->getVaddr();
787
788    updatePageDivergenceDist(tmp_vaddr);
789
790    pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(),
791                      pkt->req->getFlags(), pkt->req->masterId(),
792                      pkt->req->getPC());
793
794    // figure out the type of the request to set read/write
795    BaseTLB::Mode TLB_mode;
796    assert(pkt->isRead() || pkt->isWrite());
797
798    // Check write before read for atomic operations
799    // since atomic operations should use BaseTLB::Write
800    if (pkt->isWrite()){
801        TLB_mode = BaseTLB::Write;
802    } else if (pkt->isRead()) {
803        TLB_mode = BaseTLB::Read;
804    } else {
805        fatal("pkt is not a read nor a write\n");
806    }
807
808    tlbCycles -= curTick();
809    ++tlbRequests;
810
811    int tlbPort_index = perLaneTLB ? index : 0;
812
813    if (shader->timingSim) {
814        if (debugSegFault) {
815            Process *p = shader->gpuTc->getProcessPtr();
816            Addr vaddr = pkt->req->getVaddr();
817            unsigned size = pkt->getSize();
818
819            if ((vaddr + size - 1) % 64 < vaddr % 64) {
820                panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
821                      cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
822            }
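
            // The modulo test above catches accesses that cross a 64-byte
            // boundary. Illustrative case: a 4-byte access at vaddr 0x3e
            // gives (0x3e + 3) % 64 = 1, which is less than 0x3e % 64 = 62,
            // so the access wraps past the boundary and triggers the panic.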
823
824            Addr paddr;
825
826            if (!p->pTable->translate(vaddr, paddr)) {
827                if (!p->fixupStackFault(vaddr)) {
828                    panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
829                          cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
830                          vaddr);
831                }
832            }
833        }
834
835        // This is the SenderState needed upon return
836        pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
837
838        // This is the senderState needed by the TLB hierarchy to function
839        TheISA::GpuTLB::TranslationState *translation_state =
840          new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
841                                               pkt->senderState);
842
843        pkt->senderState = translation_state;
844
845        if (functionalTLB) {
846            tlbPort[tlbPort_index]->sendFunctional(pkt);
847
848            // update the hitLevel distribution
849            int hit_level = translation_state->hitLevel;
850            assert(hit_level != -1);
851            hitsPerTLBLevel[hit_level]++;
852
853            // New SenderState for the memory access
854            X86ISA::GpuTLB::TranslationState *sender_state =
855                safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
856
857            delete sender_state->tlbEntry;
858            delete sender_state->saved;
859            delete sender_state;
860
861            assert(pkt->req->hasPaddr());
862            assert(pkt->req->hasSize());
863
864            uint8_t *tmpData = pkt->getPtr<uint8_t>();
865
866            // this is necessary because the GPU TLB receives packets instead
            // of requests. When the translation is complete, all relevant
868            // fields in the request will be populated, but not in the packet.
869            // here we create the new packet so we can set the size, addr,
870            // and proper flags.
871            PacketPtr oldPkt = pkt;
872            pkt = new Packet(oldPkt->req, oldPkt->cmd);
873            delete oldPkt;
874            pkt->dataStatic(tmpData);
875
876
877            // New SenderState for the memory access
878            pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
879                                                             index, nullptr);
880
881            gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
882            gpuDynInst->tlbHitLevel[index] = hit_level;
883
884
885            // translation is done. Schedule the mem_req_event at the
886            // appropriate cycle to send the timing memory request to ruby
887            ComputeUnit::DataPort::MemReqEvent *mem_req_event =
888                new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
889
890            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
891                    "scheduled\n", cu_id, gpuDynInst->simdId,
892                    gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
893
894            schedule(mem_req_event, curTick() + req_tick_latency);
895        } else if (tlbPort[tlbPort_index]->isStalled()) {
896            assert(tlbPort[tlbPort_index]->retries.size() > 0);
897
898            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
899                    "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
900                    tmp_vaddr);
901
902            tlbPort[tlbPort_index]->retries.push_back(pkt);
903        } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
            // Stall the data port: no more packets will be issued until
            // Ruby indicates that resources have been freed by a
            // recvReqRetry() callback on this port.
908            tlbPort[tlbPort_index]->stallPort();
909
910            DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
911                    "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
912                    tmp_vaddr);
913
914            tlbPort[tlbPort_index]->retries.push_back(pkt);
915        } else {
916           DPRINTF(GPUTLB,
917                   "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
918                   cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
919        }
920    } else {
921        if (pkt->cmd == MemCmd::MemFenceReq) {
922            gpuDynInst->statusBitVector = VectorMask(0);
923        } else {
924            gpuDynInst->statusBitVector &= (~(1ll << index));
925        }
926
927        // New SenderState for the memory access
928        delete pkt->senderState;
929
930        // Because it's atomic operation, only need TLB translation state
931        pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
932                                                                shader->gpuTc);
933
934        tlbPort[tlbPort_index]->sendFunctional(pkt);
935
        // the addr of the packet is not modified, so we need to create a new
        // packet; otherwise the memory access would use the old virtual
938        // address sent in the translation packet, instead of the physical
939        // address returned by the translation.
940        PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
941        new_pkt->dataStatic(pkt->getPtr<uint8_t>());
942
943        // Translation is done. It is safe to send the packet to memory.
944        memPort[0]->sendFunctional(new_pkt);
945
946        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
947                gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
948                new_pkt->req->getPaddr());
949
950        // safe_cast the senderState
951        TheISA::GpuTLB::TranslationState *sender_state =
952             safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
953
954        delete sender_state->tlbEntry;
955        delete new_pkt;
956        delete pkt->senderState;
957        delete pkt->req;
958        delete pkt;
959    }
960}
961
962void
963ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
964{
965    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
966        new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
967
968
969    // New SenderState for the memory access
970    pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
971                                                              nullptr);
972
973    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
974            cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
975            pkt->req->getPaddr());
976
977    schedule(mem_req_event, curTick() + req_tick_latency);
978}
979
980void
981ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
982                                  Request* req)
983{
984    if (!req) {
985        req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1);
986    }
987    req->setPaddr(0);
988    if (kernelLaunch) {
989        req->setFlags(Request::KERNEL);
990    }
991
992    gpuDynInst->s_type = SEG_GLOBAL;
993
994    // for non-kernel MemFence operations, memorder flags are set depending
995    // on which type of request is currently being sent, so this
996    // should be set by the caller (e.g. if an inst has acq-rel
    // semantics, it will send one acquire req and one release req)
998    gpuDynInst->setRequestFlags(req, kernelLaunch);
999
1000    // a mem fence must correspond to an acquire/release request
1001    assert(req->isAcquire() || req->isRelease());
1002
1003    // create packet
1004    PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
1005
1006    // set packet's sender state
1007    pkt->senderState =
1008        new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
1009
1010    // send the packet
1011    sendSyncRequest(gpuDynInst, 0, pkt);
1012}
1013
1014const char*
1015ComputeUnit::DataPort::MemRespEvent::description() const
1016{
1017    return "ComputeUnit memory response event";
1018}
1019
1020void
1021ComputeUnit::DataPort::MemRespEvent::process()
1022{
1023    DataPort::SenderState *sender_state =
1024        safe_cast<DataPort::SenderState*>(pkt->senderState);
1025
1026    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1027    ComputeUnit *compute_unit = dataPort->computeUnit;
1028
1029    assert(gpuDynInst);
1030
1031    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
1032            compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
1033            pkt->req->getPaddr(), dataPort->index);
1034
1035    Addr paddr = pkt->req->getPaddr();
1036
1037    if (pkt->cmd != MemCmd::MemFenceResp) {
1038        int index = gpuDynInst->memStatusVector[paddr].back();
1039
1040        DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
1041                pkt->req->getPaddr(), index);
1042
1043        gpuDynInst->memStatusVector[paddr].pop_back();
1044        gpuDynInst->pAddr = pkt->req->getPaddr();
1045
1046        if (pkt->isRead() || pkt->isWrite()) {
1047
1048            if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
1049                gpuDynInst->statusBitVector &= (~(1ULL << index));
1050            } else {
1051                assert(gpuDynInst->statusVector[index] > 0);
1052                gpuDynInst->statusVector[index]--;
1053
1054                if (!gpuDynInst->statusVector[index])
1055                    gpuDynInst->statusBitVector &= (~(1ULL << index));
1056            }
1057
1058            DPRINTF(GPUMem, "bitvector is now %#x\n",
1059                    gpuDynInst->statusBitVector);
1060
1061            if (gpuDynInst->statusBitVector == VectorMask(0)) {
1062                auto iter = gpuDynInst->memStatusVector.begin();
1063                auto end = gpuDynInst->memStatusVector.end();
1064
1065                while (iter != end) {
1066                    assert(iter->second.empty());
1067                    ++iter;
1068                }
1069
1070                gpuDynInst->memStatusVector.clear();
1071
1072                if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
1073                    gpuDynInst->statusVector.clear();
1074
1075                if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op)
1076                    || MO_ANR(gpuDynInst->m_op)) {
1077                    assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
1078
1079                    compute_unit->globalMemoryPipe.getGMLdRespFIFO()
1080                        .push(gpuDynInst);
1081                } else {
1082                    assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
1083
1084                    compute_unit->globalMemoryPipe.getGMStRespFIFO()
1085                        .push(gpuDynInst);
1086                }
1087
1088                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
1089                        compute_unit->cu_id, gpuDynInst->simdId,
1090                        gpuDynInst->wfSlotId);
1091
1092                // after clearing the status vectors,
1093                // see if there is a continuation to perform
1094                // the continuation may generate more work for
1095                // this memory request
1096                if (gpuDynInst->useContinuation) {
1097                    assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
1098                    gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1099                                                 gpuDynInst);
1100                }
1101            }
1102        }
1103    } else {
1104        gpuDynInst->statusBitVector = VectorMask(0);
1105
1106        if (gpuDynInst->useContinuation) {
1107            assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
1108            gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
1109                                         gpuDynInst);
1110        }
1111    }
1112
1113    delete pkt->senderState;
1114    delete pkt->req;
1115    delete pkt;
1116}
1117
1118ComputeUnit*
1119ComputeUnitParams::create()
1120{
1121    return new ComputeUnit(this);
1122}
1123
1124bool
1125ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
1126{
1127    Addr line = pkt->req->getPaddr();
1128
1129    DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
1130            pkt->req->getVaddr(), line);
1131
1132    assert(pkt->senderState);
1133    computeUnit->tlbCycles += curTick();
1134
1135    // pop off the TLB translation state
1136    TheISA::GpuTLB::TranslationState *translation_state =
1137               safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1138
1139    // no PageFaults are permitted for data accesses
1140    if (!translation_state->tlbEntry->valid) {
1141        DTLBPort::SenderState *sender_state =
1142            safe_cast<DTLBPort::SenderState*>(translation_state->saved);
1143
1144        Wavefront *w M5_VAR_USED =
1145            computeUnit->wfList[sender_state->_gpuDynInst->simdId]
1146            [sender_state->_gpuDynInst->wfSlotId];
1147
        DPRINTFN("Wave %d couldn't translate vaddr %#x\n", w->wfDynId,
1149                 pkt->req->getVaddr());
1150    }
1151
1152    assert(translation_state->tlbEntry->valid);
1153
1154    // update the hitLevel distribution
1155    int hit_level = translation_state->hitLevel;
1156    computeUnit->hitsPerTLBLevel[hit_level]++;
1157
1158    delete translation_state->tlbEntry;
1159    assert(!translation_state->ports.size());
1160    pkt->senderState = translation_state->saved;
1161
1162    // for prefetch pkt
1163    BaseTLB::Mode TLB_mode = translation_state->tlbMode;
1164
1165    delete translation_state;
1166
1167    // use the original sender state to know how to close this transaction
1168    DTLBPort::SenderState *sender_state =
1169        safe_cast<DTLBPort::SenderState*>(pkt->senderState);
1170
1171    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1172    int mp_index = sender_state->portIndex;
1173    Addr vaddr = pkt->req->getVaddr();
1174    gpuDynInst->memStatusVector[line].push_back(mp_index);
1175    gpuDynInst->tlbHitLevel[mp_index] = hit_level;
1176
1177    MemCmd requestCmd;
1178
1179    if (pkt->cmd == MemCmd::ReadResp) {
1180        requestCmd = MemCmd::ReadReq;
1181    } else if (pkt->cmd == MemCmd::WriteResp) {
1182        requestCmd = MemCmd::WriteReq;
1183    } else if (pkt->cmd == MemCmd::SwapResp) {
1184        requestCmd = MemCmd::SwapReq;
1185    } else {
1186        panic("unsupported response to request conversion %s\n",
1187              pkt->cmd.toString());
1188    }
1189
1190    if (computeUnit->prefetchDepth) {
1191        int simdId = gpuDynInst->simdId;
1192        int wfSlotId = gpuDynInst->wfSlotId;
1193        Addr last = 0;
1194
1195        switch(computeUnit->prefetchType) {
1196          case Enums::PF_CU:
1197            last = computeUnit->lastVaddrCU[mp_index];
1198            break;
1199          case Enums::PF_PHASE:
1200            last = computeUnit->lastVaddrPhase[simdId][mp_index];
1201            break;
1202          case Enums::PF_WF:
            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
            break;
1204          default:
1205            break;
1206        }
1207
1208        DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
1209                computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
1210
1211        int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
1212                     roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
1213                     : 0;
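        // The stride is measured in whole pages between consecutive
        // accesses. Illustrative example with 4 KiB pages: last = 0x10000
        // and vaddr = 0x13000 give a stride of (0x13000 - 0x10000) >> 12 = 3.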
1214
1215        DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
1216
1217        computeUnit->lastVaddrCU[mp_index] = vaddr;
1218        computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
1219        computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
1220
1221        stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
1222            computeUnit->prefetchStride: stride;
1223
1224        DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
1225                computeUnit->cu_id, simdId, wfSlotId, mp_index);
1226
1227        DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
1228
1229        // Prefetch Next few pages atomically
1230        for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
1231            DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
1232                    vaddr+stride*pf*TheISA::PageBytes);
1233
1234            if (!stride)
1235                break;
1236
1237            Request *prefetch_req = new Request(0, vaddr + stride * pf *
1238                                                TheISA::PageBytes,
1239                                                sizeof(uint8_t), 0,
1240                                                computeUnit->masterId(),
1241                                                0, 0, 0);
1242
1243            PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
1244            uint8_t foo = 0;
1245            prefetch_pkt->dataStatic(&foo);
1246
1247            // Because it's atomic operation, only need TLB translation state
1248            prefetch_pkt->senderState =
1249                new TheISA::GpuTLB::TranslationState(TLB_mode,
1250                                                     computeUnit->shader->gpuTc,
1251                                                     true);
1252
1253            // Currently prefetches are zero-latency, hence the sendFunctional
1254            sendFunctional(prefetch_pkt);
1255
1256            /* safe_cast the senderState */
1257            TheISA::GpuTLB::TranslationState *tlb_state =
1258                 safe_cast<TheISA::GpuTLB::TranslationState*>(
1259                         prefetch_pkt->senderState);
1260
1261
1262            delete tlb_state->tlbEntry;
1263            delete tlb_state;
1264            delete prefetch_pkt->req;
1265            delete prefetch_pkt;
1266        }
1267    }
1268
1269    // First we must convert the response cmd back to a request cmd so that
1270    // the request can be sent through the cu's master port
1271    PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
1272    new_pkt->dataStatic(pkt->getPtr<uint8_t>());
1273    delete pkt->senderState;
1274    delete pkt;
1275
1276    // New SenderState for the memory access
1277    new_pkt->senderState =
1278            new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
1279                                                   nullptr);
1280
1281    // translation is done. Schedule the mem_req_event at the appropriate
1282    // cycle to send the timing memory request to ruby
1283    ComputeUnit::DataPort::MemReqEvent *mem_req_event =
1284        new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
1285                                               new_pkt);
1286
1287    DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
1288            computeUnit->cu_id, gpuDynInst->simdId,
1289            gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
1290
1291    computeUnit->schedule(mem_req_event, curTick() +
1292                          computeUnit->req_tick_latency);
1293
1294    return true;
1295}
1296
1297const char*
1298ComputeUnit::DataPort::MemReqEvent::description() const
1299{
1300    return "ComputeUnit memory request event";
1301}
1302
1303void
1304ComputeUnit::DataPort::MemReqEvent::process()
1305{
1306    SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
1307    GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
1308    ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
1309
1310    if (!(dataPort->sendTimingReq(pkt))) {
1311        dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
1312
1313        DPRINTF(GPUPort,
1314                "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
1315                compute_unit->cu_id, gpuDynInst->simdId,
1316                gpuDynInst->wfSlotId, dataPort->index,
1317                pkt->req->getPaddr());
1318    } else {
1319        DPRINTF(GPUPort,
1320                "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
1321                compute_unit->cu_id, gpuDynInst->simdId,
1322                gpuDynInst->wfSlotId, dataPort->index,
1323                pkt->req->getPaddr());
1324    }
1325}
1326
1327/*
1328 * The initial translation request could have been rejected,
 * if <retries> queue is not empty. Retry sending the translation
1330 * request. sendRetry() is called from the peer port whenever
1331 * a translation completes.
1332 */
1333void
1334ComputeUnit::DTLBPort::recvReqRetry()
1335{
1336    int len = retries.size();
1337
1338    DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
1339            computeUnit->cu_id, len);
1340
1341    assert(len > 0);
1342    assert(isStalled());
1343    // recvReqRetry is an indication that the resource on which this
1344    // port was stalling on is freed. So, remove the stall first
1345    unstallPort();
1346
1347    for (int i = 0; i < len; ++i) {
1348        PacketPtr pkt = retries.front();
1349        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying D-translation for address %#x",
                computeUnit->cu_id, vaddr);
1351
1352        if (!sendTimingReq(pkt)) {
1353            // Stall port
1354            stallPort();
1355            DPRINTF(GPUTLB, ": failed again\n");
1356            break;
1357        } else {
1358            DPRINTF(GPUTLB, ": successful\n");
1359            retries.pop_front();
1360        }
1361    }
1362}
1363
1364bool
1365ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
1366{
1367    Addr line M5_VAR_USED = pkt->req->getPaddr();
1368    DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
1369            computeUnit->cu_id, pkt->req->getVaddr(), line);
1370
1371    assert(pkt->senderState);
1372
1373    // pop off the TLB translation state
1374    TheISA::GpuTLB::TranslationState *translation_state =
1375                 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
1376
1377    bool success = translation_state->tlbEntry->valid;
1378    delete translation_state->tlbEntry;
1379    assert(!translation_state->ports.size());
1380    pkt->senderState = translation_state->saved;
1381    delete translation_state;
1382
1383    // use the original sender state to know how to close this transaction
1384    ITLBPort::SenderState *sender_state =
1385        safe_cast<ITLBPort::SenderState*>(pkt->senderState);
1386
1387    // get the wavefront associated with this translation request
1388    Wavefront *wavefront = sender_state->wavefront;
1389    delete pkt->senderState;
1390
1391    if (success) {
1392        // pkt is reused in fetch(), don't delete it here.  However, we must
1393        // reset the command to be a request so that it can be sent through
1394        // the cu's master port
1395        assert(pkt->cmd == MemCmd::ReadResp);
1396        pkt->cmd = MemCmd::ReadReq;
1397
1398        computeUnit->fetchStage.fetch(pkt, wavefront);
1399    } else {
1400        if (wavefront->dropFetch) {
1401            assert(wavefront->instructionBuffer.empty());
1402            wavefront->dropFetch = false;
1403        }
1404
1405        wavefront->pendingFetch = 0;
1406    }
1407
1408    return true;
1409}
1410
1411/*
1412 * The initial translation request could have been rejected, if
1413 * <retries> queue is not empty. Retry sending the translation
1414 * request. sendRetry() is called from the peer port whenever
1415 * a translation completes.
1416 */
1417void
1418ComputeUnit::ITLBPort::recvReqRetry()
1419{
1420
1421    int len = retries.size();
    DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n",
            computeUnit->cu_id, len);
1423
1424    assert(len > 0);
1425    assert(isStalled());
1426
1427    // recvReqRetry is an indication that the resource on which this
1428    // port was stalling on is freed. So, remove the stall first
1429    unstallPort();
1430
1431    for (int i = 0; i < len; ++i) {
1432        PacketPtr pkt = retries.front();
1433        Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
        DPRINTF(GPUTLB, "CU%d: retrying I-translation for address %#x",
                computeUnit->cu_id, vaddr);
1435
1436        if (!sendTimingReq(pkt)) {
1437            stallPort(); // Stall port
1438            DPRINTF(GPUTLB, ": failed again\n");
1439            break;
1440        } else {
1441            DPRINTF(GPUTLB, ": successful\n");
1442            retries.pop_front();
1443        }
1444    }
1445}
1446
1447void
1448ComputeUnit::regStats()
1449{
1450    tlbCycles
1451        .name(name() + ".tlb_cycles")
1452        .desc("total number of cycles for all uncoalesced requests")
1453        ;
1454
1455    tlbRequests
1456        .name(name() + ".tlb_requests")
1457        .desc("number of uncoalesced requests")
1458        ;
1459
1460    tlbLatency
1461        .name(name() + ".avg_translation_latency")
1462        .desc("Avg. translation latency for data translations")
1463        ;
1464
1465    tlbLatency = tlbCycles / tlbRequests;
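    // tlbCycles is accumulated as (completion tick - issue tick):
    // sendRequest() subtracts curTick() when a translation is issued and
    // DTLBPort::recvTimingResp() adds curTick() when it returns, so this
    // ratio gives the average latency per uncoalesced translation request.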
1466
1467    hitsPerTLBLevel
1468       .init(4)
1469       .name(name() + ".TLB_hits_distribution")
       .desc("TLB hits distribution (0 for page table, x for Lx-TLB)")
1471       ;
1472
1473    // fixed number of TLB levels
1474    for (int i = 0; i < 4; ++i) {
1475        if (!i)
1476            hitsPerTLBLevel.subname(i,"page_table");
1477        else
1478            hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
1479    }
1480
1481    execRateDist
1482        .init(0, 10, 2)
1483        .name(name() + ".inst_exec_rate")
1484        .desc("Instruction Execution Rate: Number of executed vector "
1485              "instructions per cycle")
1486        ;
1487
1488    ldsBankConflictDist
1489       .init(0, VSZ, 2)
1490       .name(name() + ".lds_bank_conflicts")
1491       .desc("Number of bank conflicts per LDS memory packet")
1492       ;
1493
1494    ldsBankAccesses
1495        .name(name() + ".lds_bank_access_cnt")
1496        .desc("Total number of LDS bank accesses")
1497        ;
1498
1499    pageDivergenceDist
1500       // A wavefront can touch 1 to VSZ pages per memory instruction.
1501       // The number of pages per bin can be configured (here it's 4).
1502       .init(1, VSZ, 4)
1503       .name(name() + ".page_divergence_dist")
1504       .desc("pages touched per wf (over all mem. instr.)")
1505       ;
1506
1507    controlFlowDivergenceDist
1508        .init(1, VSZ, 4)
1509        .name(name() + ".warp_execution_dist")
1510        .desc("number of lanes active per instruction (oval all instructions)")
1511        ;
1512
1513    activeLanesPerGMemInstrDist
1514        .init(1, VSZ, 4)
1515        .name(name() + ".gmem_lanes_execution_dist")
1516        .desc("number of active lanes per global memory instruction")
1517        ;
1518
1519    activeLanesPerLMemInstrDist
1520        .init(1, VSZ, 4)
1521        .name(name() + ".lmem_lanes_execution_dist")
1522        .desc("number of active lanes per local memory instruction")
1523        ;
1524
1525    numInstrExecuted
1526        .name(name() + ".num_instr_executed")
1527        .desc("number of instructions executed")
1528        ;
1529
1530    numVecOpsExecuted
1531        .name(name() + ".num_vec_ops_executed")
1532        .desc("number of vec ops executed (e.g. VSZ/inst)")
1533        ;
1534
1535    totalCycles
1536        .name(name() + ".num_total_cycles")
1537        .desc("number of cycles the CU ran for")
1538        ;
1539
1540    ipc
1541        .name(name() + ".ipc")
1542        .desc("Instructions per cycle (this CU only)")
1543        ;
1544
1545    vpc
1546        .name(name() + ".vpc")
1547        .desc("Vector Operations per cycle (this CU only)")
1548        ;
1549
1550    numALUInstsExecuted
1551        .name(name() + ".num_alu_insts_executed")
1552        .desc("Number of dynamic non-GM memory insts executed")
1553        ;
1554
1555    wgBlockedDueLdsAllocation
1556        .name(name() + ".wg_blocked_due_lds_alloc")
1557        .desc("Workgroup blocked due to LDS capacity")
1558        ;
1559
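    // ipc and vpc are derived stats: instructions and vector ops executed
    // per CU cycle, respectively.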
1560    ipc = numInstrExecuted / totalCycles;
1561    vpc = numVecOpsExecuted / totalCycles;
1562
1563    numTimesWgBlockedDueVgprAlloc
1564        .name(name() + ".times_wg_blocked_due_vgpr_alloc")
1565        .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
1566        ;
1567
1568    dynamicGMemInstrCnt
1569        .name(name() + ".global_mem_instr_cnt")
1570        .desc("dynamic global memory instructions count")
1571        ;
1572
1573    dynamicLMemInstrCnt
1574        .name(name() + ".local_mem_instr_cnt")
1575        .desc("dynamic local memory intruction count")
1576        ;
1577
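    // Every executed instruction that is neither a global nor a local memory
    // instruction is counted as an ALU instruction.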
1578    numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
1579        dynamicLMemInstrCnt;
1580
1581    completedWfs
1582        .name(name() + ".num_completed_wfs")
1583        .desc("number of completed wavefronts")
1584        ;
1585
1586    numCASOps
1587        .name(name() + ".num_CAS_ops")
1588        .desc("number of compare and swap operations")
1589        ;
1590
1591    numFailedCASOps
1592        .name(name() + ".num_failed_CAS_ops")
1593        .desc("number of compare and swap operations that failed")
1594        ;
1595
1596    // register stats of pipeline stages
1597    fetchStage.regStats();
1598    scoreboardCheckStage.regStats();
1599    scheduleStage.regStats();
1600    execStage.regStats();
1601
1602    // register stats of memory pipeline
1603    globalMemoryPipe.regStats();
1604    localMemoryPipe.regStats();
1605}
1606
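// Count an access to the virtual page containing addr for the current memory
// instruction; the per-page totals kept in pagesTouched are what the
// pageDivergenceDist histogram is built from.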
1607void
1608ComputeUnit::updatePageDivergenceDist(Addr addr)
1609{
1610    Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
1611
1612    if (!pagesTouched.count(virt_page_addr))
1613        pagesTouched[virt_page_addr] = 1;
1614    else
1615        pagesTouched[virt_page_addr]++;
1616}
1617
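// At simulation exit, optionally (when countPages is set) dump per-page
// access counts to a file named after this CU: the page address in hex,
// followed by the wavefront-level and work-item-level access counts.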
1618void
1619ComputeUnit::CUExitCallback::process()
1620{
1621    if (computeUnit->countPages) {
1622        std::ostream *page_stat_file =
1623            simout.create(computeUnit->name().c_str())->stream();
1624
1625        *page_stat_file << "page, wavefront accesses, workitem accesses" <<
1626            std::endl;
1627
1628        for (auto iter : computeUnit->pageAccesses) {
1629            *page_stat_file << std::hex << iter.first << ",";
1630            *page_stat_file << std::dec << iter.second.first << ",";
1631            *page_stat_file << std::dec << iter.second.second << std::endl;
1632        }
1633    }
1634}
1635
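// The CU is done only when every SIMD unit is done, the memory-to-VRF and
// VRF-to-memory buses are ready, and the global/local memory pipeline FIFOs
// can all accept new entries.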
1636bool
1637ComputeUnit::isDone() const
1638{
1639    for (int i = 0; i < numSIMDs; ++i) {
1640        if (!isSimdDone(i)) {
1641            return false;
1642        }
1643    }
1644
1645    bool glbMemBusRdy = true;
1646    for (int j = 0; j < numGlbMemUnits; ++j) {
1647        glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
1648    }
1649    bool locMemBusRdy = true;
1650    for (int j = 0; j < numLocMemUnits; ++j) {
1651        locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
1652    }
1653
1654    if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
1655        !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
1656        !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy()
1657        || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
1658        !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
1659        return false;
1660    }
1661
1662    return true;
1663}
1664
1665int32_t
1666ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
1667{
1668    return lds.getRefCounter(dispatchId, wgId);
1669}
1670
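// A SIMD unit is done when its ALU pipe and the VRF-to-memory buses are
// ready and every wavefront slot on that SIMD is in the S_STOPPED state.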
1671bool
1672ComputeUnit::isSimdDone(uint32_t simdId) const
1673{
1674    assert(simdId < numSIMDs);
1675
1676    for (int i = 0; i < numGlbMemUnits; ++i) {
1677        if (!vrfToGlobalMemPipeBus[i].rdy())
1678            return false;
1679    }
1680    for (int i = 0; i < numLocMemUnits; ++i) {
1681        if (!vrfToLocalMemPipeBus[i].rdy())
1682            return false;
1683    }
1684    if (!aluPipe[simdId].rdy()) {
1685        return false;
1686    }
1687
1688    for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
1689        if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
1690            return false;
1691        }
1692    }
1693
1694    return true;
1695}
1696
1697/**
1698 * Send a general request to the LDS.
1699 * Make sure to check the return value: the request might be NACK'd, and a
1700 * false return means the caller needs a backup plan.
1701 */
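//
// A minimal, hypothetical caller sketch (ldsRetryQueue is a placeholder,
// only sendToLds() itself is real):
//
//     if (!computeUnit->sendToLds(gpuDynInst)) {
//         // NACK'd: hold on to gpuDynInst and retry on a later cycle
//         ldsRetryQueue.push(gpuDynInst);
//     }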
1702bool
1703ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
1704{
1705    // this is just a request to carry the GPUDynInstPtr
1706    // back and forth
1707    Request *newRequest = new Request();
1708    newRequest->setPaddr(0x0);
1709
1710    // ReadReq is not evaluated by the LDS, but the Packet ctor requires this
1711    PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
1712
1713    // This is the SenderState needed upon return
1714    newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
1715
1716    return ldsPort->sendTimingReq(newPacket);
1717}
1718
1719/**
1720 * get the result of packets sent to the LDS when they return
1721 */
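//
// The packet and its request are consumed (deleted) here; only the
// GPUDynInstPtr recovered from the SenderState survives, pushed into the
// local memory pipeline's response FIFO for further processing.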
1722bool
1723ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
1724{
1725    const ComputeUnit::LDSPort::SenderState *senderState =
1726        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
1727
1728    fatal_if(!senderState, "did not get the right sort of sender state");
1729
1730    GPUDynInstPtr gpuDynInst = senderState->getMemInst();
1731
1732    delete packet->senderState;
1733    delete packet->req;
1734    delete packet;
1735
1736    computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
1737    return true;
1738}
1739
1740/**
1741 * Attempt to send this packet: either the port is already stalled, the
1742 * request is NACK'd and must stall, or the request goes through.
1743 * When a request cannot be sent, add it to the retries queue.
1744 */
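//
// Every packet sent this way must carry an LDSPort::SenderState holding the
// originating GPUDynInstPtr; recvTimingResp() above relies on that state to
// route the result back into the local memory pipeline.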
1745bool
1746ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
1747{
1748    ComputeUnit::LDSPort::SenderState *sender_state =
1749            dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
1750    fatal_if(!sender_state, "packet without a valid sender state");
1751
1752    GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
1753
1754    if (isStalled()) {
1755        fatal_if(retries.empty(), "a stalled port must have pending retries");
1756
1757        retries.push(pkt);
1758
1759        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
1760                        computeUnit->cu_id, gpuDynInst->simdId,
1761                        gpuDynInst->wfSlotId);
1762        return false;
1763    } else if (!MasterPort::sendTimingReq(pkt)) {
1764        // need to stall the LDS port until a recvReqRetry() is received
1765        // this indicates that there is more space
1766        stallPort();
1767        retries.push(pkt);
1768
1769        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
1770                computeUnit->cu_id, gpuDynInst->simdId,
1771                gpuDynInst->wfSlotId, pkt->req->getPaddr());
1772        return false;
1773    } else {
1774        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
1775                computeUnit->cu_id, gpuDynInst->simdId,
1776                gpuDynInst->wfSlotId, pkt->req->getPaddr());
1777        return true;
1778    }
1779}
1780
1781/**
1782 * The bus is telling the port that there is now space, so retrying stalled
1783 * requests should succeed now. This allows the port to have a request
1784 * NACK'd and then have the receiver announce when there is space, rather
1785 * than simply retrying the send every cycle.
1786 */
1787void
1788ComputeUnit::LDSPort::recvReqRetry()
1789{
1790    auto queueSize = retries.size();
1791
1792    DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
1793            computeUnit->cu_id, queueSize);
1794
1795    fatal_if(queueSize < 1,
1796             "why was there a recvReqRetry() with no pending reqs?");
1797    fatal_if(!isStalled(),
1798             "recvReqRetry() happened when the port was not stalled");
1799
1800    unstallPort();
1801
1802    while (!retries.empty()) {
1803        PacketPtr packet = retries.front();
1804
1805        DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
1806
1807        if (!MasterPort::sendTimingReq(packet)) {
1808            // Stall port
1809            stallPort();
1810            DPRINTF(GPUPort, ": LDS send failed again\n");
1811            break;
1812        } else {
1813            DPRINTF(GPUPort, ": LDS send successful\n");
1814            retries.pop();
1815        }
1816    }
1817}
1818