/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

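// Construct a wavefront in the S_STOPPED state, zero its request and barrier
// bookkeeping, and size the per-lane structures (work-item IDs, saved VGPR
// copies, per-lane barrier counts) to the configured wavefront width.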
Wavefront::Wavefront(const Params *p)
  : SimObject(p), callArgMem(nullptr), _gpuISA()
{
    lastTrace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}

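// Register this wavefront's statistics: distributions of the number of
// source/destination register operands per executed instruction, and
// counters for cycles blocked on WAW/WAR, RAW, and VRF port availability.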
void
Wavefront::regStats()
{
    SimObject::regStats();

    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
    delete condRegState;
}

void
Wavefront::start(uint64_t _wf_dyn_id, uint64_t _base_ptr)
{
    wfDynId = _wf_dyn_id;
    basePtr = _base_ptr;
    status = S_RUNNING;
}

bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() || ii->isFlat())
        return true;

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem()) {
        return true;
    }

    return false;
}

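// The isOldestInst*() predicates below classify the instruction at the head
// of the instruction buffer (ALU, barrier, global/local/private/flat memory)
// so callers can steer it to the matching execution resource.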
bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() ||
        ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

// Return true if the wavefront's instruction
// buffer contains a branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->isReturn() || ii->isBranch()) {
            return true;
        }
    }

    return false;
}

// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
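// For example: with mode == 1 and size > 4 (a double-precision register),
// index N maps to startVgprIndex + maxSpVgprs + 2 * N; otherwise index N
// maps to startVgprIndex + N. Either way the result is wrapped modulo the
// number of physical VGPRs in this SIMD unit's register file.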
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return true if this wavefront is ready
// to execute an instruction of the specified type.
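// Depending on the type, readiness requires some combination of: a free
// issue slot (wfWait), a free VRF->memory pipe bus, room in the global or
// LDS request FIFOs, no conflicting outstanding memory requests, and VRF
// operands that can be scheduled and are free of RAW/WAW/WAR hazards.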
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId, barrierCnt,
                        computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j = 0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j = 0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually.  In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here for Global memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR dependencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}

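// Reserve execution resources for the instruction at the head of the
// instruction buffer: mark the VRF accesses it will perform, bump the
// in-pipe memory request counters, and pre-set (preset()) the busy time of
// the ALU/memory buses and the per-SIMD issue slot it will occupy.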
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

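// Execute the instruction at the head of the instruction buffer: run it
// through the ISA and VRF, update per-CU statistics, advance the PC or pop
// the reconvergence stack depending on whether the instruction modified the
// PC, and set the busy times of the pipeline resources it occupies.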
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);

    // update the instruction stats in the CU

    ii->execute(ii);
    computeUnit->updateInstStats(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    } else {
        discardFetch();
    }

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

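// A lane is still waiting at a barrier while its per-lane barrier count has
// not yet caught up with maxBarCnt.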
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

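// Each reconvergence stack entry holds a PC, a reconvergence PC (rpc) and an
// execution mask. Entries are pushed for newly diverged control-flow paths
// and popped in exec() once the wave's PC reaches the reconvergence PC.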
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}

void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());
}

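// Throw away the contents of the instruction buffer and arrange for any
// fetch that is still in flight to be dropped when it returns.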
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

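// The wavefront's current PC, reconvergence PC and execution mask all live
// in the top entry of the reconvergence stack.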
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}

void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

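// Size, in bytes, of the fixed portion of the context saved by getContext():
// per-lane barrier counters, the barrier/work-group identifiers, the initial
// execution mask, the private and spill segment bases, and a wfSize-entry
// reconvergence stack image. Register file and LDS contents are not counted.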
uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

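// Serialize the wavefront's context into the caller-provided buffer: barrier
// state and identifiers, the reconvergence stack (padded with empty entries
// up to the wavefront size), the live SP and DP VGPRs, the condition
// registers, and finally the wavefront's LDS chunk, if any. Note that the
// reconvergence stack is drained (popped) as it is saved.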
void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong();
    iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                            read<uint32_t>(vgprIdx, lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                            read<uint64_t>(vgprIdx, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}

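// Restore a context previously saved by getContext(): the fixed-size state
// is read back field by field, non-empty reconvergence stack entries are
// re-pushed, and the VGPRs, condition registers and LDS contents are written
// back into the register files and LDS chunk.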
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /** Restoring LDS contents */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}

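// Clamp the work-group size in each dimension for this dispatch: the last
// work-group along a dimension may be partial, so its actual size is the
// remaining grid size in that dimension. actualWgSzTotal is the product over
// all three dimensions.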
void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < 3; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d],
                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}