wavefront.cc revision 11657:5fad5a37d6fc
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#include "gpu-compute/wavefront.hh"
37
38#include "debug/GPUExec.hh"
39#include "debug/WavefrontStack.hh"
40#include "gpu-compute/code_enums.hh"
41#include "gpu-compute/compute_unit.hh"
42#include "gpu-compute/gpu_dyn_inst.hh"
43#include "gpu-compute/shader.hh"
44#include "gpu-compute/vector_register_file.hh"
45
46Wavefront*
47WavefrontParams::create()
48{
49    return new Wavefront(this);
50}
51
52Wavefront::Wavefront(const Params *p)
53  : SimObject(p), callArgMem(nullptr)
54{
55    lastTrace = 0;
56    simdId = p->simdId;
57    wfSlotId = p->wf_slot_id;
58    status = S_STOPPED;
59    reservedVectorRegs = 0;
60    startVgprIndex = 0;
61    outstandingReqs = 0;
62    memReqsInPipe = 0;
63    outstandingReqsWrGm = 0;
64    outstandingReqsWrLm = 0;
65    outstandingReqsRdGm = 0;
66    outstandingReqsRdLm = 0;
67    rdLmReqsInPipe = 0;
68    rdGmReqsInPipe = 0;
69    wrLmReqsInPipe = 0;
70    wrGmReqsInPipe = 0;
71
72    barrierCnt = 0;
73    oldBarrierCnt = 0;
74    stalledAtBarrier = false;
75
76    memTraceBusy = 0;
77    oldVgprTcnt = 0xffffffffffffffffll;
78    oldDgprTcnt = 0xffffffffffffffffll;
79    oldVgpr.resize(p->wfSize);
80
81    pendingFetch = false;
82    dropFetch = false;
83    condRegState = new ConditionRegisterState();
84    maxSpVgprs = 0;
85    maxDpVgprs = 0;
86    lastAddr.resize(p->wfSize);
87    workItemFlatId.resize(p->wfSize);
88    oldDgpr.resize(p->wfSize);
89    barCnt.resize(p->wfSize);
90    for (int i = 0; i < 3; ++i) {
91        workItemId[i].resize(p->wfSize);
92    }
93}
94
95void
96Wavefront::regStats()
97{
98    SimObject::regStats();
99
100    srcRegOpDist
101        .init(0, 4, 2)
102        .name(name() + ".src_reg_operand_dist")
103        .desc("number of executed instructions with N source register operands")
104        ;
105
106    dstRegOpDist
107        .init(0, 3, 2)
108        .name(name() + ".dst_reg_operand_dist")
109        .desc("number of executed instructions with N destination register "
110              "operands")
111        ;
112
113    // FIXME: the name of the WF needs to be unique
114    numTimesBlockedDueWAXDependencies
115        .name(name() + ".timesBlockedDueWAXDependencies")
116        .desc("number of times the wf's instructions are blocked due to WAW "
117              "or WAR dependencies")
118        ;
119
120    // FIXME: the name of the WF needs to be unique
121    numTimesBlockedDueRAWDependencies
122        .name(name() + ".timesBlockedDueRAWDependencies")
123        .desc("number of times the wf's instructions are blocked due to RAW "
124              "dependencies")
125        ;
126
127    // FIXME: the name of the WF needs to be unique
128    numTimesBlockedDueVrfPortAvail
129        .name(name() + ".timesBlockedDueVrfPortAvail")
130        .desc("number of times instructions are blocked due to VRF port "
131              "availability")
132        ;
133}
134
135void
136Wavefront::init()
137{
138    reservedVectorRegs = 0;
139    startVgprIndex = 0;
140}
141
142void
143Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
144{
145    condRegState->init(num_cregs);
146    maxSpVgprs = num_sregs;
147    maxDpVgprs = num_dregs;
148}
149
150Wavefront::~Wavefront()
151{
152    if (callArgMem)
153        delete callArgMem;
154    delete condRegState;
155}
156
157void
158Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
159{
160    wfDynId = _wf_dyn_id;
161    basePtr = _base_ptr;
162    status = S_RUNNING;
163}
164
165bool
166Wavefront::isGmInstruction(GPUDynInstPtr ii)
167{
168    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
169        IS_OT_ATOMIC_PM(ii->opType())) {
170        return true;
171    }
172
173    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
174        IS_OT_ATOMIC_GM(ii->opType())) {
175        return true;
176    }
177
178    if (IS_OT_FLAT(ii->opType())) {
179        return true;
180    }
181
182    return false;
183}
184
185bool
186Wavefront::isLmInstruction(GPUDynInstPtr ii)
187{
188    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
189        IS_OT_ATOMIC_LM(ii->opType())) {
190        return true;
191    }
192
193    return false;
194}
195
196bool
197Wavefront::isOldestInstALU()
198{
199    assert(!instructionBuffer.empty());
200    GPUDynInstPtr ii = instructionBuffer.front();
201
202    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
203        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
204        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
205        ii->opType() == Enums::OT_KERN_READ)) {
206        return true;
207    }
208
209    return false;
210}
211
212bool
213Wavefront::isOldestInstBarrier()
214{
215    assert(!instructionBuffer.empty());
216    GPUDynInstPtr ii = instructionBuffer.front();
217
218    if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
219        return true;
220    }
221
222    return false;
223}
224
225bool
226Wavefront::isOldestInstGMem()
227{
228    assert(!instructionBuffer.empty());
229    GPUDynInstPtr ii = instructionBuffer.front();
230
231    if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
232        IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
233
234        return true;
235    }
236
237    return false;
238}
239
240bool
241Wavefront::isOldestInstLMem()
242{
243    assert(!instructionBuffer.empty());
244    GPUDynInstPtr ii = instructionBuffer.front();
245
246    if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
247        IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
248
249        return true;
250    }
251
252    return false;
253}
254
255bool
256Wavefront::isOldestInstPrivMem()
257{
258    assert(!instructionBuffer.empty());
259    GPUDynInstPtr ii = instructionBuffer.front();
260
261    if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
262        IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
263
264        return true;
265    }
266
267    return false;
268}
269
270bool
271Wavefront::isOldestInstFlatMem()
272{
273    assert(!instructionBuffer.empty());
274    GPUDynInstPtr ii = instructionBuffer.front();
275
276    if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
277
278        return true;
279    }
280
281    return false;
282}
283
284// Return true if the Wavefront's instruction
285// buffer has branch instruction.
286bool
287Wavefront::instructionBufferHasBranch()
288{
289    for (auto it : instructionBuffer) {
290        GPUDynInstPtr ii = it;
291
292        if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
293            return true;
294        }
295    }
296
297    return false;
298}
299
300// Remap HSAIL register to physical VGPR.
301// HSAIL register = virtual register assigned to an operand by HLC compiler
302uint32_t
303Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
304{
305    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
306    // add the offset from where the VGPRs of the wavefront have been assigned
307    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
308    // HSAIL double precision (DP) register: calculate the physical VGPR index
309    // assuming that DP registers are placed after SP ones in the VRF. The DP
310    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
311    // the DP VGPR index before mapping it to the physical VRF address space
312    if (mode == 1 && size > 4) {
313        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
314    }
315
316    assert((startVgprIndex <= physicalVgprIndex) &&
317           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
318
319    // calculate absolute physical VGPR index
320    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
321}
322
323// Return true if this wavefront is ready
324// to execute an instruction of the specified type.
325int
326Wavefront::ready(itype_e type)
327{
328    // Check to make sure wave is running
329    if (status == S_STOPPED || status == S_RETURNING ||
330        instructionBuffer.empty()) {
331        return 0;
332    }
333
334    // Is the wave waiting at a barrier
335    if (stalledAtBarrier) {
336        if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
337                        computeUnit->getRefCounter(dispatchId, wgId))) {
338            // Are all threads at barrier?
339            return 0;
340        }
341        oldBarrierCnt = barrierCnt;
342        stalledAtBarrier = false;
343    }
344
345    // Read instruction
346    GPUDynInstPtr ii = instructionBuffer.front();
347
348    bool ready_inst M5_VAR_USED = false;
349    bool glbMemBusRdy = false;
350    bool glbMemIssueRdy = false;
351    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
352        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
353            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
354                glbMemBusRdy = true;
355            if (computeUnit->wfWait[j].prerdy())
356                glbMemIssueRdy = true;
357        }
358    }
359    bool locMemBusRdy = false;
360    bool locMemIssueRdy = false;
361    if (type == I_SHARED || type == I_FLAT) {
362        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
363            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
364                locMemBusRdy = true;
365            if (computeUnit->wfWait[j].prerdy())
366                locMemIssueRdy = true;
367        }
368    }
369
370    // The following code is very error prone and the entire process for
371    // checking readiness will be fixed eventually.  In the meantime, let's
372    // make sure that we do not silently let an instruction type slip
373    // through this logic and always return not ready.
374    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
375          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
376          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
377          ii->opType() == Enums::OT_KERN_READ ||
378          ii->opType() == Enums::OT_ARG ||
379          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
380          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
381          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
382          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
383          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
384        panic("next instruction: %s is of unknown type\n", ii->disassemble());
385    }
386
387    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
388            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
389
390    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
391        // Here for ALU instruction (barrier)
392        if (!computeUnit->wfWait[simdId].prerdy()) {
393            // Is wave slot free?
394            return 0;
395        }
396
397        // Are there in pipe or outstanding memory requests?
398        if ((outstandingReqs + memReqsInPipe) > 0) {
399            return 0;
400        }
401
402        ready_inst = true;
403    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
404        // Here for ALU instruction (nop)
405        if (!computeUnit->wfWait[simdId].prerdy()) {
406            // Is wave slot free?
407            return 0;
408        }
409
410        ready_inst = true;
411    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
412        // Here for ALU instruction (return)
413        if (!computeUnit->wfWait[simdId].prerdy()) {
414            // Is wave slot free?
415            return 0;
416        }
417
418        // Are there in pipe or outstanding memory requests?
419        if ((outstandingReqs + memReqsInPipe) > 0) {
420            return 0;
421        }
422
423        ready_inst = true;
424    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
425               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
426               ii->opType() == Enums::OT_KERN_READ ||
427               ii->opType() == Enums::OT_ARG)) {
428        // Here for ALU instruction (all others)
429        if (!computeUnit->wfWait[simdId].prerdy()) {
430            // Is alu slot free?
431            return 0;
432        }
433        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
434                    VrfAccessType::RD_WR)) {
435            return 0;
436        }
437
438        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
439            return 0;
440        }
441        ready_inst = true;
442    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
443               IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
444        // Here Global memory instruction
445        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
446            // Are there in pipe or outstanding global memory write requests?
447            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
448                return 0;
449            }
450        }
451
452        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
453            IS_OT_HIST_GM(ii->opType())) {
454            // Are there in pipe or outstanding global memory read requests?
455            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
456                return 0;
457        }
458
459        if (!glbMemIssueRdy) {
460            // Is WV issue slot free?
461            return 0;
462        }
463
464        if (!glbMemBusRdy) {
465            // Is there an available VRF->Global memory read bus?
466            return 0;
467        }
468
469        if (!computeUnit->globalMemoryPipe.
470            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
471            // Can we insert a new request to the Global Mem Request FIFO?
472            return 0;
473        }
474        // can we schedule source & destination operands on the VRF?
475        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
476                    VrfAccessType::RD_WR)) {
477            return 0;
478        }
479        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
480            return 0;
481        }
482        ready_inst = true;
483    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
484               IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
485        // Here for Shared memory instruction
486        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
487            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
488                return 0;
489            }
490        }
491
492        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
493            IS_OT_HIST_LM(ii->opType())) {
494            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
495                return 0;
496            }
497        }
498
499        if (!locMemBusRdy) {
500            // Is there an available VRF->LDS read bus?
501            return 0;
502        }
503        if (!locMemIssueRdy) {
504            // Is wave slot free?
505            return 0;
506        }
507
508        if (!computeUnit->localMemoryPipe.
509            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
510            // Can we insert a new request to the LDS Request FIFO?
511            return 0;
512        }
513        // can we schedule source & destination operands on the VRF?
514        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
515                    VrfAccessType::RD_WR)) {
516            return 0;
517        }
518        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
519            return 0;
520        }
521        ready_inst = true;
522    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
523               IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
524        // Here for Private memory instruction ------------------------    //
525        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
526            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
527                return 0;
528            }
529        }
530
531        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
532            IS_OT_HIST_PM(ii->opType())) {
533            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) {
534                return 0;
535            }
536        }
537
538        if (!glbMemBusRdy) {
539            // Is there an available VRF->Global memory read bus?
540            return 0;
541        }
542
543        if (!glbMemIssueRdy) {
544             // Is wave slot free?
545            return 0;
546        }
547
548        if (!computeUnit->globalMemoryPipe.
549            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
550            // Can we insert a new request to the Global Mem Request FIFO?
551            return 0;
552        }
553        // can we schedule source & destination operands on the VRF?
554        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
555                    VrfAccessType::RD_WR)) {
556            return 0;
557        }
558        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
559            return 0;
560        }
561        ready_inst = true;
562    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
563        if (!glbMemBusRdy) {
564            // Is there an available VRF->Global memory read bus?
565            return 0;
566        }
567
568        if (!locMemBusRdy) {
569            // Is there an available VRF->LDS read bus?
570            return 0;
571        }
572
573        if (!glbMemIssueRdy) {
574            // Is wave slot free?
575            return 0;
576        }
577
578        if (!locMemIssueRdy) {
579            return 0;
580        }
581        if (!computeUnit->globalMemoryPipe.
582            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
583            // Can we insert a new request to the Global Mem Request FIFO?
584            return 0;
585        }
586
587        if (!computeUnit->localMemoryPipe.
588            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
589            // Can we insert a new request to the LDS Request FIFO?
590            return 0;
591        }
592        // can we schedule source & destination operands on the VRF?
593        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
594                    VrfAccessType::RD_WR)) {
595            return 0;
596        }
597        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
598        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
599            return 0;
600        }
601        ready_inst = true;
602    } else {
603        return 0;
604    }
605
606    assert(ready_inst);
607
608    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
609            simdId, wfSlotId, ii->disassemble());
610    return 1;
611}
612
613void
614Wavefront::updateResources()
615{
616    // Get current instruction
617    GPUDynInstPtr ii = instructionBuffer.front();
618    assert(ii);
619    computeUnit->vrf[simdId]->updateResources(this, ii);
620    // Single precision ALU or Branch or Return or Special instruction
621    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
622        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
623        // FIXME: Kernel argument loads are currently treated as ALU operations
624        // since we don't send memory packets at execution. If we fix that then
625        // we should map them to one of the memory pipelines
626        ii->opType()==Enums::OT_KERN_READ ||
627        ii->opType()==Enums::OT_ARG ||
628        ii->opType()==Enums::OT_RET) {
629        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
630                                            ticks(computeUnit->spBypassLength()));
631        // this is to enforce a fixed number of cycles per issue slot per SIMD
632        computeUnit->wfWait[simdId].preset(computeUnit->shader->
633                                           ticks(computeUnit->issuePeriod));
634    } else if (ii->opType() == Enums::OT_BARRIER) {
635        computeUnit->wfWait[simdId].preset(computeUnit->shader->
636                                           ticks(computeUnit->issuePeriod));
637    } else if (ii->opType() == Enums::OT_FLAT_READ) {
638        assert(Enums::SC_NONE != ii->executedAs());
639        memReqsInPipe++;
640        rdGmReqsInPipe++;
641        if ( Enums::SC_SHARED == ii->executedAs() ) {
642            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
643                preset(computeUnit->shader->ticks(4));
644            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
645                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
646        } else {
647            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
648                preset(computeUnit->shader->ticks(4));
649            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
650                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
651        }
652    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
653        assert(Enums::SC_NONE != ii->executedAs());
654        memReqsInPipe++;
655        wrGmReqsInPipe++;
656        if (Enums::SC_SHARED == ii->executedAs()) {
657            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
658                preset(computeUnit->shader->ticks(8));
659            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
660                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
661        } else {
662            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
663                preset(computeUnit->shader->ticks(8));
664            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
665                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
666        }
667    } else if (IS_OT_READ_GM(ii->opType())) {
668        memReqsInPipe++;
669        rdGmReqsInPipe++;
670        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
671            preset(computeUnit->shader->ticks(4));
672        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
673            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
674    } else if (IS_OT_WRITE_GM(ii->opType())) {
675        memReqsInPipe++;
676        wrGmReqsInPipe++;
677        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
678            preset(computeUnit->shader->ticks(8));
679        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
680            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
681    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
682        memReqsInPipe++;
683        wrGmReqsInPipe++;
684        rdGmReqsInPipe++;
685        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
686            preset(computeUnit->shader->ticks(8));
687        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
688            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
689    } else if (IS_OT_READ_LM(ii->opType())) {
690        memReqsInPipe++;
691        rdLmReqsInPipe++;
692        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
693            preset(computeUnit->shader->ticks(4));
694        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
695            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
696    } else if (IS_OT_WRITE_LM(ii->opType())) {
697        memReqsInPipe++;
698        wrLmReqsInPipe++;
699        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
700            preset(computeUnit->shader->ticks(8));
701        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
702            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
703    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
704        memReqsInPipe++;
705        wrLmReqsInPipe++;
706        rdLmReqsInPipe++;
707        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
708            preset(computeUnit->shader->ticks(8));
709        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
710            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
711    } else if (IS_OT_READ_PM(ii->opType())) {
712        memReqsInPipe++;
713        rdGmReqsInPipe++;
714        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
715            preset(computeUnit->shader->ticks(4));
716        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
717            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
718    } else if (IS_OT_WRITE_PM(ii->opType())) {
719        memReqsInPipe++;
720        wrGmReqsInPipe++;
721        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
722            preset(computeUnit->shader->ticks(8));
723        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
724            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
725    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
726        memReqsInPipe++;
727        wrGmReqsInPipe++;
728        rdGmReqsInPipe++;
729        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
730            preset(computeUnit->shader->ticks(8));
731        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
732            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
733    }
734}
735
736void
737Wavefront::exec()
738{
739    // ---- Exit if wavefront is inactive ----------------------------- //
740
741    if (status == S_STOPPED || status == S_RETURNING ||
742        instructionBuffer.empty()) {
743        return;
744    }
745
746    // Get current instruction
747
748    GPUDynInstPtr ii = instructionBuffer.front();
749
750    const uint32_t old_pc = pc();
751    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
752            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
753            ii->disassemble(), old_pc);
754    ii->execute();
755    // access the VRF
756    computeUnit->vrf[simdId]->exec(ii, this);
757    srcRegOpDist.sample(ii->numSrcRegOperands());
758    dstRegOpDist.sample(ii->numDstRegOperands());
759    computeUnit->numInstrExecuted++;
760    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
761                                     computeUnit->lastExecCycle[simdId]);
762    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
763    if (pc() == old_pc) {
764        uint32_t new_pc = old_pc + 1;
765        // PC not modified by instruction, proceed to next or pop frame
766        pc(new_pc);
767        if (new_pc == rpc()) {
768            popFromReconvergenceStack();
769            discardFetch();
770        } else {
771            instructionBuffer.pop_front();
772        }
773    }
774
775    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
776        const int num_active_lanes = execMask().count();
777        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
778        computeUnit->numVecOpsExecuted += num_active_lanes;
779        if (isGmInstruction(ii)) {
780            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
781        } else if (isLmInstruction(ii)) {
782            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
783        }
784    }
785
786    // ---- Update Vector ALU pipeline and other resources ------------------ //
787    // Single precision ALU or Branch or Return or Special instruction
788    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
789        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
790        // FIXME: Kernel argument loads are currently treated as ALU operations
791        // since we don't send memory packets at execution. If we fix that then
792        // we should map them to one of the memory pipelines
793        ii->opType() == Enums::OT_KERN_READ ||
794        ii->opType() == Enums::OT_ARG ||
795        ii->opType() == Enums::OT_RET) {
796        computeUnit->aluPipe[simdId].set(computeUnit->shader->
797                                         ticks(computeUnit->spBypassLength()));
798
799        // this is to enforce a fixed number of cycles per issue slot per SIMD
800        computeUnit->wfWait[simdId].set(computeUnit->shader->
801                                        ticks(computeUnit->issuePeriod));
802    } else if (ii->opType() == Enums::OT_BARRIER) {
803        computeUnit->wfWait[simdId].set(computeUnit->shader->
804                                        ticks(computeUnit->issuePeriod));
805    } else if (ii->opType() == Enums::OT_FLAT_READ) {
806        assert(Enums::SC_NONE != ii->executedAs());
807
808        if (Enums::SC_SHARED == ii->executedAs()) {
809            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
810                set(computeUnit->shader->ticks(4));
811            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
812                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
813        } else {
814            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
815                set(computeUnit->shader->ticks(4));
816            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
817                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
818        }
819    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
820        assert(Enums::SC_NONE != ii->executedAs());
821        if (Enums::SC_SHARED == ii->executedAs()) {
822            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
823                set(computeUnit->shader->ticks(8));
824            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
825                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
826        } else {
827            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
828                set(computeUnit->shader->ticks(8));
829            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
830                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
831        }
832    } else if (IS_OT_READ_GM(ii->opType())) {
833        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
834            set(computeUnit->shader->ticks(4));
835        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
836            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
837    } else if (IS_OT_WRITE_GM(ii->opType())) {
838        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
839            set(computeUnit->shader->ticks(8));
840        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
841            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
842    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
843        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
844            set(computeUnit->shader->ticks(8));
845        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
846            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
847    } else if (IS_OT_READ_LM(ii->opType())) {
848        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
849            set(computeUnit->shader->ticks(4));
850        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
851            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
852    } else if (IS_OT_WRITE_LM(ii->opType())) {
853        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
854            set(computeUnit->shader->ticks(8));
855        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
856            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
857    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
858        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
859            set(computeUnit->shader->ticks(8));
860        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
861            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
862    }
863}
864
865bool
866Wavefront::waitingAtBarrier(int lane)
867{
868    return barCnt[lane] < maxBarCnt;
869}
870
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    // Push a new control-flow entry (PC, reconvergence PC, and the
    // execution mask of the lanes taking this path) onto the top of
    // the reconvergence stack. An all-zero mask means no lane would
    // execute the path, so pushing one is a programming error.
    assert(mask.count());
    // NOTE(review): the container is handed a raw `new` pointer here;
    // presumably reconvergenceStack owns the entry (e.g. holds smart
    // pointers) — verify against its declaration in wavefront.hh.
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}
878
void
Wavefront::popFromReconvergenceStack()
{
    // Pop the top reconvergence stack entry, logging the execution
    // mask and PC both before and after the pop so the WavefrontStack
    // debug trace shows the control-flow transition.
    assert(!reconvergenceStack.empty());

    // State of the wavefront (mask and PC of the entry being popped).
    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    // State after the pop: pc()/execMask() now read the new top entry.
    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}
896
897void
898Wavefront::discardFetch()
899{
900    instructionBuffer.clear();
901    dropFetch |=pendingFetch;
902}
903
904uint32_t
905Wavefront::pc() const
906{
907    return reconvergenceStack.back()->pc;
908}
909
910uint32_t
911Wavefront::rpc() const
912{
913    return reconvergenceStack.back()->rpc;
914}
915
916VectorMask
917Wavefront::execMask() const
918{
919    return reconvergenceStack.back()->execMask;
920}
921
922bool
923Wavefront::execMask(int lane) const
924{
925    return reconvergenceStack.back()->execMask[lane];
926}
927
928
929void
930Wavefront::pc(uint32_t new_pc)
931{
932    reconvergenceStack.back()->pc = new_pc;
933}
934
uint32_t
Wavefront::getStaticContextSize() const
{
    // Size in bytes of the fixed-layout portion of a wavefront's saved
    // context: the per-lane barrier counters, the scalar bookkeeping
    // fields, and a fixed-capacity (wfSize entries) image of the
    // reconvergence stack.
    // NOTE(review): getContext() additionally appends VGPR,
    // condition-register and LDS contents, which are not counted
    // here — confirm callers size their buffers accordingly.
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}
944
void
Wavefront::getContext(const void *out)
{
    // Serialize this wavefront's architectural state into the buffer
    // at `out` (which is written through despite the const-qualified
    // parameter — hence the cast below). Layout: per-lane barrier
    // counters, scalar bookkeeping fields, a fixed-size image of the
    // reconvergence stack, then VGPR, condition-register and
    // (optional) LDS contents. setContext() reads the same layout.
    // WARNING: saving the reconvergence stack pops every entry, so
    // this call leaves the stack empty.
    uint8_t *iter = (uint8_t *)out;
    // Per-lane barrier counters.
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    // Scalar fields; write order must match the read order in
    // setContext().
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    // NOTE(review): the mask is stored as its 64-bit integer image and
    // the cursor advances by sizeof(unsigned long long); setContext()
    // advances by sizeof(initMask) instead — confirm both sizes agree
    // for the VectorMask type in use.
    *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    // Drain the reconvergence stack into a fixed number of slots,
    // padding unused slots with an all-ones sentinel entry that
    // setContext() recognizes and skips.
    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                    std::numeric_limits<uint32_t>::max(),
                                    std::numeric_limits<uint64_t>::max()};
    // NOTE(review): entries are written top-of-stack first, while
    // setContext() re-pushes them in the order read — a multi-entry
    // stack would be rebuilt reversed. Presumably only a single live
    // entry exists at context-save time; verify that invariant.
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    // Per-lane 32-bit VGPR contents.
    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                            read<uint32_t>(vgprIdx,lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // Per-lane 64-bit VGPR contents.
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                            read<uint64_t>(vgprIdx,lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // Per-lane condition-register contents.
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}
1011
void
Wavefront::setContext(const void *in)
{
    // Restore this wavefront's architectural state from the buffer at
    // `in`, reading fields in exactly the order getContext() wrote
    // them: per-lane barrier counters, scalar bookkeeping fields, the
    // fixed-size reconvergence stack image, then VGPR,
    // condition-register and (optional) LDS contents.
    uint8_t *iter = (uint8_t *)in;
    // Per-lane barrier counters.
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    // Scalar fields.
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    // NOTE(review): getContext() advances its cursor by
    // sizeof(initMask.to_ullong()) here, while this side advances by
    // sizeof(initMask) — confirm the two agree for VectorMask, or the
    // remaining fields will be read from the wrong offsets.
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    // Rebuild the reconvergence stack, skipping the all-ones sentinel
    // slots getContext() used to pad unused entries.
    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    // Per-lane 32-bit VGPR contents.
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    // Per-lane 64-bit VGPR contents.
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    // Per-lane condition-register contents.
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /** Restoring LDS contents */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}
1069
1070void
1071Wavefront::computeActualWgSz(NDRange *ndr)
1072{
1073    actualWgSzTotal = 1;
1074    for (int d = 0; d < 3; ++d) {
1075        actualWgSz[d] = std::min(workGroupSz[d],
1076                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
1077        actualWgSzTotal *= actualWgSz[d];
1078    }
1079}
1080