wavefront.cc revision 11345:b6a66a90e0a1
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/code_enums.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

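// Factory hook on the generated WavefrontParams class: gem5's configuration
// layer calls create() on the params object to build the Wavefront SimObject.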
Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

Wavefront::Wavefront(const Params *p)
  : SimObject(p), callArgMem(nullptr)
{
    last_trace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;

    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstanding_reqs = 0;
    mem_reqs_in_pipe = 0;
    outstanding_reqs_wr_gm = 0;
    outstanding_reqs_wr_lm = 0;
    outstanding_reqs_rd_gm = 0;
    outstanding_reqs_rd_lm = 0;
    rd_lm_reqs_in_pipe = 0;
    rd_gm_reqs_in_pipe = 0;
    wr_lm_reqs_in_pipe = 0;
    wr_gm_reqs_in_pipe = 0;

    barrier_cnt = 0;
    old_barrier_cnt = 0;
    stalledAtBarrier = false;

    mem_trace_busy = 0;
    old_vgpr_tcnt = 0xffffffffffffffffll;
    old_dgpr_tcnt = 0xffffffffffffffffll;

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
}

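// Register this wavefront's statistics (operand-count distributions and
// dependency/port stall counters) with gem5's statistics framework; called
// once while statistics are being registered.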
void
Wavefront::regStats()
{
    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
}

void
Wavefront::start(uint64_t _wfDynId, uint64_t _base_ptr)
{
    wfDynId = _wfDynId;
    base_ptr = _base_ptr;
    status = S_RUNNING;
}

bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
        IS_OT_ATOMIC_PM(ii->opType())) {
        return true;
    }

    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
        IS_OT_ATOMIC_GM(ii->opType())) {
        return true;
    }

    if (IS_OT_FLAT(ii->opType())) {
        return true;
    }

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
        IS_OT_ATOMIC_LM(ii->opType())) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
        ii->opType() == Enums::OT_KERN_READ)) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
        IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {

        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
        IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {

        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
        IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {

        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {

        return true;
    }

    return false;
}

// Return true if the Wavefront's instruction
// buffer contains a branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
            return true;
        }
    }

    return false;
}

// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
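// Worked example (numbers assumed for illustration): with startVgprIndex = 8
// and maxSpVgprs = 4, single-precision register v2 maps to physical VGPR
// 8 + 2 = 10, while double-precision register d1 (mode == 1, size > 4) maps
// to 8 + 4 + 2*1 = 14; both results are taken modulo the SIMD unit's VRF size.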
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return true if this wavefront is ready
// to execute an instruction of the specified type.
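// Readiness requires that (1) the wave is running and has a buffered
// instruction, (2) any barrier it is stalled on has been satisfied, and
// (3) the issue slot, pipeline busses, request FIFOs, and VRF ports/operands
// needed by the oldest instruction's type are all available.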
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
                        computeUnit->getRefCounter(dispatchid, wg_id))) {
            // Are all threads at barrier?
            return 0;
        }
        old_barrier_cnt = barrier_cnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually.  In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here Global memory instruction
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for Shared memory instruction
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for Private memory instruction ------------------------    //
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR dependencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}

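// Pre-reserve ("preset") the VRF ports, pipeline busses, and wave issue slot
// that the oldest instruction will occupy, and bump the in-pipe memory
// request counters that gate later readiness checks; exec() later performs
// the corresponding "set" calls on the same resources.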
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType()==Enums::OT_KERN_READ ||
        ii->opType()==Enums::OT_ARG ||
        ii->opType()==Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

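// Execute the oldest buffered instruction: run its semantics against the VRF,
// update execution statistics, advance the PC (popping a reconvergence stack
// frame when the next PC equals the reconvergence PC), and charge the
// pipeline resources the instruction occupies.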
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

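// A lane is still waiting at a barrier while its per-lane barrier count
// trails the wavefront's maximum barrier count.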
bool
Wavefront::waitingAtBarrier(int lane)
{
    return bar_cnt[lane] < max_bar_cnt;
}

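// Push a new divergence frame onto the reconvergence stack: the PC of the
// path being taken, the reconvergence PC where the divergent paths rejoin,
// and the mask of lanes that are active along this path.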
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
}

void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}

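// Throw away everything already buffered for this wavefront and flag any
// fetch still in flight so its response is dropped when it returns.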
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

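// The wavefront's current PC, reconvergence PC, and execution mask are all
// read from (and the PC written back to) the top entry of the reconvergence
// stack.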
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.top()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.top()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.top()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.top()->execMask[lane];
}


void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.top()->pc = new_pc;
}