wavefront.cc revision 11523
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#include "gpu-compute/wavefront.hh"
37
38#include "debug/GPUExec.hh"
39#include "debug/WavefrontStack.hh"
40#include "gpu-compute/code_enums.hh"
41#include "gpu-compute/compute_unit.hh"
42#include "gpu-compute/gpu_dyn_inst.hh"
43#include "gpu-compute/shader.hh"
44#include "gpu-compute/vector_register_file.hh"
45
46Wavefront*
47WavefrontParams::create()
48{
49    return new Wavefront(this);
50}
51
52Wavefront::Wavefront(const Params *p)
53  : SimObject(p), callArgMem(nullptr)
54{
55    last_trace = 0;
56    simdId = p->simdId;
57    wfSlotId = p->wf_slot_id;
58
59    status = S_STOPPED;
60    reservedVectorRegs = 0;
61    startVgprIndex = 0;
62    outstanding_reqs = 0;
63    mem_reqs_in_pipe = 0;
64    outstanding_reqs_wr_gm = 0;
65    outstanding_reqs_wr_lm = 0;
66    outstanding_reqs_rd_gm = 0;
67    outstanding_reqs_rd_lm = 0;
68    rd_lm_reqs_in_pipe = 0;
69    rd_gm_reqs_in_pipe = 0;
70    wr_lm_reqs_in_pipe = 0;
71    wr_gm_reqs_in_pipe = 0;
72
73    barrier_cnt = 0;
74    old_barrier_cnt = 0;
75    stalledAtBarrier = false;
76
77    mem_trace_busy = 0;
78    old_vgpr_tcnt = 0xffffffffffffffffll;
79    old_dgpr_tcnt = 0xffffffffffffffffll;
80
81    pendingFetch = false;
82    dropFetch = false;
83    condRegState = new ConditionRegisterState();
84    maxSpVgprs = 0;
85    maxDpVgprs = 0;
86}
87
88void
89Wavefront::regStats()
90{
91    SimObject::regStats();
92
93    srcRegOpDist
94        .init(0, 4, 2)
95        .name(name() + ".src_reg_operand_dist")
96        .desc("number of executed instructions with N source register operands")
97        ;
98
99    dstRegOpDist
100        .init(0, 3, 2)
101        .name(name() + ".dst_reg_operand_dist")
102        .desc("number of executed instructions with N destination register "
103              "operands")
104        ;
105
106    // FIXME: the name of the WF needs to be unique
107    numTimesBlockedDueWAXDependencies
108        .name(name() + ".timesBlockedDueWAXDependencies")
109        .desc("number of times the wf's instructions are blocked due to WAW "
110              "or WAR dependencies")
111        ;
112
113    // FIXME: the name of the WF needs to be unique
114    numTimesBlockedDueRAWDependencies
115        .name(name() + ".timesBlockedDueRAWDependencies")
116        .desc("number of times the wf's instructions are blocked due to RAW "
117              "dependencies")
118        ;
119
120    // FIXME: the name of the WF needs to be unique
121    numTimesBlockedDueVrfPortAvail
122        .name(name() + ".timesBlockedDueVrfPortAvail")
123        .desc("number of times instructions are blocked due to VRF port "
124              "availability")
125        ;
126}
127
128void
129Wavefront::init()
130{
131    reservedVectorRegs = 0;
132    startVgprIndex = 0;
133}
134
135void
136Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
137{
138    condRegState->init(num_cregs);
139    maxSpVgprs = num_sregs;
140    maxDpVgprs = num_dregs;
141}
142
143Wavefront::~Wavefront()
144{
145    if (callArgMem)
146        delete callArgMem;
147}
148
149void
150Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr)
151{
152    wfDynId = _wfDynId;
153    base_ptr = _base_ptr;
154    status = S_RUNNING;
155}
156
157bool
158Wavefront::isGmInstruction(GPUDynInstPtr ii)
159{
160    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
161        IS_OT_ATOMIC_PM(ii->opType())) {
162        return true;
163    }
164
165    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
166        IS_OT_ATOMIC_GM(ii->opType())) {
167        return true;
168    }
169
170    if (IS_OT_FLAT(ii->opType())) {
171        return true;
172    }
173
174    return false;
175}
176
177bool
178Wavefront::isLmInstruction(GPUDynInstPtr ii)
179{
180    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
181        IS_OT_ATOMIC_LM(ii->opType())) {
182        return true;
183    }
184
185    return false;
186}
187
188bool
189Wavefront::isOldestInstALU()
190{
191    assert(!instructionBuffer.empty());
192    GPUDynInstPtr ii = instructionBuffer.front();
193
194    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
195        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
196        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
197        ii->opType() == Enums::OT_KERN_READ)) {
198        return true;
199    }
200
201    return false;
202}
203
204bool
205Wavefront::isOldestInstBarrier()
206{
207    assert(!instructionBuffer.empty());
208    GPUDynInstPtr ii = instructionBuffer.front();
209
210    if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
211        return true;
212    }
213
214    return false;
215}
216
217bool
218Wavefront::isOldestInstGMem()
219{
220    assert(!instructionBuffer.empty());
221    GPUDynInstPtr ii = instructionBuffer.front();
222
223    if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
224        IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
225
226        return true;
227    }
228
229    return false;
230}
231
232bool
233Wavefront::isOldestInstLMem()
234{
235    assert(!instructionBuffer.empty());
236    GPUDynInstPtr ii = instructionBuffer.front();
237
238    if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
239        IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
240
241        return true;
242    }
243
244    return false;
245}
246
247bool
248Wavefront::isOldestInstPrivMem()
249{
250    assert(!instructionBuffer.empty());
251    GPUDynInstPtr ii = instructionBuffer.front();
252
253    if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
254        IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
255
256        return true;
257    }
258
259    return false;
260}
261
262bool
263Wavefront::isOldestInstFlatMem()
264{
265    assert(!instructionBuffer.empty());
266    GPUDynInstPtr ii = instructionBuffer.front();
267
268    if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
269
270        return true;
271    }
272
273    return false;
274}
275
276// Return true if the Wavefront's instruction
277// buffer has branch instruction.
278bool
279Wavefront::instructionBufferHasBranch()
280{
281    for (auto it : instructionBuffer) {
282        GPUDynInstPtr ii = it;
283
284        if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
285            return true;
286        }
287    }
288
289    return false;
290}
291
292// Remap HSAIL register to physical VGPR.
293// HSAIL register = virtual register assigned to an operand by HLC compiler
294uint32_t
295Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
296{
297    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
298    // add the offset from where the VGPRs of the wavefront have been assigned
299    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
300    // HSAIL double precision (DP) register: calculate the physical VGPR index
301    // assuming that DP registers are placed after SP ones in the VRF. The DP
302    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
303    // the DP VGPR index before mapping it to the physical VRF address space
304    if (mode == 1 && size > 4) {
305        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
306    }
307
308    assert((startVgprIndex <= physicalVgprIndex) &&
309           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
310
311    // calculate absolute physical VGPR index
312    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
313}
314
// Return non-zero if this wavefront's oldest buffered instruction is of
// the specified type AND every resource it needs to issue this cycle is
// available: barrier released (if stalled), pipe bus free, issue slot
// free, request FIFO writable, VRF ports schedulable, and operand
// dependencies (RAW/WAW/WAR) satisfied.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
                        computeUnit->getRefCounter(dispatchid, wg_id))) {
            // Are all threads at barrier?
            return 0;
        }
        // every wave of the work-group has arrived: release this wave
        old_barrier_cnt = barrier_cnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    // global/flat/private ops all use the global memory pipeline: any free
    // VRF->global bus and any free issue slot suffices
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    // shared/flat ops also need the local memory (LDS) pipeline
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually.  In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        // a barrier may not issue until all of this wave's memory traffic
        // has drained
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        // a return must also wait for all memory traffic to drain
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        // are all register dependencies (RAW/WAW/WAR) satisfied?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here Global memory instruction
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            // reads/atomics are ordered behind pending writes
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            // writes/atomics are ordered behind pending reads
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for Shared memory instruction
        // same ordering rules as global memory, but for the LDS pipeline
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for Private memory instruction ------------------------    //
        // private-segment traffic shares the global memory counters/pipes
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
             // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        // a FLAT op may resolve to either segment, so it needs both the
        // global and the LDS pipelines to be ready
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // oldest instruction does not match the requested type
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}
604
// Reserve (via preset()) the pipeline resources the oldest buffered
// instruction will consume, and bump the in-pipe memory request counters.
// The preset() calls here mirror the set() calls performed later in
// exec() — presumably preset() marks the resource busy for scheduling
// before the instruction actually executes (TODO confirm against the
// WaitClass implementation).
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType()==Enums::OT_KERN_READ ||
        ii->opType()==Enums::OT_ARG ||
        ii->opType()==Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        // barriers only occupy the issue slot
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        // FLAT read: segment must already be resolved; reserve the LDS or
        // global pipeline accordingly (4 ticks for reads)
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        // FLAT write: as above but writes reserve the bus for 8 ticks
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        // atomics count as both a read and a write in flight
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        // private-segment traffic uses the global memory pipeline/counters
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
727
// Execute the oldest buffered instruction for this wave: run it, access
// the VRF, update execution statistics, advance the PC (popping the
// reconvergence stack at a reconvergence point), and mark the pipeline
// resources it occupies (via set(), matching updateResources()'s preset()).
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    // sample operand-count distributions and execution-rate statistics
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence PC: restore the outer stack frame
            // and discard any fetched-ahead instructions
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        // track divergence: how many lanes were active for this instruction
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        // FLAT read resolved to LDS or global: 4-tick bus occupancy
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        // FLAT write resolved to LDS or global: 8-tick bus occupancy
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
856
857bool
858Wavefront::waitingAtBarrier(int lane)
859{
860    return bar_cnt[lane] < max_bar_cnt;
861}
862
863void
864Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
865                                    const VectorMask& mask)
866{
867    assert(mask.count());
868    reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
869}
870
// Pop the innermost reconvergence stack frame, restoring the outer
// frame's PC and execution mask. Traces the mask/PC transition before
// and after the pop.
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    // log state of the frame being discarded
    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop();

    // log state of the frame now on top
    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}
888
889void
890Wavefront::discardFetch()
891{
892    instructionBuffer.clear();
893    dropFetch |=pendingFetch;
894}
895
896uint32_t
897Wavefront::pc() const
898{
899    return reconvergenceStack.top()->pc;
900}
901
902uint32_t
903Wavefront::rpc() const
904{
905    return reconvergenceStack.top()->rpc;
906}
907
908VectorMask
909Wavefront::execMask() const
910{
911    return reconvergenceStack.top()->execMask;
912}
913
914bool
915Wavefront::execMask(int lane) const
916{
917    return reconvergenceStack.top()->execMask[lane];
918}
919
920
921void
922Wavefront::pc(uint32_t new_pc)
923{
924    reconvergenceStack.top()->pc = new_pc;
925}
926