// wavefront.cc — revision 11308
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#include "gpu-compute/wavefront.hh"
37
38#include "debug/GPUExec.hh"
39#include "debug/WavefrontStack.hh"
40#include "gpu-compute/code_enums.hh"
41#include "gpu-compute/compute_unit.hh"
42#include "gpu-compute/gpu_dyn_inst.hh"
43#include "gpu-compute/shader.hh"
44#include "gpu-compute/vector_register_file.hh"
45
46Wavefront*
47WavefrontParams::create()
48{
49    return new Wavefront(this);
50}
51
// Construct a wavefront in the stopped state with all request counters,
// barrier bookkeeping, and fetch state cleared. Register-file sizing is
// deferred to resizeRegFiles().
Wavefront::Wavefront(const Params *p)
  : SimObject(p), callArgMem(nullptr)
{
    last_trace = 0;
    // position of this wavefront within the compute unit
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;

    // wave starts idle until start() is called
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    // zero all in-flight / in-pipe memory request counters consulted by
    // ready() when deciding whether the wave may issue
    outstanding_reqs = 0;
    mem_reqs_in_pipe = 0;
    outstanding_reqs_wr_gm = 0;
    outstanding_reqs_wr_lm = 0;
    outstanding_reqs_rd_gm = 0;
    outstanding_reqs_rd_lm = 0;
    rd_lm_reqs_in_pipe = 0;
    rd_gm_reqs_in_pipe = 0;
    wr_lm_reqs_in_pipe = 0;
    wr_gm_reqs_in_pipe = 0;

    // barrier synchronization state
    barrier_cnt = 0;
    old_barrier_cnt = 0;
    stalledAtBarrier = false;

    mem_trace_busy = 0;
    // all-ones sentinel: no VGPR/DGPR trace tick recorded yet
    // (TODO confirm intended meaning of these trace counters)
    old_vgpr_tcnt = 0xffffffffffffffffll;
    old_dgpr_tcnt = 0xffffffffffffffffll;

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
}
87
// Register this wavefront's statistics with the stats framework. Called
// once by the framework before simulation begins.
void
Wavefront::regStats()
{
    // distribution of executed instructions by source operand count
    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    // distribution of executed instructions by destination operand count
    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}
125
// SimObject init hook: reset the VGPR allocation bookkeeping so the
// wavefront starts with no vector registers reserved.
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}
132
// Size the per-wavefront register state: the condition register file plus
// the single- and double-precision VGPR name-space bounds used by remap().
void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}
140
141Wavefront::~Wavefront()
142{
143    if (callArgMem)
144        delete callArgMem;
145}
146
// Activate the wavefront: record its dynamic ID and base pointer, then
// mark it runnable so exec()/ready() will consider it.
void
Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr)
{
    wfDynId = _wfDynId;
    base_ptr = _base_ptr;
    status = S_RUNNING;
}
154
155bool
156Wavefront::isGmInstruction(GPUDynInstPtr ii)
157{
158    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
159        IS_OT_ATOMIC_PM(ii->opType())) {
160        return true;
161    }
162
163    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
164        IS_OT_ATOMIC_GM(ii->opType())) {
165
166        return true;
167    }
168
169    if (IS_OT_FLAT(ii->opType())) {
170        return true;
171    }
172
173    return false;
174}
175
176bool
177Wavefront::isLmInstruction(GPUDynInstPtr ii)
178{
179    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
180        IS_OT_ATOMIC_LM(ii->opType())) {
181        return true;
182    }
183
184    return false;
185}
186
187bool
188Wavefront::isOldestInstALU()
189{
190    assert(!instructionBuffer.empty());
191    GPUDynInstPtr ii = instructionBuffer.front();
192
193    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
194        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
195        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
196        ii->opType() == Enums::OT_KERN_READ)) {
197        return true;
198    }
199
200    return false;
201}
202
203bool
204Wavefront::isOldestInstBarrier()
205{
206    assert(!instructionBuffer.empty());
207    GPUDynInstPtr ii = instructionBuffer.front();
208
209    if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
210        return true;
211    }
212
213    return false;
214}
215
216bool
217Wavefront::isOldestInstGMem()
218{
219    assert(!instructionBuffer.empty());
220    GPUDynInstPtr ii = instructionBuffer.front();
221
222    if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
223        IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
224
225        return true;
226    }
227
228    return false;
229}
230
231bool
232Wavefront::isOldestInstLMem()
233{
234    assert(!instructionBuffer.empty());
235    GPUDynInstPtr ii = instructionBuffer.front();
236
237    if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
238        IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
239
240        return true;
241    }
242
243    return false;
244}
245
246bool
247Wavefront::isOldestInstPrivMem()
248{
249    assert(!instructionBuffer.empty());
250    GPUDynInstPtr ii = instructionBuffer.front();
251
252    if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
253        IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
254
255        return true;
256    }
257
258    return false;
259}
260
261bool
262Wavefront::isOldestInstFlatMem()
263{
264    assert(!instructionBuffer.empty());
265    GPUDynInstPtr ii = instructionBuffer.front();
266
267    if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
268
269        return true;
270    }
271
272    return false;
273}
274
275// Return true if the Wavefront's instruction
276// buffer has branch instruction.
277bool
278Wavefront::instructionBufferHasBranch()
279{
280    for (auto it : instructionBuffer) {
281        GPUDynInstPtr ii = it;
282
283        if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
284            return true;
285        }
286    }
287
288    return false;
289}
290
// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
//
// vgprIndex: virtual register index within this wavefront's allocation
// size:      operand size in bytes; > 4 selects the DP name space
// mode:      1 selects HSAIL mode, where SP and DP VGPRs use separate
//            name spaces
// Returns the physical VGPR index within this SIMD unit's VRF.
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    // remapped index must stay within this wavefront's reserved range
    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}
313
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
//
// Returns 1 when the oldest instruction in the instruction buffer may be
// issued this cycle to a pipeline of the requested type, 0 otherwise.
// The checks cover, in order: wave status, barrier state, execution-bus
// and issue-slot availability, ordering against in-flight memory requests,
// request-FIFO space, VRF port scheduling, and operand dependencies.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
                        computeUnit->getRefCounter(dispatchid, wg_id))) {
            // Are all threads at barrier?
            return 0;
        }
        // barrier released: record the count we synchronized at
        old_barrier_cnt = barrier_cnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    // ready_inst is only read by the assert at the end; M5_VAR_USED keeps
    // builds with asserts disabled from warning about an unused variable.
    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        // any free VRF->global bus and any free wave issue slot suffice
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED) {
        // NOTE(review): wfWait is indexed by the local-mem loop index here,
        // the same range the global-mem loop above scans — confirm the
        // global and local issue slots are meant to share that range.
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually.  In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        // are all operand dependencies (RAW, WAW, WAR) satisfied?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here Global memory instruction
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for Shared memory instruction
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            // reads must wait for earlier writes to drain (ordering)
            if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            // writes must wait for earlier reads to drain (ordering)
            if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for Private memory instruction ------------------------    //
        // private memory shares the global-memory pipeline and counters
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
             // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        // flat accesses may resolve to either global or shared memory, so
        // both pipelines must have capacity before issue
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // instruction type does not match the requested pipeline type
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());

    return 1;
}
604
// Reserve pipeline resources for the oldest buffered instruction ahead of
// issue. preset() marks execution buses and issue slots as spoken for in
// the scheduling stage; exec() later commits the same resources with
// set(). Memory instructions also increment the *_reqs_in_pipe counters
// that ready() consults for ordering.
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType()==Enums::OT_KERN_READ ||
        ii->opType()==Enums::OT_ARG ||
        ii->opType()==Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        // flat ops resolve at execute time to either shared or global memory
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            // NOTE(review): reads reserve the bus for 4 ticks, writes and
            // atomics below for 8 — presumably operand-read vs data-write
            // cost; confirm against the pipeline model
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        // atomics count as both a read and a write
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        // private memory traffic uses the global-memory pipeline
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
727
// Execute the oldest instruction in the instruction buffer: run it,
// advance the PC / reconvergence stack, collect statistics, and commit
// (with set()) the pipeline resources that updateResources() reserved
// with preset().
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        // the PC advances by 1, so it appears to be an instruction index
        // rather than a byte address — TODO confirm
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point: restore the outer frame and
            // discard instructions fetched along the divergent path
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        // divergence statistics: how many lanes were active this issue
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        // flat ops have resolved by now to shared or global memory
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
856
// True if the given lane has reached fewer barriers than the wavefront's
// current maximum, i.e. it is still waiting for others at the barrier.
bool
Wavefront::waitingAtBarrier(int lane)
{
    return bar_cnt[lane] < max_bar_cnt;
}
862
// Push a new control-flow frame (pc, reconvergence pc, active-lane mask)
// onto the reconvergence stack. The mask must have at least one active
// lane. Note: the pc/rpc parameters shadow the pc()/rpc() accessors.
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
}
870
// Pop the top control-flow frame from the reconvergence stack, logging
// the execution mask and PC before and after under WavefrontStack.
// NOTE(review): the trailing DPRINTF reads the new top via pc() and
// execMask(), so the stack is assumed non-empty after the pop.
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}
888
889void
890Wavefront::discardFetch()
891{
892    instructionBuffer.clear();
893    dropFetch |=pendingFetch;
894}
895
// Current program counter: the pc of the active (top) reconvergence
// stack entry.
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.top()->pc;
}
901
// Reconvergence pc of the active (top) stack entry: where the divergent
// paths of the current frame rejoin.
uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.top()->rpc;
}
907
// Active-lane mask of the current (top) reconvergence stack entry,
// returned by value.
VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.top()->execMask;
}
913
// True if the given lane is active in the current (top) stack entry.
bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.top()->execMask[lane];
}
919
920
// Set the program counter of the current (top) reconvergence stack entry.
void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.top()->pc = new_pc;
}
926