wavefront.cc (11694:c3b4d57a15c5) wavefront.cc (11695:0a65922d564d)
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#include "gpu-compute/wavefront.hh"
37
38#include "debug/GPUExec.hh"
39#include "debug/WavefrontStack.hh"
40#include "gpu-compute/compute_unit.hh"
41#include "gpu-compute/gpu_dyn_inst.hh"
42#include "gpu-compute/shader.hh"
43#include "gpu-compute/vector_register_file.hh"
44
45Wavefront*
46WavefrontParams::create()
47{
48 return new Wavefront(this);
49}
50
51Wavefront::Wavefront(const Params *p)
52 : SimObject(p), callArgMem(nullptr)
53{
54 lastTrace = 0;
55 simdId = p->simdId;
56 wfSlotId = p->wf_slot_id;
57 status = S_STOPPED;
58 reservedVectorRegs = 0;
59 startVgprIndex = 0;
60 outstandingReqs = 0;
61 memReqsInPipe = 0;
62 outstandingReqsWrGm = 0;
63 outstandingReqsWrLm = 0;
64 outstandingReqsRdGm = 0;
65 outstandingReqsRdLm = 0;
66 rdLmReqsInPipe = 0;
67 rdGmReqsInPipe = 0;
68 wrLmReqsInPipe = 0;
69 wrGmReqsInPipe = 0;
70
71 barrierCnt = 0;
72 oldBarrierCnt = 0;
73 stalledAtBarrier = false;
74
75 memTraceBusy = 0;
76 oldVgprTcnt = 0xffffffffffffffffll;
77 oldDgprTcnt = 0xffffffffffffffffll;
78 oldVgpr.resize(p->wfSize);
79
80 pendingFetch = false;
81 dropFetch = false;
82 condRegState = new ConditionRegisterState();
83 maxSpVgprs = 0;
84 maxDpVgprs = 0;
85 lastAddr.resize(p->wfSize);
86 workItemFlatId.resize(p->wfSize);
87 oldDgpr.resize(p->wfSize);
88 barCnt.resize(p->wfSize);
89 for (int i = 0; i < 3; ++i) {
90 workItemId[i].resize(p->wfSize);
91 }
92}
93
94void
95Wavefront::regStats()
96{
97 SimObject::regStats();
98
99 srcRegOpDist
100 .init(0, 4, 2)
101 .name(name() + ".src_reg_operand_dist")
102 .desc("number of executed instructions with N source register operands")
103 ;
104
105 dstRegOpDist
106 .init(0, 3, 2)
107 .name(name() + ".dst_reg_operand_dist")
108 .desc("number of executed instructions with N destination register "
109 "operands")
110 ;
111
112 // FIXME: the name of the WF needs to be unique
113 numTimesBlockedDueWAXDependencies
114 .name(name() + ".timesBlockedDueWAXDependencies")
115 .desc("number of times the wf's instructions are blocked due to WAW "
116 "or WAR dependencies")
117 ;
118
119 // FIXME: the name of the WF needs to be unique
120 numTimesBlockedDueRAWDependencies
121 .name(name() + ".timesBlockedDueRAWDependencies")
122 .desc("number of times the wf's instructions are blocked due to RAW "
123 "dependencies")
124 ;
125
126 // FIXME: the name of the WF needs to be unique
127 numTimesBlockedDueVrfPortAvail
128 .name(name() + ".timesBlockedDueVrfPortAvail")
129 .desc("number of times instructions are blocked due to VRF port "
130 "availability")
131 ;
132}
133
134void
135Wavefront::init()
136{
137 reservedVectorRegs = 0;
138 startVgprIndex = 0;
139}
140
141void
142Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
143{
144 condRegState->init(num_cregs);
145 maxSpVgprs = num_sregs;
146 maxDpVgprs = num_dregs;
147}
148
149Wavefront::~Wavefront()
150{
151 if (callArgMem)
152 delete callArgMem;
153 delete condRegState;
154}
155
156void
157Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
158{
159 wfDynId = _wf_dyn_id;
160 basePtr = _base_ptr;
161 status = S_RUNNING;
162}
163
164bool
165Wavefront::isGmInstruction(GPUDynInstPtr ii)
166{
167 if (ii->isGlobalMem() || ii->isFlat())
168 return true;
169
170 return false;
171}
172
173bool
174Wavefront::isLmInstruction(GPUDynInstPtr ii)
175{
176 if (ii->isLocalMem()) {
177 return true;
178 }
179
180 return false;
181}
182
183bool
184Wavefront::isOldestInstALU()
185{
186 assert(!instructionBuffer.empty());
187 GPUDynInstPtr ii = instructionBuffer.front();
188
189 if (status != S_STOPPED && (ii->isNop() ||
190 ii->isReturn() || ii->isBranch() ||
191 ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
192 return true;
193 }
194
195 return false;
196}
197
198bool
199Wavefront::isOldestInstBarrier()
200{
201 assert(!instructionBuffer.empty());
202 GPUDynInstPtr ii = instructionBuffer.front();
203
204 if (status != S_STOPPED && ii->isBarrier()) {
205 return true;
206 }
207
208 return false;
209}
210
211bool
212Wavefront::isOldestInstGMem()
213{
214 assert(!instructionBuffer.empty());
215 GPUDynInstPtr ii = instructionBuffer.front();
216
217 if (status != S_STOPPED && ii->isGlobalMem()) {
218 return true;
219 }
220
221 return false;
222}
223
224bool
225Wavefront::isOldestInstLMem()
226{
227 assert(!instructionBuffer.empty());
228 GPUDynInstPtr ii = instructionBuffer.front();
229
230 if (status != S_STOPPED && ii->isLocalMem()) {
231 return true;
232 }
233
234 return false;
235}
236
237bool
238Wavefront::isOldestInstPrivMem()
239{
240 assert(!instructionBuffer.empty());
241 GPUDynInstPtr ii = instructionBuffer.front();
242
243 if (status != S_STOPPED && ii->isPrivateSeg()) {
244 return true;
245 }
246
247 return false;
248}
249
250bool
251Wavefront::isOldestInstFlatMem()
252{
253 assert(!instructionBuffer.empty());
254 GPUDynInstPtr ii = instructionBuffer.front();
255
256 if (status != S_STOPPED && ii->isFlat()) {
257 return true;
258 }
259
260 return false;
261}
262
263// Return true if the Wavefront's instruction
264// buffer has branch instruction.
265bool
266Wavefront::instructionBufferHasBranch()
267{
268 for (auto it : instructionBuffer) {
269 GPUDynInstPtr ii = it;
270
271 if (ii->isReturn() || ii->isBranch()) {
272 return true;
273 }
274 }
275
276 return false;
277}
278
279// Remap HSAIL register to physical VGPR.
280// HSAIL register = virtual register assigned to an operand by HLC compiler
281uint32_t
282Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
283{
284 assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
285 // add the offset from where the VGPRs of the wavefront have been assigned
286 uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
287 // HSAIL double precision (DP) register: calculate the physical VGPR index
288 // assuming that DP registers are placed after SP ones in the VRF. The DP
289 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
290 // the DP VGPR index before mapping it to the physical VRF address space
291 if (mode == 1 && size > 4) {
292 physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
293 }
294
295 assert((startVgprIndex <= physicalVgprIndex) &&
296 (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
297
298 // calculate absolute physical VGPR index
299 return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
300}
301
302// Return true if this wavefront is ready
303// to execute an instruction of the specified type.
304int
305Wavefront::ready(itype_e type)
306{
307 // Check to make sure wave is running
308 if (status == S_STOPPED || status == S_RETURNING ||
309 instructionBuffer.empty()) {
310 return 0;
311 }
312
313 // Is the wave waiting at a barrier
314 if (stalledAtBarrier) {
315 if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
316 computeUnit->getRefCounter(dispatchId, wgId))) {
317 // Are all threads at barrier?
318 return 0;
319 }
320 oldBarrierCnt = barrierCnt;
321 stalledAtBarrier = false;
322 }
323
324 // Read instruction
325 GPUDynInstPtr ii = instructionBuffer.front();
326
327 bool ready_inst M5_VAR_USED = false;
328 bool glbMemBusRdy = false;
329 bool glbMemIssueRdy = false;
330 if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
331 for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
332 if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
333 glbMemBusRdy = true;
334 if (computeUnit->wfWait[j].prerdy())
335 glbMemIssueRdy = true;
336 }
337 }
338 bool locMemBusRdy = false;
339 bool locMemIssueRdy = false;
340 if (type == I_SHARED || type == I_FLAT) {
341 for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
342 if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
343 locMemBusRdy = true;
344 if (computeUnit->wfWait[j].prerdy())
345 locMemIssueRdy = true;
346 }
347 }
348
349 // The following code is very error prone and the entire process for
350 // checking readiness will be fixed eventually. In the meantime, let's
351 // make sure that we do not silently let an instruction type slip
352 // through this logic and always return not ready.
353 if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
354 ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
355 ii->isMemFence() || ii->isFlat())) {
356 panic("next instruction: %s is of unknown type\n", ii->disassemble());
357 }
358
359 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
360 computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
361
362 if (type == I_ALU && ii->isBarrier()) {
363 // Here for ALU instruction (barrier)
364 if (!computeUnit->wfWait[simdId].prerdy()) {
365 // Is wave slot free?
366 return 0;
367 }
368
369 // Are there in pipe or outstanding memory requests?
370 if ((outstandingReqs + memReqsInPipe) > 0) {
371 return 0;
372 }
373
374 ready_inst = true;
375 } else if (type == I_ALU && ii->isNop()) {
376 // Here for ALU instruction (nop)
377 if (!computeUnit->wfWait[simdId].prerdy()) {
378 // Is wave slot free?
379 return 0;
380 }
381
382 ready_inst = true;
383 } else if (type == I_ALU && ii->isReturn()) {
384 // Here for ALU instruction (return)
385 if (!computeUnit->wfWait[simdId].prerdy()) {
386 // Is wave slot free?
387 return 0;
388 }
389
390 // Are there in pipe or outstanding memory requests?
391 if ((outstandingReqs + memReqsInPipe) > 0) {
392 return 0;
393 }
394
395 ready_inst = true;
396 } else if (type == I_ALU && (ii->isBranch() ||
397 ii->isALU() ||
398 (ii->isKernArgSeg() && ii->isLoad()) ||
399 ii->isArgSeg())) {
400 // Here for ALU instruction (all others)
401 if (!computeUnit->wfWait[simdId].prerdy()) {
402 // Is alu slot free?
403 return 0;
404 }
405 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
406 VrfAccessType::RD_WR)) {
407 return 0;
408 }
409
410 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
411 return 0;
412 }
413 ready_inst = true;
414 } else if (type == I_GLOBAL && ii->isGlobalMem()) {
415 // Here Global memory instruction
416 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
417 // Are there in pipe or outstanding global memory write requests?
418 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
419 return 0;
420 }
421 }
422
423 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
424 // Are there in pipe or outstanding global memory read requests?
425 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
426 return 0;
427 }
428
429 if (!glbMemIssueRdy) {
430 // Is WV issue slot free?
431 return 0;
432 }
433
434 if (!glbMemBusRdy) {
435 // Is there an available VRF->Global memory read bus?
436 return 0;
437 }
438
439 if (!computeUnit->globalMemoryPipe.
440 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
441 // Can we insert a new request to the Global Mem Request FIFO?
442 return 0;
443 }
444 // can we schedule source & destination operands on the VRF?
445 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
446 VrfAccessType::RD_WR)) {
447 return 0;
448 }
449 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
450 return 0;
451 }
452 ready_inst = true;
453 } else if (type == I_SHARED && ii->isLocalMem()) {
454 // Here for Shared memory instruction
455 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
456 if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
457 return 0;
458 }
459 }
460
461 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
462 if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
463 return 0;
464 }
465 }
466
467 if (!locMemBusRdy) {
468 // Is there an available VRF->LDS read bus?
469 return 0;
470 }
471 if (!locMemIssueRdy) {
472 // Is wave slot free?
473 return 0;
474 }
475
476 if (!computeUnit->localMemoryPipe.
477 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
478 // Can we insert a new request to the LDS Request FIFO?
479 return 0;
480 }
481 // can we schedule source & destination operands on the VRF?
482 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
483 VrfAccessType::RD_WR)) {
484 return 0;
485 }
486 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
487 return 0;
488 }
489 ready_inst = true;
490 } else if (type == I_FLAT && ii->isFlat()) {
491 if (!glbMemBusRdy) {
492 // Is there an available VRF->Global memory read bus?
493 return 0;
494 }
495
496 if (!locMemBusRdy) {
497 // Is there an available VRF->LDS read bus?
498 return 0;
499 }
500
501 if (!glbMemIssueRdy) {
502 // Is wave slot free?
503 return 0;
504 }
505
506 if (!locMemIssueRdy) {
507 return 0;
508 }
509 if (!computeUnit->globalMemoryPipe.
510 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
511 // Can we insert a new request to the Global Mem Request FIFO?
512 return 0;
513 }
514
515 if (!computeUnit->localMemoryPipe.
516 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
517 // Can we insert a new request to the LDS Request FIFO?
518 return 0;
519 }
520 // can we schedule source & destination operands on the VRF?
521 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
522 VrfAccessType::RD_WR)) {
523 return 0;
524 }
525 // are all the operands ready? (RAW, WAW and WAR depedencies met?)
526 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
527 return 0;
528 }
529 ready_inst = true;
530 } else {
531 return 0;
532 }
533
534 assert(ready_inst);
535
536 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
537 simdId, wfSlotId, ii->disassemble());
538 return 1;
539}
540
541void
542Wavefront::updateResources()
543{
544 // Get current instruction
545 GPUDynInstPtr ii = instructionBuffer.front();
546 assert(ii);
547 computeUnit->vrf[simdId]->updateResources(this, ii);
548 // Single precision ALU or Branch or Return or Special instruction
549 if (ii->isALU() || ii->isSpecialOp() ||
550 ii->isBranch() ||
551 // FIXME: Kernel argument loads are currently treated as ALU operations
552 // since we don't send memory packets at execution. If we fix that then
553 // we should map them to one of the memory pipelines
554 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
555 ii->isReturn()) {
556 computeUnit->aluPipe[simdId].preset(computeUnit->shader->
557 ticks(computeUnit->spBypassLength()));
558 // this is to enforce a fixed number of cycles per issue slot per SIMD
559 computeUnit->wfWait[simdId].preset(computeUnit->shader->
560 ticks(computeUnit->issuePeriod));
561 } else if (ii->isBarrier()) {
562 computeUnit->wfWait[simdId].preset(computeUnit->shader->
563 ticks(computeUnit->issuePeriod));
564 } else if (ii->isLoad() && ii->isFlat()) {
565 assert(Enums::SC_NONE != ii->executedAs());
566 memReqsInPipe++;
567 rdGmReqsInPipe++;
568 if ( Enums::SC_SHARED == ii->executedAs() ) {
569 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
570 preset(computeUnit->shader->ticks(4));
571 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
572 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
573 } else {
574 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
575 preset(computeUnit->shader->ticks(4));
576 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
577 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
578 }
579 } else if (ii->isStore() && ii->isFlat()) {
580 assert(Enums::SC_NONE != ii->executedAs());
581 memReqsInPipe++;
582 wrGmReqsInPipe++;
583 if (Enums::SC_SHARED == ii->executedAs()) {
584 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
585 preset(computeUnit->shader->ticks(8));
586 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
587 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
588 } else {
589 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
590 preset(computeUnit->shader->ticks(8));
591 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
592 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
593 }
594 } else if (ii->isLoad() && ii->isGlobalMem()) {
595 memReqsInPipe++;
596 rdGmReqsInPipe++;
597 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
598 preset(computeUnit->shader->ticks(4));
599 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
600 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
601 } else if (ii->isStore() && ii->isGlobalMem()) {
602 memReqsInPipe++;
603 wrGmReqsInPipe++;
604 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
605 preset(computeUnit->shader->ticks(8));
606 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
607 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
608 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
609 memReqsInPipe++;
610 wrGmReqsInPipe++;
611 rdGmReqsInPipe++;
612 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
613 preset(computeUnit->shader->ticks(8));
614 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
615 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
616 } else if (ii->isLoad() && ii->isLocalMem()) {
617 memReqsInPipe++;
618 rdLmReqsInPipe++;
619 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
620 preset(computeUnit->shader->ticks(4));
621 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
622 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
623 } else if (ii->isStore() && ii->isLocalMem()) {
624 memReqsInPipe++;
625 wrLmReqsInPipe++;
626 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
627 preset(computeUnit->shader->ticks(8));
628 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
629 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
630 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
631 memReqsInPipe++;
632 wrLmReqsInPipe++;
633 rdLmReqsInPipe++;
634 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
635 preset(computeUnit->shader->ticks(8));
636 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
637 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
638 }
639}
640
641void
642Wavefront::exec()
643{
644 // ---- Exit if wavefront is inactive ----------------------------- //
645
646 if (status == S_STOPPED || status == S_RETURNING ||
647 instructionBuffer.empty()) {
648 return;
649 }
650
651 // Get current instruction
652
653 GPUDynInstPtr ii = instructionBuffer.front();
654
655 const uint32_t old_pc = pc();
656 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
657 "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
658 ii->disassemble(), old_pc);
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#include "gpu-compute/wavefront.hh"
37
38#include "debug/GPUExec.hh"
39#include "debug/WavefrontStack.hh"
40#include "gpu-compute/compute_unit.hh"
41#include "gpu-compute/gpu_dyn_inst.hh"
42#include "gpu-compute/shader.hh"
43#include "gpu-compute/vector_register_file.hh"
44
45Wavefront*
46WavefrontParams::create()
47{
48 return new Wavefront(this);
49}
50
51Wavefront::Wavefront(const Params *p)
52 : SimObject(p), callArgMem(nullptr)
53{
54 lastTrace = 0;
55 simdId = p->simdId;
56 wfSlotId = p->wf_slot_id;
57 status = S_STOPPED;
58 reservedVectorRegs = 0;
59 startVgprIndex = 0;
60 outstandingReqs = 0;
61 memReqsInPipe = 0;
62 outstandingReqsWrGm = 0;
63 outstandingReqsWrLm = 0;
64 outstandingReqsRdGm = 0;
65 outstandingReqsRdLm = 0;
66 rdLmReqsInPipe = 0;
67 rdGmReqsInPipe = 0;
68 wrLmReqsInPipe = 0;
69 wrGmReqsInPipe = 0;
70
71 barrierCnt = 0;
72 oldBarrierCnt = 0;
73 stalledAtBarrier = false;
74
75 memTraceBusy = 0;
76 oldVgprTcnt = 0xffffffffffffffffll;
77 oldDgprTcnt = 0xffffffffffffffffll;
78 oldVgpr.resize(p->wfSize);
79
80 pendingFetch = false;
81 dropFetch = false;
82 condRegState = new ConditionRegisterState();
83 maxSpVgprs = 0;
84 maxDpVgprs = 0;
85 lastAddr.resize(p->wfSize);
86 workItemFlatId.resize(p->wfSize);
87 oldDgpr.resize(p->wfSize);
88 barCnt.resize(p->wfSize);
89 for (int i = 0; i < 3; ++i) {
90 workItemId[i].resize(p->wfSize);
91 }
92}
93
94void
95Wavefront::regStats()
96{
97 SimObject::regStats();
98
99 srcRegOpDist
100 .init(0, 4, 2)
101 .name(name() + ".src_reg_operand_dist")
102 .desc("number of executed instructions with N source register operands")
103 ;
104
105 dstRegOpDist
106 .init(0, 3, 2)
107 .name(name() + ".dst_reg_operand_dist")
108 .desc("number of executed instructions with N destination register "
109 "operands")
110 ;
111
112 // FIXME: the name of the WF needs to be unique
113 numTimesBlockedDueWAXDependencies
114 .name(name() + ".timesBlockedDueWAXDependencies")
115 .desc("number of times the wf's instructions are blocked due to WAW "
116 "or WAR dependencies")
117 ;
118
119 // FIXME: the name of the WF needs to be unique
120 numTimesBlockedDueRAWDependencies
121 .name(name() + ".timesBlockedDueRAWDependencies")
122 .desc("number of times the wf's instructions are blocked due to RAW "
123 "dependencies")
124 ;
125
126 // FIXME: the name of the WF needs to be unique
127 numTimesBlockedDueVrfPortAvail
128 .name(name() + ".timesBlockedDueVrfPortAvail")
129 .desc("number of times instructions are blocked due to VRF port "
130 "availability")
131 ;
132}
133
134void
135Wavefront::init()
136{
137 reservedVectorRegs = 0;
138 startVgprIndex = 0;
139}
140
141void
142Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
143{
144 condRegState->init(num_cregs);
145 maxSpVgprs = num_sregs;
146 maxDpVgprs = num_dregs;
147}
148
149Wavefront::~Wavefront()
150{
151 if (callArgMem)
152 delete callArgMem;
153 delete condRegState;
154}
155
156void
157Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
158{
159 wfDynId = _wf_dyn_id;
160 basePtr = _base_ptr;
161 status = S_RUNNING;
162}
163
164bool
165Wavefront::isGmInstruction(GPUDynInstPtr ii)
166{
167 if (ii->isGlobalMem() || ii->isFlat())
168 return true;
169
170 return false;
171}
172
173bool
174Wavefront::isLmInstruction(GPUDynInstPtr ii)
175{
176 if (ii->isLocalMem()) {
177 return true;
178 }
179
180 return false;
181}
182
183bool
184Wavefront::isOldestInstALU()
185{
186 assert(!instructionBuffer.empty());
187 GPUDynInstPtr ii = instructionBuffer.front();
188
189 if (status != S_STOPPED && (ii->isNop() ||
190 ii->isReturn() || ii->isBranch() ||
191 ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
192 return true;
193 }
194
195 return false;
196}
197
198bool
199Wavefront::isOldestInstBarrier()
200{
201 assert(!instructionBuffer.empty());
202 GPUDynInstPtr ii = instructionBuffer.front();
203
204 if (status != S_STOPPED && ii->isBarrier()) {
205 return true;
206 }
207
208 return false;
209}
210
211bool
212Wavefront::isOldestInstGMem()
213{
214 assert(!instructionBuffer.empty());
215 GPUDynInstPtr ii = instructionBuffer.front();
216
217 if (status != S_STOPPED && ii->isGlobalMem()) {
218 return true;
219 }
220
221 return false;
222}
223
224bool
225Wavefront::isOldestInstLMem()
226{
227 assert(!instructionBuffer.empty());
228 GPUDynInstPtr ii = instructionBuffer.front();
229
230 if (status != S_STOPPED && ii->isLocalMem()) {
231 return true;
232 }
233
234 return false;
235}
236
237bool
238Wavefront::isOldestInstPrivMem()
239{
240 assert(!instructionBuffer.empty());
241 GPUDynInstPtr ii = instructionBuffer.front();
242
243 if (status != S_STOPPED && ii->isPrivateSeg()) {
244 return true;
245 }
246
247 return false;
248}
249
250bool
251Wavefront::isOldestInstFlatMem()
252{
253 assert(!instructionBuffer.empty());
254 GPUDynInstPtr ii = instructionBuffer.front();
255
256 if (status != S_STOPPED && ii->isFlat()) {
257 return true;
258 }
259
260 return false;
261}
262
263// Return true if the Wavefront's instruction
264// buffer has branch instruction.
265bool
266Wavefront::instructionBufferHasBranch()
267{
268 for (auto it : instructionBuffer) {
269 GPUDynInstPtr ii = it;
270
271 if (ii->isReturn() || ii->isBranch()) {
272 return true;
273 }
274 }
275
276 return false;
277}
278
279// Remap HSAIL register to physical VGPR.
280// HSAIL register = virtual register assigned to an operand by HLC compiler
281uint32_t
282Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
283{
284 assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
285 // add the offset from where the VGPRs of the wavefront have been assigned
286 uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
287 // HSAIL double precision (DP) register: calculate the physical VGPR index
288 // assuming that DP registers are placed after SP ones in the VRF. The DP
289 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
290 // the DP VGPR index before mapping it to the physical VRF address space
291 if (mode == 1 && size > 4) {
292 physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
293 }
294
295 assert((startVgprIndex <= physicalVgprIndex) &&
296 (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
297
298 // calculate absolute physical VGPR index
299 return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
300}
301
302// Return true if this wavefront is ready
303// to execute an instruction of the specified type.
304int
305Wavefront::ready(itype_e type)
306{
307 // Check to make sure wave is running
308 if (status == S_STOPPED || status == S_RETURNING ||
309 instructionBuffer.empty()) {
310 return 0;
311 }
312
313 // Is the wave waiting at a barrier
314 if (stalledAtBarrier) {
315 if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
316 computeUnit->getRefCounter(dispatchId, wgId))) {
317 // Are all threads at barrier?
318 return 0;
319 }
320 oldBarrierCnt = barrierCnt;
321 stalledAtBarrier = false;
322 }
323
324 // Read instruction
325 GPUDynInstPtr ii = instructionBuffer.front();
326
327 bool ready_inst M5_VAR_USED = false;
328 bool glbMemBusRdy = false;
329 bool glbMemIssueRdy = false;
330 if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
331 for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
332 if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
333 glbMemBusRdy = true;
334 if (computeUnit->wfWait[j].prerdy())
335 glbMemIssueRdy = true;
336 }
337 }
338 bool locMemBusRdy = false;
339 bool locMemIssueRdy = false;
340 if (type == I_SHARED || type == I_FLAT) {
341 for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
342 if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
343 locMemBusRdy = true;
344 if (computeUnit->wfWait[j].prerdy())
345 locMemIssueRdy = true;
346 }
347 }
348
349 // The following code is very error prone and the entire process for
350 // checking readiness will be fixed eventually. In the meantime, let's
351 // make sure that we do not silently let an instruction type slip
352 // through this logic and always return not ready.
353 if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
354 ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
355 ii->isMemFence() || ii->isFlat())) {
356 panic("next instruction: %s is of unknown type\n", ii->disassemble());
357 }
358
359 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
360 computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
361
362 if (type == I_ALU && ii->isBarrier()) {
363 // Here for ALU instruction (barrier)
364 if (!computeUnit->wfWait[simdId].prerdy()) {
365 // Is wave slot free?
366 return 0;
367 }
368
369 // Are there in pipe or outstanding memory requests?
370 if ((outstandingReqs + memReqsInPipe) > 0) {
371 return 0;
372 }
373
374 ready_inst = true;
375 } else if (type == I_ALU && ii->isNop()) {
376 // Here for ALU instruction (nop)
377 if (!computeUnit->wfWait[simdId].prerdy()) {
378 // Is wave slot free?
379 return 0;
380 }
381
382 ready_inst = true;
383 } else if (type == I_ALU && ii->isReturn()) {
384 // Here for ALU instruction (return)
385 if (!computeUnit->wfWait[simdId].prerdy()) {
386 // Is wave slot free?
387 return 0;
388 }
389
390 // Are there in pipe or outstanding memory requests?
391 if ((outstandingReqs + memReqsInPipe) > 0) {
392 return 0;
393 }
394
395 ready_inst = true;
396 } else if (type == I_ALU && (ii->isBranch() ||
397 ii->isALU() ||
398 (ii->isKernArgSeg() && ii->isLoad()) ||
399 ii->isArgSeg())) {
400 // Here for ALU instruction (all others)
401 if (!computeUnit->wfWait[simdId].prerdy()) {
402 // Is alu slot free?
403 return 0;
404 }
405 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
406 VrfAccessType::RD_WR)) {
407 return 0;
408 }
409
410 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
411 return 0;
412 }
413 ready_inst = true;
414 } else if (type == I_GLOBAL && ii->isGlobalMem()) {
415 // Here Global memory instruction
416 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
417 // Are there in pipe or outstanding global memory write requests?
418 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
419 return 0;
420 }
421 }
422
423 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
424 // Are there in pipe or outstanding global memory read requests?
425 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
426 return 0;
427 }
428
429 if (!glbMemIssueRdy) {
430 // Is WV issue slot free?
431 return 0;
432 }
433
434 if (!glbMemBusRdy) {
435 // Is there an available VRF->Global memory read bus?
436 return 0;
437 }
438
439 if (!computeUnit->globalMemoryPipe.
440 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
441 // Can we insert a new request to the Global Mem Request FIFO?
442 return 0;
443 }
444 // can we schedule source & destination operands on the VRF?
445 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
446 VrfAccessType::RD_WR)) {
447 return 0;
448 }
449 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
450 return 0;
451 }
452 ready_inst = true;
453 } else if (type == I_SHARED && ii->isLocalMem()) {
454 // Here for Shared memory instruction
455 if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
456 if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
457 return 0;
458 }
459 }
460
461 if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
462 if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
463 return 0;
464 }
465 }
466
467 if (!locMemBusRdy) {
468 // Is there an available VRF->LDS read bus?
469 return 0;
470 }
471 if (!locMemIssueRdy) {
472 // Is wave slot free?
473 return 0;
474 }
475
476 if (!computeUnit->localMemoryPipe.
477 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
478 // Can we insert a new request to the LDS Request FIFO?
479 return 0;
480 }
481 // can we schedule source & destination operands on the VRF?
482 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
483 VrfAccessType::RD_WR)) {
484 return 0;
485 }
486 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
487 return 0;
488 }
489 ready_inst = true;
490 } else if (type == I_FLAT && ii->isFlat()) {
491 if (!glbMemBusRdy) {
492 // Is there an available VRF->Global memory read bus?
493 return 0;
494 }
495
496 if (!locMemBusRdy) {
497 // Is there an available VRF->LDS read bus?
498 return 0;
499 }
500
501 if (!glbMemIssueRdy) {
502 // Is wave slot free?
503 return 0;
504 }
505
506 if (!locMemIssueRdy) {
507 return 0;
508 }
509 if (!computeUnit->globalMemoryPipe.
510 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
511 // Can we insert a new request to the Global Mem Request FIFO?
512 return 0;
513 }
514
515 if (!computeUnit->localMemoryPipe.
516 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
517 // Can we insert a new request to the LDS Request FIFO?
518 return 0;
519 }
520 // can we schedule source & destination operands on the VRF?
521 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
522 VrfAccessType::RD_WR)) {
523 return 0;
524 }
525 // are all the operands ready? (RAW, WAW and WAR depedencies met?)
526 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
527 return 0;
528 }
529 ready_inst = true;
530 } else {
531 return 0;
532 }
533
534 assert(ready_inst);
535
536 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
537 simdId, wfSlotId, ii->disassemble());
538 return 1;
539}
540
541void
542Wavefront::updateResources()
543{
544 // Get current instruction
545 GPUDynInstPtr ii = instructionBuffer.front();
546 assert(ii);
547 computeUnit->vrf[simdId]->updateResources(this, ii);
548 // Single precision ALU or Branch or Return or Special instruction
549 if (ii->isALU() || ii->isSpecialOp() ||
550 ii->isBranch() ||
551 // FIXME: Kernel argument loads are currently treated as ALU operations
552 // since we don't send memory packets at execution. If we fix that then
553 // we should map them to one of the memory pipelines
554 (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
555 ii->isReturn()) {
556 computeUnit->aluPipe[simdId].preset(computeUnit->shader->
557 ticks(computeUnit->spBypassLength()));
558 // this is to enforce a fixed number of cycles per issue slot per SIMD
559 computeUnit->wfWait[simdId].preset(computeUnit->shader->
560 ticks(computeUnit->issuePeriod));
561 } else if (ii->isBarrier()) {
562 computeUnit->wfWait[simdId].preset(computeUnit->shader->
563 ticks(computeUnit->issuePeriod));
564 } else if (ii->isLoad() && ii->isFlat()) {
565 assert(Enums::SC_NONE != ii->executedAs());
566 memReqsInPipe++;
567 rdGmReqsInPipe++;
568 if ( Enums::SC_SHARED == ii->executedAs() ) {
569 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
570 preset(computeUnit->shader->ticks(4));
571 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
572 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
573 } else {
574 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
575 preset(computeUnit->shader->ticks(4));
576 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
577 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
578 }
579 } else if (ii->isStore() && ii->isFlat()) {
580 assert(Enums::SC_NONE != ii->executedAs());
581 memReqsInPipe++;
582 wrGmReqsInPipe++;
583 if (Enums::SC_SHARED == ii->executedAs()) {
584 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
585 preset(computeUnit->shader->ticks(8));
586 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
587 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
588 } else {
589 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
590 preset(computeUnit->shader->ticks(8));
591 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
592 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
593 }
594 } else if (ii->isLoad() && ii->isGlobalMem()) {
595 memReqsInPipe++;
596 rdGmReqsInPipe++;
597 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
598 preset(computeUnit->shader->ticks(4));
599 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
600 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
601 } else if (ii->isStore() && ii->isGlobalMem()) {
602 memReqsInPipe++;
603 wrGmReqsInPipe++;
604 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
605 preset(computeUnit->shader->ticks(8));
606 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
607 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
608 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
609 memReqsInPipe++;
610 wrGmReqsInPipe++;
611 rdGmReqsInPipe++;
612 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
613 preset(computeUnit->shader->ticks(8));
614 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
615 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
616 } else if (ii->isLoad() && ii->isLocalMem()) {
617 memReqsInPipe++;
618 rdLmReqsInPipe++;
619 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
620 preset(computeUnit->shader->ticks(4));
621 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
622 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
623 } else if (ii->isStore() && ii->isLocalMem()) {
624 memReqsInPipe++;
625 wrLmReqsInPipe++;
626 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
627 preset(computeUnit->shader->ticks(8));
628 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
629 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
630 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
631 memReqsInPipe++;
632 wrLmReqsInPipe++;
633 rdLmReqsInPipe++;
634 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
635 preset(computeUnit->shader->ticks(8));
636 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
637 preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
638 }
639}
640
641void
642Wavefront::exec()
643{
644 // ---- Exit if wavefront is inactive ----------------------------- //
645
646 if (status == S_STOPPED || status == S_RETURNING ||
647 instructionBuffer.empty()) {
648 return;
649 }
650
651 // Get current instruction
652
653 GPUDynInstPtr ii = instructionBuffer.front();
654
655 const uint32_t old_pc = pc();
656 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
657 "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
658 ii->disassemble(), old_pc);
659
660 // update the instruction stats in the CU
661
659 ii->execute(ii);
662 ii->execute(ii);
663 computeUnit->updateInstStats(ii);
660 // access the VRF
661 computeUnit->vrf[simdId]->exec(ii, this);
662 srcRegOpDist.sample(ii->numSrcRegOperands());
663 dstRegOpDist.sample(ii->numDstRegOperands());
664 computeUnit->numInstrExecuted++;
665 computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
666 computeUnit->lastExecCycle[simdId]);
667 computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
668 if (pc() == old_pc) {
669 uint32_t new_pc = old_pc + 1;
670 // PC not modified by instruction, proceed to next or pop frame
671 pc(new_pc);
672 if (new_pc == rpc()) {
673 popFromReconvergenceStack();
674 discardFetch();
675 } else {
676 instructionBuffer.pop_front();
677 }
678 } else {
679 discardFetch();
680 }
681
682 if (computeUnit->shader->hsail_mode==Shader::SIMT) {
683 const int num_active_lanes = execMask().count();
684 computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
685 computeUnit->numVecOpsExecuted += num_active_lanes;
686 if (isGmInstruction(ii)) {
687 computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
688 } else if (isLmInstruction(ii)) {
689 computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
690 }
691 }
692
693 // ---- Update Vector ALU pipeline and other resources ------------------ //
694 // Single precision ALU or Branch or Return or Special instruction
695 if (ii->isALU() || ii->isSpecialOp() ||
696 ii->isBranch() ||
697 // FIXME: Kernel argument loads are currently treated as ALU operations
698 // since we don't send memory packets at execution. If we fix that then
699 // we should map them to one of the memory pipelines
700 (ii->isKernArgSeg() && ii->isLoad()) ||
701 ii->isArgSeg() ||
702 ii->isReturn()) {
703 computeUnit->aluPipe[simdId].set(computeUnit->shader->
704 ticks(computeUnit->spBypassLength()));
705
706 // this is to enforce a fixed number of cycles per issue slot per SIMD
707 computeUnit->wfWait[simdId].set(computeUnit->shader->
708 ticks(computeUnit->issuePeriod));
709 } else if (ii->isBarrier()) {
710 computeUnit->wfWait[simdId].set(computeUnit->shader->
711 ticks(computeUnit->issuePeriod));
712 } else if (ii->isLoad() && ii->isFlat()) {
713 assert(Enums::SC_NONE != ii->executedAs());
714
715 if (Enums::SC_SHARED == ii->executedAs()) {
716 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
717 set(computeUnit->shader->ticks(4));
718 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
719 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
720 } else {
721 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
722 set(computeUnit->shader->ticks(4));
723 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
724 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
725 }
726 } else if (ii->isStore() && ii->isFlat()) {
727 assert(Enums::SC_NONE != ii->executedAs());
728 if (Enums::SC_SHARED == ii->executedAs()) {
729 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
730 set(computeUnit->shader->ticks(8));
731 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
732 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
733 } else {
734 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
735 set(computeUnit->shader->ticks(8));
736 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
737 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
738 }
739 } else if (ii->isLoad() && ii->isGlobalMem()) {
740 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
741 set(computeUnit->shader->ticks(4));
742 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
743 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
744 } else if (ii->isStore() && ii->isGlobalMem()) {
745 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
746 set(computeUnit->shader->ticks(8));
747 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
748 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
749 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
750 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
751 set(computeUnit->shader->ticks(8));
752 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
753 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
754 } else if (ii->isLoad() && ii->isLocalMem()) {
755 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
756 set(computeUnit->shader->ticks(4));
757 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
758 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
759 } else if (ii->isStore() && ii->isLocalMem()) {
760 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
761 set(computeUnit->shader->ticks(8));
762 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
763 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
764 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
765 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
766 set(computeUnit->shader->ticks(8));
767 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
768 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
769 }
770}
771
772bool
773Wavefront::waitingAtBarrier(int lane)
774{
775 return barCnt[lane] < maxBarCnt;
776}
777
778void
779Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
780 const VectorMask& mask)
781{
782 assert(mask.count());
783 reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
784}
785
786void
787Wavefront::popFromReconvergenceStack()
788{
789 assert(!reconvergenceStack.empty());
790
791 DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
792 computeUnit->cu_id, simdId, wfSlotId, wfDynId,
793 execMask().to_string<char, std::string::traits_type,
794 std::string::allocator_type>().c_str(), pc());
795
796 reconvergenceStack.pop_back();
797
798 DPRINTF(WavefrontStack, "%3i %s\n", pc(),
799 execMask().to_string<char, std::string::traits_type,
800 std::string::allocator_type>().c_str());
801
802}
803
804void
805Wavefront::discardFetch()
806{
807 instructionBuffer.clear();
808 dropFetch |=pendingFetch;
809}
810
811uint32_t
812Wavefront::pc() const
813{
814 return reconvergenceStack.back()->pc;
815}
816
817uint32_t
818Wavefront::rpc() const
819{
820 return reconvergenceStack.back()->rpc;
821}
822
823VectorMask
824Wavefront::execMask() const
825{
826 return reconvergenceStack.back()->execMask;
827}
828
829bool
830Wavefront::execMask(int lane) const
831{
832 return reconvergenceStack.back()->execMask[lane];
833}
834
835
836void
837Wavefront::pc(uint32_t new_pc)
838{
839 reconvergenceStack.back()->pc = new_pc;
840}
841
842uint32_t
843Wavefront::getStaticContextSize() const
844{
845 return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
846 sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
847 sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
848 sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
849 computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
850}
851
852void
853Wavefront::getContext(const void *out)
854{
855 uint8_t *iter = (uint8_t *)out;
856 for (int i = 0; i < barCnt.size(); i++) {
857 *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
858 }
859 *(int *)iter = wfId; iter += sizeof(wfId);
860 *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
861 *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
862 *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
863 *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
864 *(uint32_t *)iter = wgId; iter += sizeof(wgId);
865 *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
866 *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
867 *(Addr *)iter = privBase; iter += sizeof(privBase);
868 *(Addr *)iter = spillBase; iter += sizeof(spillBase);
869
870 int stackSize = reconvergenceStack.size();
871 ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
872 std::numeric_limits<uint32_t>::max(),
873 std::numeric_limits<uint64_t>::max()};
874 for (int i = 0; i < workItemId[0].size(); i++) {
875 if (i < stackSize) {
876 *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
877 iter += sizeof(ReconvergenceStackEntry);
878 reconvergenceStack.pop_back();
879 } else {
880 *(ReconvergenceStackEntry *)iter = empty;
881 iter += sizeof(ReconvergenceStackEntry);
882 }
883 }
884
885 int wf_size = computeUnit->wfSize();
886 for (int i = 0; i < maxSpVgprs; i++) {
887 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
888 for (int lane = 0; lane < wf_size; lane++) {
889 uint32_t regVal = computeUnit->vrf[simdId]->
890 read<uint32_t>(vgprIdx,lane);
891 *(uint32_t *)iter = regVal; iter += sizeof(regVal);
892 }
893 }
894
895 for (int i = 0; i < maxDpVgprs; i++) {
896 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
897 for (int lane = 0; lane < wf_size; lane++) {
898 uint64_t regVal = computeUnit->vrf[simdId]->
899 read<uint64_t>(vgprIdx,lane);
900 *(uint64_t *)iter = regVal; iter += sizeof(regVal);
901 }
902 }
903
904 for (int i = 0; i < condRegState->numRegs(); i++) {
905 for (int lane = 0; lane < wf_size; lane++) {
906 uint64_t regVal = condRegState->read<uint64_t>(i, lane);
907 *(uint64_t *)iter = regVal; iter += sizeof(regVal);
908 }
909 }
910
911 /* saving LDS content */
912 if (ldsChunk)
913 for (int i = 0; i < ldsChunk->size(); i++) {
914 char val = ldsChunk->read<char>(i);
915 *(char *) iter = val; iter += sizeof(val);
916 }
917}
918
919void
920Wavefront::setContext(const void *in)
921{
922 uint8_t *iter = (uint8_t *)in;
923 for (int i = 0; i < barCnt.size(); i++) {
924 barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
925 }
926 wfId = *(int *)iter; iter += sizeof(wfId);
927 maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
928 oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
929 barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
930 computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
931 wgId = *(uint32_t *)iter; iter += sizeof(wgId);
932 barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
933 initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
934 privBase = *(Addr *)iter; iter += sizeof(privBase);
935 spillBase = *(Addr *)iter; iter += sizeof(spillBase);
936
937 for (int i = 0; i < workItemId[0].size(); i++) {
938 ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
939 iter += sizeof(ReconvergenceStackEntry);
940 if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
941 pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
942 newEntry.execMask);
943 }
944 }
945 int wf_size = computeUnit->wfSize();
946
947 for (int i = 0; i < maxSpVgprs; i++) {
948 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
949 for (int lane = 0; lane < wf_size; lane++) {
950 uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
951 computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
952 }
953 }
954
955 for (int i = 0; i < maxDpVgprs; i++) {
956 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
957 for (int lane = 0; lane < wf_size; lane++) {
958 uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
959 computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
960 }
961 }
962
963 for (int i = 0; i < condRegState->numRegs(); i++) {
964 for (int lane = 0; lane < wf_size; lane++) {
965 uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
966 condRegState->write<uint64_t>(i, lane, regVal);
967 }
968 }
969 /** Restoring LDS contents */
970 if (ldsChunk)
971 for (int i = 0; i < ldsChunk->size(); i++) {
972 char val = *(char *) iter; iter += sizeof(val);
973 ldsChunk->write<char>(i, val);
974 }
975}
976
977void
978Wavefront::computeActualWgSz(NDRange *ndr)
979{
980 actualWgSzTotal = 1;
981 for (int d = 0; d < 3; ++d) {
982 actualWgSz[d] = std::min(workGroupSz[d],
983 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
984 actualWgSzTotal *= actualWgSz[d];
985 }
986}
664 // access the VRF
665 computeUnit->vrf[simdId]->exec(ii, this);
666 srcRegOpDist.sample(ii->numSrcRegOperands());
667 dstRegOpDist.sample(ii->numDstRegOperands());
668 computeUnit->numInstrExecuted++;
669 computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
670 computeUnit->lastExecCycle[simdId]);
671 computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
672 if (pc() == old_pc) {
673 uint32_t new_pc = old_pc + 1;
674 // PC not modified by instruction, proceed to next or pop frame
675 pc(new_pc);
676 if (new_pc == rpc()) {
677 popFromReconvergenceStack();
678 discardFetch();
679 } else {
680 instructionBuffer.pop_front();
681 }
682 } else {
683 discardFetch();
684 }
685
686 if (computeUnit->shader->hsail_mode==Shader::SIMT) {
687 const int num_active_lanes = execMask().count();
688 computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
689 computeUnit->numVecOpsExecuted += num_active_lanes;
690 if (isGmInstruction(ii)) {
691 computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
692 } else if (isLmInstruction(ii)) {
693 computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
694 }
695 }
696
697 // ---- Update Vector ALU pipeline and other resources ------------------ //
698 // Single precision ALU or Branch or Return or Special instruction
699 if (ii->isALU() || ii->isSpecialOp() ||
700 ii->isBranch() ||
701 // FIXME: Kernel argument loads are currently treated as ALU operations
702 // since we don't send memory packets at execution. If we fix that then
703 // we should map them to one of the memory pipelines
704 (ii->isKernArgSeg() && ii->isLoad()) ||
705 ii->isArgSeg() ||
706 ii->isReturn()) {
707 computeUnit->aluPipe[simdId].set(computeUnit->shader->
708 ticks(computeUnit->spBypassLength()));
709
710 // this is to enforce a fixed number of cycles per issue slot per SIMD
711 computeUnit->wfWait[simdId].set(computeUnit->shader->
712 ticks(computeUnit->issuePeriod));
713 } else if (ii->isBarrier()) {
714 computeUnit->wfWait[simdId].set(computeUnit->shader->
715 ticks(computeUnit->issuePeriod));
716 } else if (ii->isLoad() && ii->isFlat()) {
717 assert(Enums::SC_NONE != ii->executedAs());
718
719 if (Enums::SC_SHARED == ii->executedAs()) {
720 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
721 set(computeUnit->shader->ticks(4));
722 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
723 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
724 } else {
725 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
726 set(computeUnit->shader->ticks(4));
727 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
728 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
729 }
730 } else if (ii->isStore() && ii->isFlat()) {
731 assert(Enums::SC_NONE != ii->executedAs());
732 if (Enums::SC_SHARED == ii->executedAs()) {
733 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
734 set(computeUnit->shader->ticks(8));
735 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
736 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
737 } else {
738 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
739 set(computeUnit->shader->ticks(8));
740 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
741 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
742 }
743 } else if (ii->isLoad() && ii->isGlobalMem()) {
744 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
745 set(computeUnit->shader->ticks(4));
746 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
747 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
748 } else if (ii->isStore() && ii->isGlobalMem()) {
749 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
750 set(computeUnit->shader->ticks(8));
751 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
752 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
753 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
754 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
755 set(computeUnit->shader->ticks(8));
756 computeUnit->wfWait[computeUnit->GlbMemUnitId()].
757 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
758 } else if (ii->isLoad() && ii->isLocalMem()) {
759 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
760 set(computeUnit->shader->ticks(4));
761 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
762 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
763 } else if (ii->isStore() && ii->isLocalMem()) {
764 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
765 set(computeUnit->shader->ticks(8));
766 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
767 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
768 } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
769 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
770 set(computeUnit->shader->ticks(8));
771 computeUnit->wfWait[computeUnit->ShrMemUnitId()].
772 set(computeUnit->shader->ticks(computeUnit->issuePeriod));
773 }
774}
775
776bool
777Wavefront::waitingAtBarrier(int lane)
778{
779 return barCnt[lane] < maxBarCnt;
780}
781
782void
783Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
784 const VectorMask& mask)
785{
786 assert(mask.count());
787 reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
788}
789
790void
791Wavefront::popFromReconvergenceStack()
792{
793 assert(!reconvergenceStack.empty());
794
795 DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
796 computeUnit->cu_id, simdId, wfSlotId, wfDynId,
797 execMask().to_string<char, std::string::traits_type,
798 std::string::allocator_type>().c_str(), pc());
799
800 reconvergenceStack.pop_back();
801
802 DPRINTF(WavefrontStack, "%3i %s\n", pc(),
803 execMask().to_string<char, std::string::traits_type,
804 std::string::allocator_type>().c_str());
805
806}
807
808void
809Wavefront::discardFetch()
810{
811 instructionBuffer.clear();
812 dropFetch |=pendingFetch;
813}
814
815uint32_t
816Wavefront::pc() const
817{
818 return reconvergenceStack.back()->pc;
819}
820
821uint32_t
822Wavefront::rpc() const
823{
824 return reconvergenceStack.back()->rpc;
825}
826
827VectorMask
828Wavefront::execMask() const
829{
830 return reconvergenceStack.back()->execMask;
831}
832
833bool
834Wavefront::execMask(int lane) const
835{
836 return reconvergenceStack.back()->execMask[lane];
837}
838
839
840void
841Wavefront::pc(uint32_t new_pc)
842{
843 reconvergenceStack.back()->pc = new_pc;
844}
845
846uint32_t
847Wavefront::getStaticContextSize() const
848{
849 return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
850 sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
851 sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
852 sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
853 computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
854}
855
856void
857Wavefront::getContext(const void *out)
858{
859 uint8_t *iter = (uint8_t *)out;
860 for (int i = 0; i < barCnt.size(); i++) {
861 *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
862 }
863 *(int *)iter = wfId; iter += sizeof(wfId);
864 *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
865 *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
866 *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
867 *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
868 *(uint32_t *)iter = wgId; iter += sizeof(wgId);
869 *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
870 *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
871 *(Addr *)iter = privBase; iter += sizeof(privBase);
872 *(Addr *)iter = spillBase; iter += sizeof(spillBase);
873
874 int stackSize = reconvergenceStack.size();
875 ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
876 std::numeric_limits<uint32_t>::max(),
877 std::numeric_limits<uint64_t>::max()};
878 for (int i = 0; i < workItemId[0].size(); i++) {
879 if (i < stackSize) {
880 *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
881 iter += sizeof(ReconvergenceStackEntry);
882 reconvergenceStack.pop_back();
883 } else {
884 *(ReconvergenceStackEntry *)iter = empty;
885 iter += sizeof(ReconvergenceStackEntry);
886 }
887 }
888
889 int wf_size = computeUnit->wfSize();
890 for (int i = 0; i < maxSpVgprs; i++) {
891 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
892 for (int lane = 0; lane < wf_size; lane++) {
893 uint32_t regVal = computeUnit->vrf[simdId]->
894 read<uint32_t>(vgprIdx,lane);
895 *(uint32_t *)iter = regVal; iter += sizeof(regVal);
896 }
897 }
898
899 for (int i = 0; i < maxDpVgprs; i++) {
900 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
901 for (int lane = 0; lane < wf_size; lane++) {
902 uint64_t regVal = computeUnit->vrf[simdId]->
903 read<uint64_t>(vgprIdx,lane);
904 *(uint64_t *)iter = regVal; iter += sizeof(regVal);
905 }
906 }
907
908 for (int i = 0; i < condRegState->numRegs(); i++) {
909 for (int lane = 0; lane < wf_size; lane++) {
910 uint64_t regVal = condRegState->read<uint64_t>(i, lane);
911 *(uint64_t *)iter = regVal; iter += sizeof(regVal);
912 }
913 }
914
915 /* saving LDS content */
916 if (ldsChunk)
917 for (int i = 0; i < ldsChunk->size(); i++) {
918 char val = ldsChunk->read<char>(i);
919 *(char *) iter = val; iter += sizeof(val);
920 }
921}
922
923void
924Wavefront::setContext(const void *in)
925{
926 uint8_t *iter = (uint8_t *)in;
927 for (int i = 0; i < barCnt.size(); i++) {
928 barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
929 }
930 wfId = *(int *)iter; iter += sizeof(wfId);
931 maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
932 oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
933 barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
934 computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
935 wgId = *(uint32_t *)iter; iter += sizeof(wgId);
936 barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
937 initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
938 privBase = *(Addr *)iter; iter += sizeof(privBase);
939 spillBase = *(Addr *)iter; iter += sizeof(spillBase);
940
941 for (int i = 0; i < workItemId[0].size(); i++) {
942 ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
943 iter += sizeof(ReconvergenceStackEntry);
944 if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
945 pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
946 newEntry.execMask);
947 }
948 }
949 int wf_size = computeUnit->wfSize();
950
951 for (int i = 0; i < maxSpVgprs; i++) {
952 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
953 for (int lane = 0; lane < wf_size; lane++) {
954 uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
955 computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
956 }
957 }
958
959 for (int i = 0; i < maxDpVgprs; i++) {
960 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
961 for (int lane = 0; lane < wf_size; lane++) {
962 uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
963 computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
964 }
965 }
966
967 for (int i = 0; i < condRegState->numRegs(); i++) {
968 for (int lane = 0; lane < wf_size; lane++) {
969 uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
970 condRegState->write<uint64_t>(i, lane, regVal);
971 }
972 }
973 /** Restoring LDS contents */
974 if (ldsChunk)
975 for (int i = 0; i < ldsChunk->size(); i++) {
976 char val = *(char *) iter; iter += sizeof(val);
977 ldsChunk->write<char>(i, val);
978 }
979}
980
981void
982Wavefront::computeActualWgSz(NDRange *ndr)
983{
984 actualWgSzTotal = 1;
985 for (int d = 0; d < 3; ++d) {
986 actualWgSz[d] = std::min(workGroupSz[d],
987 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
988 actualWgSzTotal *= actualWgSz[d];
989 }
990}