// wavefront.cc (revisions 11644:d426728892fe and 11657:5fad5a37d6fc)
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/code_enums.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr)
{
    lastTrace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}

void
Wavefront::regStats()
{
    SimObject::regStats();

    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register "
              "operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
    delete condRegState;
}

void
Wavefront::start(uint64_t _wf_dyn_id, uint64_t _base_ptr)
{
    wfDynId = _wf_dyn_id;
    basePtr = _base_ptr;
    status = S_RUNNING;
}

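// Note: FLAT accesses are classified as global-memory instructions here,
// since the aperture they actually hit (shared vs. global) is only resolved
// at execution time via executedAs().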
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
        IS_OT_ATOMIC_PM(ii->opType())) {
        return true;
    }

    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
        IS_OT_ATOMIC_GM(ii->opType())) {
        return true;
    }

    if (IS_OT_FLAT(ii->opType())) {
        return true;
    }

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
        IS_OT_ATOMIC_LM(ii->opType())) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
        ii->opType() == Enums::OT_KERN_READ)) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
        IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
        IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
        IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
        return true;
    }

    return false;
}

// Return true if the wavefront's instruction
// buffer contains a branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->opType() == Enums::OT_RET ||
            ii->opType() == Enums::OT_BRANCH) {
            return true;
        }
    }

    return false;
}

// Remap an HSAIL register to a physical VGPR.
// An HSAIL register is a virtual register assigned to an operand by the
// HLC compiler.
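//
// Illustrative example (assumed values, for exposition only): with
// startVgprIndex = 32, maxSpVgprs = 16, and reservedVectorRegs = 48,
// SP register $s3 maps to physical VGPR 32 + 3 = 35, while DP register
// $d3 (mode == 1, size == 8) maps to 32 + 16 + 2 * 3 = 54, before the
// final wrap-around modulo the VRF size.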
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return 1 if this wavefront is ready to issue an instruction
// of the specified type, 0 otherwise.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId, barrierCnt,
                    computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j = 0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j = 0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }
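    // Note: the *BusRdy flags track whether a VRF->memory pipe bus can
    // accept the operand traffic, while the *IssueRdy flags track whether
    // a memory unit's wave slot is free; both must hold for a memory
    // instruction to issue (checked per instruction type below).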

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking readiness of inst: %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is ALU slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) ||
               IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here for global memory instruction
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) ||
               IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for shared (local) memory instruction
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) ||
               IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for private memory instruction
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR dependencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
    return 1;
}

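// updateResources() is called at schedule time to pre-reserve (preset) the
// pipeline resources an instruction will occupy and to bump the in-pipe
// request counters; exec() later commits the same reservations with set().
// The 4- and 8-tick bus occupancies used below for reads vs. writes/atomics
// are hard-coded model parameters, not derived values.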
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

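// exec() retires the oldest instruction in the buffer: it executes the
// instruction, updates execution statistics, advances the PC (popping a
// reconvergence frame when the new PC reaches the current reconvergence
// point, rpc()), and then charges the pipeline resources that
// updateResources() reserved at schedule time.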
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ------------------------------ //
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------- //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

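// A lane is considered to be waiting at the barrier while its per-lane
// barrier count still trails maxBarCnt, i.e. while it has reached the
// barrier fewer times than the most-advanced lane in the wavefront.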
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

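// Reconvergence stack discipline: a frame (new PC, reconvergence PC, and
// active-lane mask) is pushed when control flow diverges, and exec() pops
// the top frame once the advancing PC reaches its RPC, restoring the parent
// frame's execution mask.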
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc,
                                                                mask});
}

void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());
}

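// Flush the instruction buffer; if a fetch is still in flight, flag it so
// the stale response is dropped when it returns rather than being decoded.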
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}

void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

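// getContext() serializes the wavefront's architectural state into the
// caller-provided buffer: scalar bookkeeping first, then the reconvergence
// stack (padded to wfSize entries), then SP and DP VGPRs, condition
// registers, and finally any LDS chunk. setContext() reads the same layout
// back. Note that although the destination buffer arrives as const void*,
// it is written through after a cast.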
void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong();
    iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
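    // Drain the live reconvergence stack top-first, then pad the remaining
    // slots (one per lane, up to wfSize) with sentinel entries whose PC is
    // uint32_t max, so the saved image always has a fixed size.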
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx, lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *)iter = val; iter += sizeof(val);
        }
    }
}

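// setContext() consumes the byte stream produced by getContext() in the
// same order. One caveat worth flagging: getContext() saves stack entries
// top-first and this function re-pushes them in the order read, so a stack
// with more than one live frame would be restored inverted; this appears
// to rely on context switches only occurring with a single live frame.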
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    // advance by the 8 bytes getContext() wrote for the mask, independent
    // of sizeof(initMask)
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(uint64_t);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /* restoring LDS content */
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *)iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
    }
}
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#include "gpu-compute/wavefront.hh"
37
38#include "debug/GPUExec.hh"
39#include "debug/WavefrontStack.hh"
40#include "gpu-compute/code_enums.hh"
41#include "gpu-compute/compute_unit.hh"
42#include "gpu-compute/gpu_dyn_inst.hh"
43#include "gpu-compute/shader.hh"
44#include "gpu-compute/vector_register_file.hh"
45
46Wavefront*
47WavefrontParams::create()
48{
49 return new Wavefront(this);
50}
51
52Wavefront::Wavefront(const Params *p)
53 : SimObject(p), callArgMem(nullptr)
54{
55 lastTrace = 0;
56 simdId = p->simdId;
57 wfSlotId = p->wf_slot_id;
58 status = S_STOPPED;
59 reservedVectorRegs = 0;
60 startVgprIndex = 0;
61 outstandingReqs = 0;
62 memReqsInPipe = 0;
63 outstandingReqsWrGm = 0;
64 outstandingReqsWrLm = 0;
65 outstandingReqsRdGm = 0;
66 outstandingReqsRdLm = 0;
67 rdLmReqsInPipe = 0;
68 rdGmReqsInPipe = 0;
69 wrLmReqsInPipe = 0;
70 wrGmReqsInPipe = 0;
71
72 barrierCnt = 0;
73 oldBarrierCnt = 0;
74 stalledAtBarrier = false;
75
76 memTraceBusy = 0;
77 oldVgprTcnt = 0xffffffffffffffffll;
78 oldDgprTcnt = 0xffffffffffffffffll;
79 oldVgpr.resize(p->wfSize);
80
81 pendingFetch = false;
82 dropFetch = false;
83 condRegState = new ConditionRegisterState();
84 maxSpVgprs = 0;
85 maxDpVgprs = 0;
86 lastAddr.resize(p->wfSize);
87 workItemFlatId.resize(p->wfSize);
88 oldDgpr.resize(p->wfSize);
89 barCnt.resize(p->wfSize);
90 for (int i = 0; i < 3; ++i) {
91 workItemId[i].resize(p->wfSize);
92 }
93}
94
95void
96Wavefront::regStats()
97{
98 SimObject::regStats();
99
100 srcRegOpDist
101 .init(0, 4, 2)
102 .name(name() + ".src_reg_operand_dist")
103 .desc("number of executed instructions with N source register operands")
104 ;
105
106 dstRegOpDist
107 .init(0, 3, 2)
108 .name(name() + ".dst_reg_operand_dist")
109 .desc("number of executed instructions with N destination register "
110 "operands")
111 ;
112
113 // FIXME: the name of the WF needs to be unique
114 numTimesBlockedDueWAXDependencies
115 .name(name() + ".timesBlockedDueWAXDependencies")
116 .desc("number of times the wf's instructions are blocked due to WAW "
117 "or WAR dependencies")
118 ;
119
120 // FIXME: the name of the WF needs to be unique
121 numTimesBlockedDueRAWDependencies
122 .name(name() + ".timesBlockedDueRAWDependencies")
123 .desc("number of times the wf's instructions are blocked due to RAW "
124 "dependencies")
125 ;
126
127 // FIXME: the name of the WF needs to be unique
128 numTimesBlockedDueVrfPortAvail
129 .name(name() + ".timesBlockedDueVrfPortAvail")
130 .desc("number of times instructions are blocked due to VRF port "
131 "availability")
132 ;
133}
134
135void
136Wavefront::init()
137{
138 reservedVectorRegs = 0;
139 startVgprIndex = 0;
140}
141
142void
143Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
144{
145 condRegState->init(num_cregs);
146 maxSpVgprs = num_sregs;
147 maxDpVgprs = num_dregs;
148}
149
150Wavefront::~Wavefront()
151{
152 if (callArgMem)
153 delete callArgMem;
154 delete condRegState;
155}
156
157void
158Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
159{
160 wfDynId = _wf_dyn_id;
161 basePtr = _base_ptr;
162 status = S_RUNNING;
163}
164
165bool
166Wavefront::isGmInstruction(GPUDynInstPtr ii)
167{
168 if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
169 IS_OT_ATOMIC_PM(ii->opType())) {
170 return true;
171 }
172
173 if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
174 IS_OT_ATOMIC_GM(ii->opType())) {
175 return true;
176 }
177
178 if (IS_OT_FLAT(ii->opType())) {
179 return true;
180 }
181
182 return false;
183}
184
185bool
186Wavefront::isLmInstruction(GPUDynInstPtr ii)
187{
188 if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
189 IS_OT_ATOMIC_LM(ii->opType())) {
190 return true;
191 }
192
193 return false;
194}
195
196bool
197Wavefront::isOldestInstALU()
198{
199 assert(!instructionBuffer.empty());
200 GPUDynInstPtr ii = instructionBuffer.front();
201
202 if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
203 ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
204 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
205 ii->opType() == Enums::OT_KERN_READ)) {
206 return true;
207 }
208
209 return false;
210}
211
212bool
213Wavefront::isOldestInstBarrier()
214{
215 assert(!instructionBuffer.empty());
216 GPUDynInstPtr ii = instructionBuffer.front();
217
218 if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
219 return true;
220 }
221
222 return false;
223}
224
225bool
226Wavefront::isOldestInstGMem()
227{
228 assert(!instructionBuffer.empty());
229 GPUDynInstPtr ii = instructionBuffer.front();
230
231 if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
232 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
233
234 return true;
235 }
236
237 return false;
238}
239
240bool
241Wavefront::isOldestInstLMem()
242{
243 assert(!instructionBuffer.empty());
244 GPUDynInstPtr ii = instructionBuffer.front();
245
246 if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
247 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
248
249 return true;
250 }
251
252 return false;
253}
254
255bool
256Wavefront::isOldestInstPrivMem()
257{
258 assert(!instructionBuffer.empty());
259 GPUDynInstPtr ii = instructionBuffer.front();
260
261 if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
262 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
263
264 return true;
265 }
266
267 return false;
268}
269
270bool
271Wavefront::isOldestInstFlatMem()
272{
273 assert(!instructionBuffer.empty());
274 GPUDynInstPtr ii = instructionBuffer.front();
275
276 if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
277
278 return true;
279 }
280
281 return false;
282}
283
284// Return true if the Wavefront's instruction
285// buffer has branch instruction.
286bool
287Wavefront::instructionBufferHasBranch()
288{
289 for (auto it : instructionBuffer) {
290 GPUDynInstPtr ii = it;
291
292 if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
293 return true;
294 }
295 }
296
297 return false;
298}
299
300// Remap HSAIL register to physical VGPR.
301// HSAIL register = virtual register assigned to an operand by HLC compiler
302uint32_t
303Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
304{
305 assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
306 // add the offset from where the VGPRs of the wavefront have been assigned
307 uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
308 // HSAIL double precision (DP) register: calculate the physical VGPR index
309 // assuming that DP registers are placed after SP ones in the VRF. The DP
310 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
311 // the DP VGPR index before mapping it to the physical VRF address space
312 if (mode == 1 && size > 4) {
313 physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
314 }
315
316 assert((startVgprIndex <= physicalVgprIndex) &&
317 (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
318
319 // calculate absolute physical VGPR index
320 return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
321}
322
323// Return true if this wavefront is ready
324// to execute an instruction of the specified type.
325int
326Wavefront::ready(itype_e type)
327{
328 // Check to make sure wave is running
329 if (status == S_STOPPED || status == S_RETURNING ||
330 instructionBuffer.empty()) {
331 return 0;
332 }
333
334 // Is the wave waiting at a barrier
335 if (stalledAtBarrier) {
336 if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
337 computeUnit->getRefCounter(dispatchId, wgId))) {
338 // Are all threads at barrier?
339 return 0;
340 }
341 oldBarrierCnt = barrierCnt;
342 stalledAtBarrier = false;
343 }
344
345 // Read instruction
346 GPUDynInstPtr ii = instructionBuffer.front();
347
348 bool ready_inst M5_VAR_USED = false;
349 bool glbMemBusRdy = false;
350 bool glbMemIssueRdy = false;
351 if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
352 for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
353 if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
354 glbMemBusRdy = true;
355 if (computeUnit->wfWait[j].prerdy())
356 glbMemIssueRdy = true;
357 }
358 }
359 bool locMemBusRdy = false;
360 bool locMemIssueRdy = false;
361 if (type == I_SHARED || type == I_FLAT) {
362 for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
363 if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
364 locMemBusRdy = true;
365 if (computeUnit->wfWait[j].prerdy())
366 locMemIssueRdy = true;
367 }
368 }
369
370 // The following code is very error prone and the entire process for
371 // checking readiness will be fixed eventually. In the meantime, let's
372 // make sure that we do not silently let an instruction type slip
373 // through this logic and always return not ready.
374 if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
375 ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
376 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
377 ii->opType() == Enums::OT_KERN_READ ||
378 ii->opType() == Enums::OT_ARG ||
379 IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
380 IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
381 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
382 IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
383 IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
384 panic("next instruction: %s is of unknown type\n", ii->disassemble());
385 }
386
387 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
388 computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
389
390 if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
391 // Here for ALU instruction (barrier)
392 if (!computeUnit->wfWait[simdId].prerdy()) {
393 // Is wave slot free?
394 return 0;
395 }
396
397 // Are there in pipe or outstanding memory requests?
398 if ((outstandingReqs + memReqsInPipe) > 0) {
399 return 0;
400 }
401
402 ready_inst = true;
403 } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
404 // Here for ALU instruction (nop)
405 if (!computeUnit->wfWait[simdId].prerdy()) {
406 // Is wave slot free?
407 return 0;
408 }
409
410 ready_inst = true;
411 } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
412 // Here for ALU instruction (return)
413 if (!computeUnit->wfWait[simdId].prerdy()) {
414 // Is wave slot free?
415 return 0;
416 }
417
418 // Are there in pipe or outstanding memory requests?
419 if ((outstandingReqs + memReqsInPipe) > 0) {
420 return 0;
421 }
422
423 ready_inst = true;
424 } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
425 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
426 ii->opType() == Enums::OT_KERN_READ ||
427 ii->opType() == Enums::OT_ARG)) {
428 // Here for ALU instruction (all others)
429 if (!computeUnit->wfWait[simdId].prerdy()) {
430 // Is alu slot free?
431 return 0;
432 }
433 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
434 VrfAccessType::RD_WR)) {
435 return 0;
436 }
437
438 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
439 return 0;
440 }
441 ready_inst = true;
442 } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
443 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
444 // Here Global memory instruction
445 if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
446 // Are there in pipe or outstanding global memory write requests?
447 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
448 return 0;
449 }
450 }
451
452 if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
453 IS_OT_HIST_GM(ii->opType())) {
454 // Are there in pipe or outstanding global memory read requests?
455 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
456 return 0;
457 }
458
459 if (!glbMemIssueRdy) {
460 // Is WV issue slot free?
461 return 0;
462 }
463
464 if (!glbMemBusRdy) {
465 // Is there an available VRF->Global memory read bus?
466 return 0;
467 }
468
469 if (!computeUnit->globalMemoryPipe.
470 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
471 // Can we insert a new request to the Global Mem Request FIFO?
472 return 0;
473 }
474 // can we schedule source & destination operands on the VRF?
475 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
476 VrfAccessType::RD_WR)) {
477 return 0;
478 }
479 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
480 return 0;
481 }
482 ready_inst = true;
483 } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
484 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
485 // Here for Shared memory instruction
486 if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
487 if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
488 return 0;
489 }
490 }
491
492 if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
493 IS_OT_HIST_LM(ii->opType())) {
494 if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
495 return 0;
496 }
497 }
498
499 if (!locMemBusRdy) {
500 // Is there an available VRF->LDS read bus?
501 return 0;
502 }
503 if (!locMemIssueRdy) {
504 // Is wave slot free?
505 return 0;
506 }
507
508 if (!computeUnit->localMemoryPipe.
509 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
510 // Can we insert a new request to the LDS Request FIFO?
511 return 0;
512 }
513 // can we schedule source & destination operands on the VRF?
514 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
515 VrfAccessType::RD_WR)) {
516 return 0;
517 }
518 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
519 return 0;
520 }
521 ready_inst = true;
522 } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
523 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
524 // Here for Private memory instruction ------------------------ //
525 if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
526 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
527 return 0;
528 }
529 }
530
531 if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
532 IS_OT_HIST_PM(ii->opType())) {
533 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) {
534 return 0;
535 }
536 }
537
538 if (!glbMemBusRdy) {
539 // Is there an available VRF->Global memory read bus?
540 return 0;
541 }
542
543 if (!glbMemIssueRdy) {
544 // Is wave slot free?
545 return 0;
546 }
547
548 if (!computeUnit->globalMemoryPipe.
549 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
550 // Can we insert a new request to the Global Mem Request FIFO?
551 return 0;
552 }
553 // can we schedule source & destination operands on the VRF?
554 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
555 VrfAccessType::RD_WR)) {
556 return 0;
557 }
558 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
559 return 0;
560 }
561 ready_inst = true;
562 } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
563 if (!glbMemBusRdy) {
564 // Is there an available VRF->Global memory read bus?
565 return 0;
566 }
567
568 if (!locMemBusRdy) {
569 // Is there an available VRF->LDS read bus?
570 return 0;
571 }
572
573 if (!glbMemIssueRdy) {
574 // Is wave slot free?
575 return 0;
576 }
577
578 if (!locMemIssueRdy) {
579 return 0;
580 }
581 if (!computeUnit->globalMemoryPipe.
582 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
583 // Can we insert a new request to the Global Mem Request FIFO?
584 return 0;
585 }
586
587 if (!computeUnit->localMemoryPipe.
588 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
589 // Can we insert a new request to the LDS Request FIFO?
590 return 0;
591 }
592 // can we schedule source & destination operands on the VRF?
593 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
594 VrfAccessType::RD_WR)) {
595 return 0;
596 }
597 // are all the operands ready? (RAW, WAW and WAR depedencies met?)
598 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
599 return 0;
600 }
601 ready_inst = true;
602 } else {
603 return 0;
604 }
605
606 assert(ready_inst);
607
608 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
609 simdId, wfSlotId, ii->disassemble());
610 return 1;
611}
612
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

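// Execute the instruction at the head of the instruction buffer for one
// issue slot: run its semantics, advance the PC (popping a reconvergence
// stack frame when the new PC reaches the reconvergence PC), collect
// per-instruction statistics, and claim the ALU/memory pipeline resources
// that updateResources() pre-reserved above.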
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        // PC not modified by instruction: proceed to the next instruction,
        // or pop this frame if we reached the reconvergence PC
        uint32_t new_pc = old_pc + 1;
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

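// A lane is still catching up to the barrier if it has executed fewer
// barrier operations (barCnt) than the wavefront-wide maximum (maxBarCnt).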
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

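// Push a new entry onto the control-flow (reconvergence) stack. Each entry
// holds the PC to execute next, the reconvergence PC (RPC) at which the
// divergent paths re-join, and the mask of lanes active on that path. For
// example, a divergent branch at PC p whose paths re-join at PC r can be
// modeled by pushing {pc: taken_target, rpc: r, mask: taken_lanes} on top
// of {pc: p + 1, rpc: r, mask: fall_through_lanes}; exec() pops each frame
// once its PC reaches its RPC.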
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}

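// Pop the top-of-stack entry, returning control to the enclosing path;
// the DPRINTFs trace the PC and exec mask before and after the pop.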
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());
}

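// Throw away any instructions already buffered for this wavefront, and
// mark any fetch still in flight to be dropped when it returns.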
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

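// The current PC, reconvergence PC, and execution mask all live in the
// top entry of the reconvergence stack; the accessors below read (or, for
// pc(uint32_t), update) that entry.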
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}

void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

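// Size, in bytes, of the fixed-size portion of the context that
// getContext()/setContext() serialize: the per-lane barrier counts, the
// scalar fields, and a full-depth reconvergence stack (one entry per
// lane). The variable-size portions (VGPRs, condition registers, and the
// LDS chunk contents) are not included here.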
uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

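// Serialize the wavefront's context into the buffer at 'out' (which is
// written through despite the const-qualified pointer): per-lane barrier
// counts, scalar state, the reconvergence stack, single- and
// double-precision VGPRs, condition registers, and finally the LDS
// contents. setContext() below reads the fields back in exactly this
// order.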
void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong();
    iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    // drain the reconvergence stack into the buffer, top entry first, and
    // pad the remaining slots (up to one per lane) with all-ones sentinel
    // entries, which setContext() skips when restoring
    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    // save the single-precision VGPRs, one lane at a time
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx, lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // save the double-precision VGPRs
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // save the condition registers
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // save the LDS contents
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *)iter = val; iter += sizeof(val);
        }
    }
}

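// Restore the wavefront context serialized by getContext(). Reconvergence
// stack entries equal to the all-ones sentinel are padding and are
// skipped; the remaining entries are pushed in the order they were saved
// (top-of-stack first), which presumably assumes the stack held a single
// entry, i.e. the wavefront was fully reconverged, when it was saved.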
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    // restore the reconvergence stack, skipping the sentinel padding
    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }

    int wf_size = computeUnit->wfSize();

    // restore the single-precision VGPRs
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    // restore the double-precision VGPRs
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    // restore the condition registers
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }

    // restore the LDS contents
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *)iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
    }
}

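// Compute the actual size of this work-group in each dimension, clipping
// the nominal work-group size against the edge of the grid. For example,
// with workGroupSz[0] == 256, gridSz[0] == 1000, and wgId[0] == 3, the
// last work-group in dimension 0 gets min(256, 1000 - 3 * 256) == 232
// work-items. actualWgSzTotal is the product over all three dimensions.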
void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < 3; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d],
                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}