1/* 2 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its 18 * contributors may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
 *
 * Authors: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

// Factory hook used by the simulator's Python-generated params object.
Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

// Construct a wavefront in the S_STOPPED state with all outstanding /
// in-pipe memory request counters zeroed and all per-lane bookkeeping
// vectors sized to the configured wavefront width (p->wfSize).
Wavefront::Wavefront(const Params *p)
  : SimObject(p), callArgMem(nullptr), _gpuISA()
{
    lastTrace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    // Sentinel "never traced" values for the old register trace counters.
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    pendingFetch = false;
    dropFetch = false;
    // Owned; released in ~Wavefront().
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    // Three dimensions (x, y, z) of per-lane work-item IDs.
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}

// Register this wavefront's statistics with the simulator's stats system.
void
Wavefront::regStats()
{
    SimObject::regStats();

    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

// Reset the VGPR reservation bookkeeping before a new dispatch.
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

// Record the register-file footprint of the kernel this wavefront will run:
// number of condition registers, single-precision VGPRs, and
// double-precision VGPRs.
void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
    delete condRegState;
}

// Mark the wavefront as running with the given dynamic ID and base pointer.
void
Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
{
    wfDynId = _wf_dyn_id;
    basePtr = _base_ptr;
    status = S_RUNNING;
}

// True if the instruction targets the global memory pipeline
// (flat accesses are treated as potentially global).
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() || ii->isFlat())
        return true;

    return false;
}

// True if the instruction targets the local (LDS) memory pipeline.
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem()) {
        return true;
    }

    return false;
}

// The isOldestInst* predicates below classify the instruction at the head
// of the instruction buffer; all of them require the wave not be stopped.

bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    // Kernel-argument loads count as ALU work here; see the FIXME in
    // updateResources() for why.
    if (status != S_STOPPED && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() ||
        ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

// Return true if the Wavefront's instruction
// buffer has branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->isReturn() || ii->isBranch()) {
            return true;
        }
    }

    return false;
}

// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return true if this wavefront is ready
// to execute an instruction of the specified type.
// NOTE: despite the int return type, this is used as a boolean (0/1).
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
                        computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    // Probe the global-memory pipeline buses and issue slots; a single
    // available unit is enough to proceed.
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    // Same probe for the local (LDS) pipeline.
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually.  In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here Global memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        // Flat accesses may resolve to either global or LDS, so both
        // pipelines must have capacity before the instruction can issue.
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}

// Reserve (pre-set) the pipeline resources the instruction at the head of
// the buffer will need, and bump the in-pipe request counters for memory
// operations.  Mirrors the resource "set" calls performed in exec().
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        // Atomics/fences count as both a read and a write in flight.
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

// Execute the instruction at the head of the instruction buffer, update
// statistics, advance (or pop) the PC / reconvergence stack, and commit the
// pipeline resource reservations made in updateResources().
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);

    // update the instruction stats in the CU

    ii->execute(ii);
    computeUnit->updateInstStats(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point: pop the stack frame and
            // discard any fetched instructions from the divergent path
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    } else {
        // the instruction redirected the PC (branch taken); buffered
        // instructions on the fall-through path are stale
        discardFetch();
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

// True if the given lane has not yet reached the current barrier count.
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

// Push a new divergence frame (PC, reconvergence PC, active-lane mask).
// The mask must have at least one active lane.
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}

// Pop the top divergence frame, tracing the PC/mask transition.
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}

// Drop all buffered instructions and flag any in-flight fetch to be dropped
// when it returns.
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |=pendingFetch;
}

// Current PC, reconvergence PC, and execution mask all come from the top
// of the reconvergence stack.
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}


void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

// Size in bytes of the fixed-layout portion of the context image produced
// by getContext() (everything except the variable-size register/LDS data).
// The field list here must stay in sync with getContext()/setContext().
uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

// Serialize the wavefront's architectural state (scalars, reconvergence
// stack, SP/DP VGPRs, condition registers, LDS contents) into the raw
// buffer 'out'.  The write order must exactly match the read order in
// setContext().  NOTE(review): despite the const-qualified parameter, the
// buffer is written through a cast — callers must pass writable memory.
// NOTE(review): this drains reconvergenceStack (pop_back per saved entry),
// so the wavefront's stack is empty afterwards; entries are saved from the
// top of the stack down, while setContext() pushes them back in read
// order — confirm the intended stack ordering round-trips.
void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    // Fixed-size reconvergence stack region: real entries first, then
    // sentinel entries (all-max values) padding out to the wavefront size.
    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    // Per-lane single-precision VGPR values.
    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                            read<uint32_t>(vgprIdx,lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // Per-lane double-precision VGPR values.
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                            read<uint64_t>(vgprIdx,lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // Per-lane condition-register values.
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}

// Restore the wavefront's architectural state from a context image produced
// by getContext().  Read order mirrors getContext()'s write order; sentinel
// reconvergence entries (pc == uint32 max) are skipped rather than pushed.
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    // Per-lane single-precision VGPR values.
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    // Per-lane double-precision VGPR values.
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    // Per-lane condition-register values.
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /** Restoring LDS contents */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}

// Compute the actual (possibly partial) work-group size in each dimension:
// edge work-groups of the grid may be smaller than the nominal work-group
// size.  Also accumulates the total flattened size across all 3 dimensions.
void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < 3; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d],
                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}