wavefront.cc revision 11692:e772fdcd3809
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
 *
 * Author: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

// Param-driven factory used by the gem5 configuration system to
// instantiate a Wavefront SimObject.
Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

// Construct a wavefront in the S_STOPPED state with all outstanding /
// in-pipe memory request counters and barrier bookkeeping zeroed.
// Per-lane vectors (lastAddr, workItemId/FlatId, old*gpr, barCnt) are
// sized to the configured wavefront width (p->wfSize).
Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr)
{
    lastTrace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    // sentinel "never written" timestamps for the old VGPR/DGPR trace state
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}

// Register per-wavefront statistics: distributions of source/destination
// register operand counts and counters for the three stall causes checked
// in ready() (WAW/WAR deps, RAW deps, VRF port availability).
void
Wavefront::regStats()
{
    SimObject::regStats();

    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

// Reset the VGPR allocation bookkeeping (called on (re)initialization;
// the actual reservation happens elsewhere via remap()'s inputs).
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

// Record the register-file footprint of this wavefront: number of
// condition registers, single-precision VGPRs and double-precision VGPRs.
void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
    delete condRegState;
}

// Mark the wavefront runnable, giving it its dynamic id and the base
// pointer of the kernel object it executes.
void
Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
{
    wfDynId = _wf_dyn_id;
    basePtr = _base_ptr;
    status = S_RUNNING;
}

// Global-memory classification also includes FLAT instructions, which may
// resolve to either global or shared memory at execution time.
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() || ii->isFlat())
        return true;

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem()) {
        return true;
    }

    return false;
}

// The following isOldestInst* predicates look only at the instruction at
// the head of the instruction buffer, and require the wave not to be
// stopped. They are used to classify what the wave would issue next.
bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    // NOTE: kernel-argument-segment loads are treated as ALU ops here,
    // matching the FIXME in updateResources()/exec() below.
    if (status != S_STOPPED && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() ||
        ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

// Return true if the Wavefront's instruction
// buffer has branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    // return instructions are treated as control flow here, same as in
    // isOldestInstALU()
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->isReturn() || ii->isBranch()) {
            return true;
        }
    }

    return false;
}

// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    // (mode == 1 selects the HSAIL namespace; a DP register occupies two
    // physical SP slots, hence the 2 * vgprIndex)
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return true if this wavefront is ready
// to execute an instruction of the specified type.
// Returns 1 only when the head-of-buffer instruction matches 'type' AND
// every structural resource it needs (issue slot, VRF->memory buses,
// request FIFOs, VRF ports, operand dependences) is available; otherwise 0.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
                        computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        // barrier released: remember the count we passed and clear the stall
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    // Probe all global-memory pipe buses and issue (wfWait) slots; any one
    // being ready is sufficient.
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    // Same probing for the local (LDS) memory pipes.
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        // A barrier must drain all of this wave's memory traffic first.
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        // Like a barrier, a return waits for all memory traffic to drain.
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        // are RAW/WAW/WAR dependences on the operands satisfied?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here Global memory instruction
        // Reads (and atomics/fences) are ordered behind this wave's pending
        // writes, and vice versa, to preserve same-wave memory ordering.
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        // Same read-behind-write / write-behind-read ordering as the global
        // memory case, but against the LDS counters.
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        // FLAT may resolve to either memory space, so it must reserve
        // resources on BOTH the global and local pipelines.
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // head instruction does not match the requested type
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}

// Pre-reserve (preset) the pipeline resources the head instruction will
// occupy and bump the in-pipe request counters. This mirrors the resource
// model in exec(), which performs the corresponding set() calls at issue.
// Fixed latencies used here: 4 ticks for memory reads, 8 for writes and
// atomics/fences, spBypassLength() for ALU ops.
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        // FLAT load: by now the instruction knows which space it resolved to
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        // atomics/fences count as both a read and a write in the pipe
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

// Execute the instruction at the head of the instruction buffer: run its
// functional behavior, update statistics, advance the PC (popping a
// reconvergence frame when the new PC reaches the reconvergence PC), and
// then commit (set()) the pipeline resource timings that updateResources()
// pre-reserved.
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point: restore the parent execution
            // mask and discard any speculatively fetched instructions
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // (same classification and latencies as updateResources(), but using
    // set() to commit the reservation rather than preset())
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

// A lane is "waiting" while its private barrier count is still below the
// wavefront's max barrier count.
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

// Push a new divergence frame (PC, reconvergence PC, execution mask) onto
// the reconvergence stack. The mask must have at least one active lane.
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}

// Pop the top divergence frame, logging the transition of PC/exec-mask
// for the WavefrontStack debug flag.
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}

// Drop all buffered instructions; if a fetch is still in flight, arrange
// for its result to be discarded when it returns.
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |=pendingFetch;
}

// PC, reconvergence PC and execution mask all live in the top frame of
// the reconvergence stack.
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}


void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

// Size in bytes of the fixed (non-register, non-LDS) portion of the
// context serialized by getContext()/setContext(): the per-lane barrier
// counters, scalar bookkeeping fields, and a wfSize()-entry reconvergence
// stack image.
uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

// Serialize this wavefront's context into the caller-provided buffer, in
// the exact field order setContext() deserializes. NOTE(review): despite
// the 'const void *out' signature this function WRITES through the
// pointer (via the uint8_t* cast) — the parameter is an output buffer.
// Also note the save is destructive: the reconvergence stack is popped
// empty while being written out; restoring requires setContext().
void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    // per-lane barrier counters
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    // Write out a fixed-length reconvergence stack image (one slot per
    // work-item), padding unused slots with an all-ones sentinel entry
    // that setContext() recognizes and skips.
    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    // single-precision VGPRs, then double-precision VGPRs, lane-major
    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                            read<uint32_t>(vgprIdx,lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                            read<uint64_t>(vgprIdx,lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // condition registers
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}

// Restore a wavefront context previously serialized by getContext().
// Field order and sizes must match getContext() exactly; sentinel
// (all-ones PC) reconvergence entries written as padding are skipped.
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    // Rebuild the reconvergence stack; getContext() wrote it top-first,
    // so pushing in read order restores the original ordering.
    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /** Restoring LDS contents */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}

// Compute the actual (possibly clipped) work-group size in each of the 3
// dimensions for the work-group this wavefront belongs to: edge groups of
// the NDRange get whatever remains of the grid in that dimension.
void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < 3; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d],
                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}