Cross Reference: /gem5/src/gpu-compute/wavefront.cc

Deleted Added

sdiff udiff text old ( 11694:c3b4d57a15c5 ) new ( 11695:0a65922d564d )

full compact

wavefront.cc (11694:c3b4d57a15c5)	wavefront.cc (11695:0a65922d564d)
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Lisa Hsu 34 / 35 36#include "gpu-compute/wavefront.hh" 37 38#include "debug/GPUExec.hh" 39#include "debug/WavefrontStack.hh" 40#include "gpu-compute/compute_unit.hh" 41#include "gpu-compute/gpu_dyn_inst.hh" 42#include "gpu-compute/shader.hh" 43#include "gpu-compute/vector_register_file.hh" 44 45Wavefront 46WavefrontParams::create() 47{ 48 return new Wavefront(this); 49} 50 51Wavefront::Wavefront(const Params p) 52 : SimObject(p), callArgMem(nullptr) 53{ 54 lastTrace = 0; 55 simdId = p->simdId; 56 wfSlotId = p->wf_slot_id; 57 status = S_STOPPED; 58 reservedVectorRegs = 0; 59 startVgprIndex = 0; 60 outstandingReqs = 0; 61 memReqsInPipe = 0; 62 outstandingReqsWrGm = 0; 63 outstandingReqsWrLm = 0; 64 outstandingReqsRdGm = 0; 65 outstandingReqsRdLm = 0; 66 rdLmReqsInPipe = 0; 67 rdGmReqsInPipe = 0; 68 wrLmReqsInPipe = 0; 69 wrGmReqsInPipe = 0; 70 71 barrierCnt = 0; 72 oldBarrierCnt = 0; 73 stalledAtBarrier = false; 74 75 memTraceBusy = 0; 76 oldVgprTcnt = 0xffffffffffffffffll; 77 oldDgprTcnt = 0xffffffffffffffffll; 78 oldVgpr.resize(p->wfSize); 79 80 pendingFetch = false; 81 dropFetch = false; 82 condRegState = new ConditionRegisterState(); 83 maxSpVgprs = 0; 84 maxDpVgprs = 0; 85 lastAddr.resize(p->wfSize); 86 workItemFlatId.resize(p->wfSize); 87 oldDgpr.resize(p->wfSize); 88 barCnt.resize(p->wfSize); 89 for (int i = 0; i < 3; ++i) { 90 workItemId[i].resize(p->wfSize); 91 } 92} 93 94void 95Wavefront::regStats() 96{ 97 SimObject::regStats(); 98 99 srcRegOpDist 100* .init(0, 4, 2) 101 .name(name() + ".src_reg_operand_dist") 102 .desc("number of executed instructions with N source register operands") 103 ; 104 105 dstRegOpDist 106 .init(0, 3, 2) 107 .name(name() + ".dst_reg_operand_dist") 108 .desc("number of executed instructions with N destination register " 109 "operands") 110 ; 111 112 // FIXME: the name of the WF needs to be unique 113 numTimesBlockedDueWAXDependencies 114 .name(name() + ".timesBlockedDueWAXDependencies") 115 .desc("number of times the wf's instructions are blocked due to WAW " 116 "or WAR dependencies") 117 ; 118 119 // FIXME: the name of the WF needs to be unique 120 numTimesBlockedDueRAWDependencies 121 .name(name() + ".timesBlockedDueRAWDependencies") 122 .desc("number of times the wf's instructions are blocked due to RAW " 123 "dependencies") 124 ; 125 126 // FIXME: the name of the WF needs to be unique 127 numTimesBlockedDueVrfPortAvail 128 .name(name() + ".timesBlockedDueVrfPortAvail") 129 .desc("number of times instructions are blocked due to VRF port " 130 "availability") 131 ; 132} 133 134void 135Wavefront::init() 136{ 137 reservedVectorRegs = 0; 138 startVgprIndex = 0; 139} 140 141void 142Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) 143{ 144 condRegState->init(num_cregs); 145 maxSpVgprs = num_sregs; 146 maxDpVgprs = num_dregs; 147} 148 149Wavefront::~Wavefront() 150{ 151 if (callArgMem) 152 delete callArgMem; 153 delete condRegState; 154} 155 156void 157Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) 158{ 159 wfDynId = _wf_dyn_id; 160 basePtr = _base_ptr; 161 status = S_RUNNING; 162} 163 164bool 165Wavefront::isGmInstruction(GPUDynInstPtr ii) 166{ 167 if (ii->isGlobalMem() \|\| ii->isFlat()) 168 return true; 169 170 return false; 171} 172 173bool 174Wavefront::isLmInstruction(GPUDynInstPtr ii) 175{ 176 if (ii->isLocalMem()) { 177 return true; 178 } 179 180 return false; 181} 182 183bool 184Wavefront::isOldestInstALU() 185{ 186 assert(!instructionBuffer.empty()); 187 GPUDynInstPtr ii = instructionBuffer.front(); 188 189 if (status != S_STOPPED && (ii->isNop() \|\| 190 ii->isReturn() \|\| ii->isBranch() \|\| 191 ii->isALU() \|\| (ii->isKernArgSeg() && ii->isLoad()))) { 192 return true; 193 } 194 195 return false; 196} 197 198bool 199Wavefront::isOldestInstBarrier() 200{ 201 assert(!instructionBuffer.empty()); 202 GPUDynInstPtr ii = instructionBuffer.front(); 203 204 if (status != S_STOPPED && ii->isBarrier()) { 205 return true; 206 } 207 208 return false; 209} 210 211bool 212Wavefront::isOldestInstGMem() 213{ 214 assert(!instructionBuffer.empty()); 215 GPUDynInstPtr ii = instructionBuffer.front(); 216 217 if (status != S_STOPPED && ii->isGlobalMem()) { 218 return true; 219 } 220 221 return false; 222} 223 224bool 225Wavefront::isOldestInstLMem() 226{ 227 assert(!instructionBuffer.empty()); 228 GPUDynInstPtr ii = instructionBuffer.front(); 229 230 if (status != S_STOPPED && ii->isLocalMem()) { 231 return true; 232 } 233 234 return false; 235} 236 237bool 238Wavefront::isOldestInstPrivMem() 239{ 240 assert(!instructionBuffer.empty()); 241 GPUDynInstPtr ii = instructionBuffer.front(); 242 243 if (status != S_STOPPED && ii->isPrivateSeg()) { 244 return true; 245 } 246 247 return false; 248} 249 250bool 251Wavefront::isOldestInstFlatMem() 252{ 253 assert(!instructionBuffer.empty()); 254 GPUDynInstPtr ii = instructionBuffer.front(); 255 256 if (status != S_STOPPED && ii->isFlat()) { 257 return true; 258 } 259 260 return false; 261} 262 263// Return true if the Wavefront's instruction 264// buffer has branch instruction. 265bool 266Wavefront::instructionBufferHasBranch() 267{ 268 for (auto it : instructionBuffer) { 269 GPUDynInstPtr ii = it; 270 271 if (ii->isReturn() \|\| ii->isBranch()) { 272 return true; 273 } 274 } 275 276 return false; 277} 278 279// Remap HSAIL register to physical VGPR. 280// HSAIL register = virtual register assigned to an operand by HLC compiler 281uint32_t 282Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) 283{ 284 assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); 285 // add the offset from where the VGPRs of the wavefront have been assigned 286 uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; 287 // HSAIL double precision (DP) register: calculate the physical VGPR index 288 // assuming that DP registers are placed after SP ones in the VRF. The DP 289 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust 290 // the DP VGPR index before mapping it to the physical VRF address space 291 if (mode == 1 && size > 4) { 292 physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); 293 } 294 295 assert((startVgprIndex <= physicalVgprIndex) && 296 (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); 297 298 // calculate absolute physical VGPR index 299 return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); 300} 301 302// Return true if this wavefront is ready 303// to execute an instruction of the specified type. 304int 305Wavefront::ready(itype_e type) 306{ 307 // Check to make sure wave is running 308 if (status == S_STOPPED \|\| status == S_RETURNING \|\| 309 instructionBuffer.empty()) { 310 return 0; 311 } 312 313 // Is the wave waiting at a barrier 314 if (stalledAtBarrier) { 315 if (!computeUnit->AllAtBarrier(barrierId,barrierCnt, 316 computeUnit->getRefCounter(dispatchId, wgId))) { 317 // Are all threads at barrier? 318 return 0; 319 } 320 oldBarrierCnt = barrierCnt; 321 stalledAtBarrier = false; 322 } 323 324 // Read instruction 325 GPUDynInstPtr ii = instructionBuffer.front(); 326 327 bool ready_inst M5_VAR_USED = false; 328 bool glbMemBusRdy = false; 329 bool glbMemIssueRdy = false; 330 if (type == I_GLOBAL \|\| type == I_FLAT \|\| type == I_PRIVATE) { 331 for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { 332 if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) 333 glbMemBusRdy = true; 334 if (computeUnit->wfWait[j].prerdy()) 335 glbMemIssueRdy = true; 336 } 337 } 338 bool locMemBusRdy = false; 339 bool locMemIssueRdy = false; 340 if (type == I_SHARED \|\| type == I_FLAT) { 341 for (int j=0; j < computeUnit->numLocMemUnits; ++j) { 342 if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) 343 locMemBusRdy = true; 344 if (computeUnit->wfWait[j].prerdy()) 345 locMemIssueRdy = true; 346 } 347 } 348 349 // The following code is very error prone and the entire process for 350 // checking readiness will be fixed eventually. In the meantime, let's 351 // make sure that we do not silently let an instruction type slip 352 // through this logic and always return not ready. 353 if (!(ii->isBarrier() \|\| ii->isNop() \|\| ii->isReturn() \|\| ii->isBranch() \|\| 354 ii->isALU() \|\| ii->isLoad() \|\| ii->isStore() \|\| ii->isAtomic() \|\| 355 ii->isMemFence() \|\| ii->isFlat())) { 356 panic("next instruction: %s is of unknown type\n", ii->disassemble()); 357 } 358 359 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", 360 computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); 361 362 if (type == I_ALU && ii->isBarrier()) { 363 // Here for ALU instruction (barrier) 364 if (!computeUnit->wfWait[simdId].prerdy()) { 365 // Is wave slot free? 366 return 0; 367 } 368 369 // Are there in pipe or outstanding memory requests? 370 if ((outstandingReqs + memReqsInPipe) > 0) { 371 return 0; 372 } 373 374 ready_inst = true; 375 } else if (type == I_ALU && ii->isNop()) { 376 // Here for ALU instruction (nop) 377 if (!computeUnit->wfWait[simdId].prerdy()) { 378 // Is wave slot free? 379 return 0; 380 } 381 382 ready_inst = true; 383 } else if (type == I_ALU && ii->isReturn()) { 384 // Here for ALU instruction (return) 385 if (!computeUnit->wfWait[simdId].prerdy()) { 386 // Is wave slot free? 387 return 0; 388 } 389 390 // Are there in pipe or outstanding memory requests? 391 if ((outstandingReqs + memReqsInPipe) > 0) { 392 return 0; 393 } 394 395 ready_inst = true; 396 } else if (type == I_ALU && (ii->isBranch() \|\| 397 ii->isALU() \|\| 398 (ii->isKernArgSeg() && ii->isLoad()) \|\| 399 ii->isArgSeg())) { 400 // Here for ALU instruction (all others) 401 if (!computeUnit->wfWait[simdId].prerdy()) { 402 // Is alu slot free? 403 return 0; 404 } 405 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 406 VrfAccessType::RD_WR)) { 407 return 0; 408 } 409 410 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 411 return 0; 412 } 413 ready_inst = true; 414 } else if (type == I_GLOBAL && ii->isGlobalMem()) { 415 // Here Global memory instruction 416 if (ii->isLoad() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 417 // Are there in pipe or outstanding global memory write requests? 418 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { 419 return 0; 420 } 421 } 422 423 if (ii->isStore() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 424 // Are there in pipe or outstanding global memory read requests? 425 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) 426 return 0; 427 } 428 429 if (!glbMemIssueRdy) { 430 // Is WV issue slot free? 431 return 0; 432 } 433 434 if (!glbMemBusRdy) { 435 // Is there an available VRF->Global memory read bus? 436 return 0; 437 } 438 439 if (!computeUnit->globalMemoryPipe. 440 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { 441 // Can we insert a new request to the Global Mem Request FIFO? 442 return 0; 443 } 444 // can we schedule source & destination operands on the VRF? 445 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 446 VrfAccessType::RD_WR)) { 447 return 0; 448 } 449 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 450 return 0; 451 } 452 ready_inst = true; 453 } else if (type == I_SHARED && ii->isLocalMem()) { 454 // Here for Shared memory instruction 455 if (ii->isLoad() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 456 if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { 457 return 0; 458 } 459 } 460 461 if (ii->isStore() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 462 if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { 463 return 0; 464 } 465 } 466 467 if (!locMemBusRdy) { 468 // Is there an available VRF->LDS read bus? 469 return 0; 470 } 471 if (!locMemIssueRdy) { 472 // Is wave slot free? 473 return 0; 474 } 475 476 if (!computeUnit->localMemoryPipe. 477 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { 478 // Can we insert a new request to the LDS Request FIFO? 479 return 0; 480 } 481 // can we schedule source & destination operands on the VRF? 482 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 483 VrfAccessType::RD_WR)) { 484 return 0; 485 } 486 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 487 return 0; 488 } 489 ready_inst = true; 490 } else if (type == I_FLAT && ii->isFlat()) { 491 if (!glbMemBusRdy) { 492 // Is there an available VRF->Global memory read bus? 493 return 0; 494 } 495 496 if (!locMemBusRdy) { 497 // Is there an available VRF->LDS read bus? 498 return 0; 499 } 500 501 if (!glbMemIssueRdy) { 502 // Is wave slot free? 503 return 0; 504 } 505 506 if (!locMemIssueRdy) { 507 return 0; 508 } 509 if (!computeUnit->globalMemoryPipe. 510 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { 511 // Can we insert a new request to the Global Mem Request FIFO? 512 return 0; 513 } 514 515 if (!computeUnit->localMemoryPipe. 516 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { 517 // Can we insert a new request to the LDS Request FIFO? 518 return 0; 519 } 520 // can we schedule source & destination operands on the VRF? 521 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 522 VrfAccessType::RD_WR)) { 523 return 0; 524 } 525 // are all the operands ready? (RAW, WAW and WAR depedencies met?) 526 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 527 return 0; 528 } 529 ready_inst = true; 530 } else { 531 return 0; 532 } 533 534 assert(ready_inst); 535 536 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, 537 simdId, wfSlotId, ii->disassemble()); 538 return 1; 539} 540 541void 542Wavefront::updateResources() 543{ 544 // Get current instruction 545 GPUDynInstPtr ii = instructionBuffer.front(); 546 assert(ii); 547 computeUnit->vrf[simdId]->updateResources(this, ii); 548 // Single precision ALU or Branch or Return or Special instruction 549 if (ii->isALU() \|\| ii->isSpecialOp() \|\| 550 ii->isBranch() \|\| 551 // FIXME: Kernel argument loads are currently treated as ALU operations 552 // since we don't send memory packets at execution. If we fix that then 553 // we should map them to one of the memory pipelines 554 (ii->isKernArgSeg() && ii->isLoad()) \|\| ii->isArgSeg() \|\| 555 ii->isReturn()) { 556 computeUnit->aluPipe[simdId].preset(computeUnit->shader-> 557 ticks(computeUnit->spBypassLength())); 558 // this is to enforce a fixed number of cycles per issue slot per SIMD 559 computeUnit->wfWait[simdId].preset(computeUnit->shader-> 560 ticks(computeUnit->issuePeriod)); 561 } else if (ii->isBarrier()) { 562 computeUnit->wfWait[simdId].preset(computeUnit->shader-> 563 ticks(computeUnit->issuePeriod)); 564 } else if (ii->isLoad() && ii->isFlat()) { 565 assert(Enums::SC_NONE != ii->executedAs()); 566 memReqsInPipe++; 567 rdGmReqsInPipe++; 568 if ( Enums::SC_SHARED == ii->executedAs() ) { 569 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 570 preset(computeUnit->shader->ticks(4)); 571 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 572 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 573 } else { 574 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 575 preset(computeUnit->shader->ticks(4)); 576 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 577 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 578 } 579 } else if (ii->isStore() && ii->isFlat()) { 580 assert(Enums::SC_NONE != ii->executedAs()); 581 memReqsInPipe++; 582 wrGmReqsInPipe++; 583 if (Enums::SC_SHARED == ii->executedAs()) { 584 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 585 preset(computeUnit->shader->ticks(8)); 586 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 587 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 588 } else { 589 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 590 preset(computeUnit->shader->ticks(8)); 591 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 592 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 593 } 594 } else if (ii->isLoad() && ii->isGlobalMem()) { 595 memReqsInPipe++; 596 rdGmReqsInPipe++; 597 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 598 preset(computeUnit->shader->ticks(4)); 599 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 600 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 601 } else if (ii->isStore() && ii->isGlobalMem()) { 602 memReqsInPipe++; 603 wrGmReqsInPipe++; 604 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 605 preset(computeUnit->shader->ticks(8)); 606 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 607 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 608 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isGlobalMem()) { 609 memReqsInPipe++; 610 wrGmReqsInPipe++; 611 rdGmReqsInPipe++; 612 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 613 preset(computeUnit->shader->ticks(8)); 614 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 615 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 616 } else if (ii->isLoad() && ii->isLocalMem()) { 617 memReqsInPipe++; 618 rdLmReqsInPipe++; 619 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 620 preset(computeUnit->shader->ticks(4)); 621 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 622 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 623 } else if (ii->isStore() && ii->isLocalMem()) { 624 memReqsInPipe++; 625 wrLmReqsInPipe++; 626 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 627 preset(computeUnit->shader->ticks(8)); 628 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 629 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 630 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isLocalMem()) { 631 memReqsInPipe++; 632 wrLmReqsInPipe++; 633 rdLmReqsInPipe++; 634 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 635 preset(computeUnit->shader->ticks(8)); 636 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 637 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 638 } 639} 640 641void 642Wavefront::exec() 643{ 644 // ---- Exit if wavefront is inactive ----------------------------- // 645 646 if (status == S_STOPPED \|\| status == S_RETURNING \|\| 647 instructionBuffer.empty()) { 648 return; 649 } 650 651 // Get current instruction 652 653 GPUDynInstPtr ii = instructionBuffer.front(); 654 655 const uint32_t old_pc = pc(); 656 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " 657 "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, 658 ii->disassemble(), old_pc);	1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Lisa Hsu 34 / 35 36#include "gpu-compute/wavefront.hh" 37 38#include "debug/GPUExec.hh" 39#include "debug/WavefrontStack.hh" 40#include "gpu-compute/compute_unit.hh" 41#include "gpu-compute/gpu_dyn_inst.hh" 42#include "gpu-compute/shader.hh" 43#include "gpu-compute/vector_register_file.hh" 44 45Wavefront 46WavefrontParams::create() 47{ 48 return new Wavefront(this); 49} 50 51Wavefront::Wavefront(const Params p) 52 : SimObject(p), callArgMem(nullptr) 53{ 54 lastTrace = 0; 55 simdId = p->simdId; 56 wfSlotId = p->wf_slot_id; 57 status = S_STOPPED; 58 reservedVectorRegs = 0; 59 startVgprIndex = 0; 60 outstandingReqs = 0; 61 memReqsInPipe = 0; 62 outstandingReqsWrGm = 0; 63 outstandingReqsWrLm = 0; 64 outstandingReqsRdGm = 0; 65 outstandingReqsRdLm = 0; 66 rdLmReqsInPipe = 0; 67 rdGmReqsInPipe = 0; 68 wrLmReqsInPipe = 0; 69 wrGmReqsInPipe = 0; 70 71 barrierCnt = 0; 72 oldBarrierCnt = 0; 73 stalledAtBarrier = false; 74 75 memTraceBusy = 0; 76 oldVgprTcnt = 0xffffffffffffffffll; 77 oldDgprTcnt = 0xffffffffffffffffll; 78 oldVgpr.resize(p->wfSize); 79 80 pendingFetch = false; 81 dropFetch = false; 82 condRegState = new ConditionRegisterState(); 83 maxSpVgprs = 0; 84 maxDpVgprs = 0; 85 lastAddr.resize(p->wfSize); 86 workItemFlatId.resize(p->wfSize); 87 oldDgpr.resize(p->wfSize); 88 barCnt.resize(p->wfSize); 89 for (int i = 0; i < 3; ++i) { 90 workItemId[i].resize(p->wfSize); 91 } 92} 93 94void 95Wavefront::regStats() 96{ 97 SimObject::regStats(); 98 99 srcRegOpDist 100* .init(0, 4, 2) 101 .name(name() + ".src_reg_operand_dist") 102 .desc("number of executed instructions with N source register operands") 103 ; 104 105 dstRegOpDist 106 .init(0, 3, 2) 107 .name(name() + ".dst_reg_operand_dist") 108 .desc("number of executed instructions with N destination register " 109 "operands") 110 ; 111 112 // FIXME: the name of the WF needs to be unique 113 numTimesBlockedDueWAXDependencies 114 .name(name() + ".timesBlockedDueWAXDependencies") 115 .desc("number of times the wf's instructions are blocked due to WAW " 116 "or WAR dependencies") 117 ; 118 119 // FIXME: the name of the WF needs to be unique 120 numTimesBlockedDueRAWDependencies 121 .name(name() + ".timesBlockedDueRAWDependencies") 122 .desc("number of times the wf's instructions are blocked due to RAW " 123 "dependencies") 124 ; 125 126 // FIXME: the name of the WF needs to be unique 127 numTimesBlockedDueVrfPortAvail 128 .name(name() + ".timesBlockedDueVrfPortAvail") 129 .desc("number of times instructions are blocked due to VRF port " 130 "availability") 131 ; 132} 133 134void 135Wavefront::init() 136{ 137 reservedVectorRegs = 0; 138 startVgprIndex = 0; 139} 140 141void 142Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) 143{ 144 condRegState->init(num_cregs); 145 maxSpVgprs = num_sregs; 146 maxDpVgprs = num_dregs; 147} 148 149Wavefront::~Wavefront() 150{ 151 if (callArgMem) 152 delete callArgMem; 153 delete condRegState; 154} 155 156void 157Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) 158{ 159 wfDynId = _wf_dyn_id; 160 basePtr = _base_ptr; 161 status = S_RUNNING; 162} 163 164bool 165Wavefront::isGmInstruction(GPUDynInstPtr ii) 166{ 167 if (ii->isGlobalMem() \|\| ii->isFlat()) 168 return true; 169 170 return false; 171} 172 173bool 174Wavefront::isLmInstruction(GPUDynInstPtr ii) 175{ 176 if (ii->isLocalMem()) { 177 return true; 178 } 179 180 return false; 181} 182 183bool 184Wavefront::isOldestInstALU() 185{ 186 assert(!instructionBuffer.empty()); 187 GPUDynInstPtr ii = instructionBuffer.front(); 188 189 if (status != S_STOPPED && (ii->isNop() \|\| 190 ii->isReturn() \|\| ii->isBranch() \|\| 191 ii->isALU() \|\| (ii->isKernArgSeg() && ii->isLoad()))) { 192 return true; 193 } 194 195 return false; 196} 197 198bool 199Wavefront::isOldestInstBarrier() 200{ 201 assert(!instructionBuffer.empty()); 202 GPUDynInstPtr ii = instructionBuffer.front(); 203 204 if (status != S_STOPPED && ii->isBarrier()) { 205 return true; 206 } 207 208 return false; 209} 210 211bool 212Wavefront::isOldestInstGMem() 213{ 214 assert(!instructionBuffer.empty()); 215 GPUDynInstPtr ii = instructionBuffer.front(); 216 217 if (status != S_STOPPED && ii->isGlobalMem()) { 218 return true; 219 } 220 221 return false; 222} 223 224bool 225Wavefront::isOldestInstLMem() 226{ 227 assert(!instructionBuffer.empty()); 228 GPUDynInstPtr ii = instructionBuffer.front(); 229 230 if (status != S_STOPPED && ii->isLocalMem()) { 231 return true; 232 } 233 234 return false; 235} 236 237bool 238Wavefront::isOldestInstPrivMem() 239{ 240 assert(!instructionBuffer.empty()); 241 GPUDynInstPtr ii = instructionBuffer.front(); 242 243 if (status != S_STOPPED && ii->isPrivateSeg()) { 244 return true; 245 } 246 247 return false; 248} 249 250bool 251Wavefront::isOldestInstFlatMem() 252{ 253 assert(!instructionBuffer.empty()); 254 GPUDynInstPtr ii = instructionBuffer.front(); 255 256 if (status != S_STOPPED && ii->isFlat()) { 257 return true; 258 } 259 260 return false; 261} 262 263// Return true if the Wavefront's instruction 264// buffer has branch instruction. 265bool 266Wavefront::instructionBufferHasBranch() 267{ 268 for (auto it : instructionBuffer) { 269 GPUDynInstPtr ii = it; 270 271 if (ii->isReturn() \|\| ii->isBranch()) { 272 return true; 273 } 274 } 275 276 return false; 277} 278 279// Remap HSAIL register to physical VGPR. 280// HSAIL register = virtual register assigned to an operand by HLC compiler 281uint32_t 282Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) 283{ 284 assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); 285 // add the offset from where the VGPRs of the wavefront have been assigned 286 uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; 287 // HSAIL double precision (DP) register: calculate the physical VGPR index 288 // assuming that DP registers are placed after SP ones in the VRF. The DP 289 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust 290 // the DP VGPR index before mapping it to the physical VRF address space 291 if (mode == 1 && size > 4) { 292 physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); 293 } 294 295 assert((startVgprIndex <= physicalVgprIndex) && 296 (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); 297 298 // calculate absolute physical VGPR index 299 return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); 300} 301 302// Return true if this wavefront is ready 303// to execute an instruction of the specified type. 304int 305Wavefront::ready(itype_e type) 306{ 307 // Check to make sure wave is running 308 if (status == S_STOPPED \|\| status == S_RETURNING \|\| 309 instructionBuffer.empty()) { 310 return 0; 311 } 312 313 // Is the wave waiting at a barrier 314 if (stalledAtBarrier) { 315 if (!computeUnit->AllAtBarrier(barrierId,barrierCnt, 316 computeUnit->getRefCounter(dispatchId, wgId))) { 317 // Are all threads at barrier? 318 return 0; 319 } 320 oldBarrierCnt = barrierCnt; 321 stalledAtBarrier = false; 322 } 323 324 // Read instruction 325 GPUDynInstPtr ii = instructionBuffer.front(); 326 327 bool ready_inst M5_VAR_USED = false; 328 bool glbMemBusRdy = false; 329 bool glbMemIssueRdy = false; 330 if (type == I_GLOBAL \|\| type == I_FLAT \|\| type == I_PRIVATE) { 331 for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { 332 if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) 333 glbMemBusRdy = true; 334 if (computeUnit->wfWait[j].prerdy()) 335 glbMemIssueRdy = true; 336 } 337 } 338 bool locMemBusRdy = false; 339 bool locMemIssueRdy = false; 340 if (type == I_SHARED \|\| type == I_FLAT) { 341 for (int j=0; j < computeUnit->numLocMemUnits; ++j) { 342 if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) 343 locMemBusRdy = true; 344 if (computeUnit->wfWait[j].prerdy()) 345 locMemIssueRdy = true; 346 } 347 } 348 349 // The following code is very error prone and the entire process for 350 // checking readiness will be fixed eventually. In the meantime, let's 351 // make sure that we do not silently let an instruction type slip 352 // through this logic and always return not ready. 353 if (!(ii->isBarrier() \|\| ii->isNop() \|\| ii->isReturn() \|\| ii->isBranch() \|\| 354 ii->isALU() \|\| ii->isLoad() \|\| ii->isStore() \|\| ii->isAtomic() \|\| 355 ii->isMemFence() \|\| ii->isFlat())) { 356 panic("next instruction: %s is of unknown type\n", ii->disassemble()); 357 } 358 359 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", 360 computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); 361 362 if (type == I_ALU && ii->isBarrier()) { 363 // Here for ALU instruction (barrier) 364 if (!computeUnit->wfWait[simdId].prerdy()) { 365 // Is wave slot free? 366 return 0; 367 } 368 369 // Are there in pipe or outstanding memory requests? 370 if ((outstandingReqs + memReqsInPipe) > 0) { 371 return 0; 372 } 373 374 ready_inst = true; 375 } else if (type == I_ALU && ii->isNop()) { 376 // Here for ALU instruction (nop) 377 if (!computeUnit->wfWait[simdId].prerdy()) { 378 // Is wave slot free? 379 return 0; 380 } 381 382 ready_inst = true; 383 } else if (type == I_ALU && ii->isReturn()) { 384 // Here for ALU instruction (return) 385 if (!computeUnit->wfWait[simdId].prerdy()) { 386 // Is wave slot free? 387 return 0; 388 } 389 390 // Are there in pipe or outstanding memory requests? 391 if ((outstandingReqs + memReqsInPipe) > 0) { 392 return 0; 393 } 394 395 ready_inst = true; 396 } else if (type == I_ALU && (ii->isBranch() \|\| 397 ii->isALU() \|\| 398 (ii->isKernArgSeg() && ii->isLoad()) \|\| 399 ii->isArgSeg())) { 400 // Here for ALU instruction (all others) 401 if (!computeUnit->wfWait[simdId].prerdy()) { 402 // Is alu slot free? 403 return 0; 404 } 405 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 406 VrfAccessType::RD_WR)) { 407 return 0; 408 } 409 410 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 411 return 0; 412 } 413 ready_inst = true; 414 } else if (type == I_GLOBAL && ii->isGlobalMem()) { 415 // Here Global memory instruction 416 if (ii->isLoad() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 417 // Are there in pipe or outstanding global memory write requests? 418 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { 419 return 0; 420 } 421 } 422 423 if (ii->isStore() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 424 // Are there in pipe or outstanding global memory read requests? 425 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) 426 return 0; 427 } 428 429 if (!glbMemIssueRdy) { 430 // Is WV issue slot free? 431 return 0; 432 } 433 434 if (!glbMemBusRdy) { 435 // Is there an available VRF->Global memory read bus? 436 return 0; 437 } 438 439 if (!computeUnit->globalMemoryPipe. 440 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { 441 // Can we insert a new request to the Global Mem Request FIFO? 442 return 0; 443 } 444 // can we schedule source & destination operands on the VRF? 445 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 446 VrfAccessType::RD_WR)) { 447 return 0; 448 } 449 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 450 return 0; 451 } 452 ready_inst = true; 453 } else if (type == I_SHARED && ii->isLocalMem()) { 454 // Here for Shared memory instruction 455 if (ii->isLoad() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 456 if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { 457 return 0; 458 } 459 } 460 461 if (ii->isStore() \|\| ii->isAtomic() \|\| ii->isMemFence()) { 462 if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { 463 return 0; 464 } 465 } 466 467 if (!locMemBusRdy) { 468 // Is there an available VRF->LDS read bus? 469 return 0; 470 } 471 if (!locMemIssueRdy) { 472 // Is wave slot free? 473 return 0; 474 } 475 476 if (!computeUnit->localMemoryPipe. 477 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { 478 // Can we insert a new request to the LDS Request FIFO? 479 return 0; 480 } 481 // can we schedule source & destination operands on the VRF? 482 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 483 VrfAccessType::RD_WR)) { 484 return 0; 485 } 486 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 487 return 0; 488 } 489 ready_inst = true; 490 } else if (type == I_FLAT && ii->isFlat()) { 491 if (!glbMemBusRdy) { 492 // Is there an available VRF->Global memory read bus? 493 return 0; 494 } 495 496 if (!locMemBusRdy) { 497 // Is there an available VRF->LDS read bus? 498 return 0; 499 } 500 501 if (!glbMemIssueRdy) { 502 // Is wave slot free? 503 return 0; 504 } 505 506 if (!locMemIssueRdy) { 507 return 0; 508 } 509 if (!computeUnit->globalMemoryPipe. 510 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { 511 // Can we insert a new request to the Global Mem Request FIFO? 512 return 0; 513 } 514 515 if (!computeUnit->localMemoryPipe. 516 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) { 517 // Can we insert a new request to the LDS Request FIFO? 518 return 0; 519 } 520 // can we schedule source & destination operands on the VRF? 521 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 522 VrfAccessType::RD_WR)) { 523 return 0; 524 } 525 // are all the operands ready? (RAW, WAW and WAR depedencies met?) 526 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 527 return 0; 528 } 529 ready_inst = true; 530 } else { 531 return 0; 532 } 533 534 assert(ready_inst); 535 536 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, 537 simdId, wfSlotId, ii->disassemble()); 538 return 1; 539} 540 541void 542Wavefront::updateResources() 543{ 544 // Get current instruction 545 GPUDynInstPtr ii = instructionBuffer.front(); 546 assert(ii); 547 computeUnit->vrf[simdId]->updateResources(this, ii); 548 // Single precision ALU or Branch or Return or Special instruction 549 if (ii->isALU() \|\| ii->isSpecialOp() \|\| 550 ii->isBranch() \|\| 551 // FIXME: Kernel argument loads are currently treated as ALU operations 552 // since we don't send memory packets at execution. If we fix that then 553 // we should map them to one of the memory pipelines 554 (ii->isKernArgSeg() && ii->isLoad()) \|\| ii->isArgSeg() \|\| 555 ii->isReturn()) { 556 computeUnit->aluPipe[simdId].preset(computeUnit->shader-> 557 ticks(computeUnit->spBypassLength())); 558 // this is to enforce a fixed number of cycles per issue slot per SIMD 559 computeUnit->wfWait[simdId].preset(computeUnit->shader-> 560 ticks(computeUnit->issuePeriod)); 561 } else if (ii->isBarrier()) { 562 computeUnit->wfWait[simdId].preset(computeUnit->shader-> 563 ticks(computeUnit->issuePeriod)); 564 } else if (ii->isLoad() && ii->isFlat()) { 565 assert(Enums::SC_NONE != ii->executedAs()); 566 memReqsInPipe++; 567 rdGmReqsInPipe++; 568 if ( Enums::SC_SHARED == ii->executedAs() ) { 569 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 570 preset(computeUnit->shader->ticks(4)); 571 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 572 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 573 } else { 574 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 575 preset(computeUnit->shader->ticks(4)); 576 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 577 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 578 } 579 } else if (ii->isStore() && ii->isFlat()) { 580 assert(Enums::SC_NONE != ii->executedAs()); 581 memReqsInPipe++; 582 wrGmReqsInPipe++; 583 if (Enums::SC_SHARED == ii->executedAs()) { 584 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 585 preset(computeUnit->shader->ticks(8)); 586 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 587 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 588 } else { 589 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 590 preset(computeUnit->shader->ticks(8)); 591 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 592 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 593 } 594 } else if (ii->isLoad() && ii->isGlobalMem()) { 595 memReqsInPipe++; 596 rdGmReqsInPipe++; 597 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 598 preset(computeUnit->shader->ticks(4)); 599 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 600 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 601 } else if (ii->isStore() && ii->isGlobalMem()) { 602 memReqsInPipe++; 603 wrGmReqsInPipe++; 604 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 605 preset(computeUnit->shader->ticks(8)); 606 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 607 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 608 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isGlobalMem()) { 609 memReqsInPipe++; 610 wrGmReqsInPipe++; 611 rdGmReqsInPipe++; 612 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 613 preset(computeUnit->shader->ticks(8)); 614 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 615 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 616 } else if (ii->isLoad() && ii->isLocalMem()) { 617 memReqsInPipe++; 618 rdLmReqsInPipe++; 619 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 620 preset(computeUnit->shader->ticks(4)); 621 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 622 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 623 } else if (ii->isStore() && ii->isLocalMem()) { 624 memReqsInPipe++; 625 wrLmReqsInPipe++; 626 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 627 preset(computeUnit->shader->ticks(8)); 628 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 629 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 630 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isLocalMem()) { 631 memReqsInPipe++; 632 wrLmReqsInPipe++; 633 rdLmReqsInPipe++; 634 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 635 preset(computeUnit->shader->ticks(8)); 636 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 637 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 638 } 639} 640 641void 642Wavefront::exec() 643{ 644 // ---- Exit if wavefront is inactive ----------------------------- // 645 646 if (status == S_STOPPED \|\| status == S_RETURNING \|\| 647 instructionBuffer.empty()) { 648 return; 649 } 650 651 // Get current instruction 652 653 GPUDynInstPtr ii = instructionBuffer.front(); 654 655 const uint32_t old_pc = pc(); 656 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " 657 "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, 658 ii->disassemble(), old_pc);
	659 660 // update the instruction stats in the CU 661
659 ii->execute(ii);	662 ii->execute(ii);
	663 computeUnit->updateInstStats(ii);
660 // access the VRF 661 computeUnit->vrf[simdId]->exec(ii, this); 662 srcRegOpDist.sample(ii->numSrcRegOperands()); 663 dstRegOpDist.sample(ii->numDstRegOperands()); 664 computeUnit->numInstrExecuted++; 665 computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - 666 computeUnit->lastExecCycle[simdId]); 667 computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); 668 if (pc() == old_pc) { 669 uint32_t new_pc = old_pc + 1; 670 // PC not modified by instruction, proceed to next or pop frame 671 pc(new_pc); 672 if (new_pc == rpc()) { 673 popFromReconvergenceStack(); 674 discardFetch(); 675 } else { 676 instructionBuffer.pop_front(); 677 } 678 } else { 679 discardFetch(); 680 } 681 682 if (computeUnit->shader->hsail_mode==Shader::SIMT) { 683 const int num_active_lanes = execMask().count(); 684 computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); 685 computeUnit->numVecOpsExecuted += num_active_lanes; 686 if (isGmInstruction(ii)) { 687 computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); 688 } else if (isLmInstruction(ii)) { 689 computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); 690 } 691 } 692 693 // ---- Update Vector ALU pipeline and other resources ------------------ // 694 // Single precision ALU or Branch or Return or Special instruction 695 if (ii->isALU() \|\| ii->isSpecialOp() \|\| 696 ii->isBranch() \|\| 697 // FIXME: Kernel argument loads are currently treated as ALU operations 698 // since we don't send memory packets at execution. If we fix that then 699 // we should map them to one of the memory pipelines 700 (ii->isKernArgSeg() && ii->isLoad()) \|\| 701 ii->isArgSeg() \|\| 702 ii->isReturn()) { 703 computeUnit->aluPipe[simdId].set(computeUnit->shader-> 704 ticks(computeUnit->spBypassLength())); 705 706 // this is to enforce a fixed number of cycles per issue slot per SIMD 707 computeUnit->wfWait[simdId].set(computeUnit->shader-> 708 ticks(computeUnit->issuePeriod)); 709 } else if (ii->isBarrier()) { 710 computeUnit->wfWait[simdId].set(computeUnit->shader-> 711 ticks(computeUnit->issuePeriod)); 712 } else if (ii->isLoad() && ii->isFlat()) { 713 assert(Enums::SC_NONE != ii->executedAs()); 714 715 if (Enums::SC_SHARED == ii->executedAs()) { 716 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 717 set(computeUnit->shader->ticks(4)); 718 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 719 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 720 } else { 721 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 722 set(computeUnit->shader->ticks(4)); 723 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 724 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 725 } 726 } else if (ii->isStore() && ii->isFlat()) { 727 assert(Enums::SC_NONE != ii->executedAs()); 728 if (Enums::SC_SHARED == ii->executedAs()) { 729 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 730 set(computeUnit->shader->ticks(8)); 731 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 732 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 733 } else { 734 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 735 set(computeUnit->shader->ticks(8)); 736 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 737 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 738 } 739 } else if (ii->isLoad() && ii->isGlobalMem()) { 740 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 741 set(computeUnit->shader->ticks(4)); 742 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 743 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 744 } else if (ii->isStore() && ii->isGlobalMem()) { 745 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 746 set(computeUnit->shader->ticks(8)); 747 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 748 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 749 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isGlobalMem()) { 750 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 751 set(computeUnit->shader->ticks(8)); 752 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 753 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 754 } else if (ii->isLoad() && ii->isLocalMem()) { 755 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 756 set(computeUnit->shader->ticks(4)); 757 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 758 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 759 } else if (ii->isStore() && ii->isLocalMem()) { 760 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 761 set(computeUnit->shader->ticks(8)); 762 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 763 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 764 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isLocalMem()) { 765 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 766 set(computeUnit->shader->ticks(8)); 767 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 768 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 769 } 770} 771 772bool 773Wavefront::waitingAtBarrier(int lane) 774{ 775 return barCnt[lane] < maxBarCnt; 776} 777 778void 779Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 780 const VectorMask& mask) 781{ 782 assert(mask.count()); 783 reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask}); 784} 785 786void 787Wavefront::popFromReconvergenceStack() 788{ 789 assert(!reconvergenceStack.empty()); 790 791 DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", 792 computeUnit->cu_id, simdId, wfSlotId, wfDynId, 793 execMask().to_string<char, std::string::traits_type, 794 std::string::allocator_type>().c_str(), pc()); 795 796 reconvergenceStack.pop_back(); 797 798 DPRINTF(WavefrontStack, "%3i %s\n", pc(), 799 execMask().to_string<char, std::string::traits_type, 800 std::string::allocator_type>().c_str()); 801 802} 803 804void 805Wavefront::discardFetch() 806{ 807 instructionBuffer.clear(); 808 dropFetch \|=pendingFetch; 809} 810 811uint32_t 812Wavefront::pc() const 813{ 814 return reconvergenceStack.back()->pc; 815} 816 817uint32_t 818Wavefront::rpc() const 819{ 820 return reconvergenceStack.back()->rpc; 821} 822 823VectorMask 824Wavefront::execMask() const 825{ 826 return reconvergenceStack.back()->execMask; 827} 828 829bool 830Wavefront::execMask(int lane) const 831{ 832 return reconvergenceStack.back()->execMask[lane]; 833} 834 835 836void 837Wavefront::pc(uint32_t new_pc) 838{ 839 reconvergenceStack.back()->pc = new_pc; 840} 841 842uint32_t 843Wavefront::getStaticContextSize() const 844{ 845 return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) + 846 sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) + 847 sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) + 848 sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) + 849 computeUnit->wfSize() * sizeof(ReconvergenceStackEntry); 850} 851 852void 853Wavefront::getContext(const void out) 854{ 855* uint8_t iter = (uint8_t )out; 856 for (int i = 0; i < barCnt.size(); i++) { 857 (int )iter = barCnt[i]; iter += sizeof(barCnt[i]); 858 } 859 (int )iter = wfId; iter += sizeof(wfId); 860 (int )iter = maxBarCnt; iter += sizeof(maxBarCnt); 861 (int )iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt); 862 (int )iter = barrierCnt; iter += sizeof(barrierCnt); 863 (int )iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id); 864 (uint32_t )iter = wgId; iter += sizeof(wgId); 865 (uint32_t )iter = barrierId; iter += sizeof(barrierId); 866 (uint64_t )iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong()); 867 (Addr )iter = privBase; iter += sizeof(privBase); 868 (Addr )iter = spillBase; iter += sizeof(spillBase); 869 870 int stackSize = reconvergenceStack.size(); 871 ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(), 872 std::numeric_limits<uint32_t>::max(), 873 std::numeric_limits<uint64_t>::max()}; 874 for (int i = 0; i < workItemId[0].size(); i++) { 875 if (i < stackSize) { 876 (ReconvergenceStackEntry )iter = reconvergenceStack.back(); 877* iter += sizeof(ReconvergenceStackEntry); 878 reconvergenceStack.pop_back(); 879 } else { 880 (ReconvergenceStackEntry )iter = empty; 881 iter += sizeof(ReconvergenceStackEntry); 882 } 883 } 884 885 int wf_size = computeUnit->wfSize(); 886 for (int i = 0; i < maxSpVgprs; i++) { 887 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); 888 for (int lane = 0; lane < wf_size; lane++) { 889 uint32_t regVal = computeUnit->vrf[simdId]-> 890 read<uint32_t>(vgprIdx,lane); 891 (uint32_t )iter = regVal; iter += sizeof(regVal); 892 } 893 } 894 895 for (int i = 0; i < maxDpVgprs; i++) { 896 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); 897 for (int lane = 0; lane < wf_size; lane++) { 898 uint64_t regVal = computeUnit->vrf[simdId]-> 899 read<uint64_t>(vgprIdx,lane); 900 (uint64_t )iter = regVal; iter += sizeof(regVal); 901 } 902 } 903 904 for (int i = 0; i < condRegState->numRegs(); i++) { 905 for (int lane = 0; lane < wf_size; lane++) { 906 uint64_t regVal = condRegState->read<uint64_t>(i, lane); 907 (uint64_t )iter = regVal; iter += sizeof(regVal); 908 } 909 } 910 911 /* saving LDS content / 912* if (ldsChunk) 913 for (int i = 0; i < ldsChunk->size(); i++) { 914 char val = ldsChunk->read<char>(i); 915 (char ) iter = val; iter += sizeof(val); 916 } 917} 918 919void 920Wavefront::setContext(const void in) 921{ 922* uint8_t iter = (uint8_t )in; 923 for (int i = 0; i < barCnt.size(); i++) { 924 barCnt[i] = (int )iter; iter += sizeof(barCnt[i]); 925 } 926 wfId = (int )iter; iter += sizeof(wfId); 927 maxBarCnt = (int )iter; iter += sizeof(maxBarCnt); 928 oldBarrierCnt = (int )iter; iter += sizeof(oldBarrierCnt); 929 barrierCnt = (int )iter; iter += sizeof(barrierCnt); 930 computeUnit->cu_id = (int )iter; iter += sizeof(computeUnit->cu_id); 931 wgId = (uint32_t )iter; iter += sizeof(wgId); 932 barrierId = (uint32_t )iter; iter += sizeof(barrierId); 933 initMask = VectorMask((uint64_t )iter); iter += sizeof(initMask); 934 privBase = (Addr )iter; iter += sizeof(privBase); 935 spillBase = (Addr )iter; iter += sizeof(spillBase); 936 937 for (int i = 0; i < workItemId[0].size(); i++) { 938 ReconvergenceStackEntry newEntry = (ReconvergenceStackEntry )iter; 939 iter += sizeof(ReconvergenceStackEntry); 940 if (newEntry.pc != std::numeric_limits<uint32_t>::max()) { 941 pushToReconvergenceStack(newEntry.pc, newEntry.rpc, 942 newEntry.execMask); 943 } 944 } 945 int wf_size = computeUnit->wfSize(); 946 947 for (int i = 0; i < maxSpVgprs; i++) { 948 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); 949 for (int lane = 0; lane < wf_size; lane++) { 950 uint32_t regVal = (uint32_t )iter; iter += sizeof(regVal); 951 computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane); 952 } 953 } 954 955 for (int i = 0; i < maxDpVgprs; i++) { 956 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); 957 for (int lane = 0; lane < wf_size; lane++) { 958 uint64_t regVal = (uint64_t )iter; iter += sizeof(regVal); 959 computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane); 960 } 961 } 962 963 for (int i = 0; i < condRegState->numRegs(); i++) { 964 for (int lane = 0; lane < wf_size; lane++) { 965 uint64_t regVal = (uint64_t )iter; iter += sizeof(regVal); 966 condRegState->write<uint64_t>(i, lane, regVal); 967 } 968 } 969 /** Restoring LDS contents / 970* if (ldsChunk) 971 for (int i = 0; i < ldsChunk->size(); i++) { 972 char val = (char ) iter; iter += sizeof(val); 973 ldsChunk->write<char>(i, val); 974 } 975} 976 977void 978Wavefront::computeActualWgSz(NDRange ndr) 979{ 980* actualWgSzTotal = 1; 981 for (int d = 0; d < 3; ++d) { 982 actualWgSz[d] = std::min(workGroupSz[d], 983 gridSz[d] - ndr->wgId[d] * workGroupSz[d]); 984 actualWgSzTotal = actualWgSz[d]; 985* } 986}	664 // access the VRF 665 computeUnit->vrf[simdId]->exec(ii, this); 666 srcRegOpDist.sample(ii->numSrcRegOperands()); 667 dstRegOpDist.sample(ii->numDstRegOperands()); 668 computeUnit->numInstrExecuted++; 669 computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - 670 computeUnit->lastExecCycle[simdId]); 671 computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); 672 if (pc() == old_pc) { 673 uint32_t new_pc = old_pc + 1; 674 // PC not modified by instruction, proceed to next or pop frame 675 pc(new_pc); 676 if (new_pc == rpc()) { 677 popFromReconvergenceStack(); 678 discardFetch(); 679 } else { 680 instructionBuffer.pop_front(); 681 } 682 } else { 683 discardFetch(); 684 } 685 686 if (computeUnit->shader->hsail_mode==Shader::SIMT) { 687 const int num_active_lanes = execMask().count(); 688 computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); 689 computeUnit->numVecOpsExecuted += num_active_lanes; 690 if (isGmInstruction(ii)) { 691 computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); 692 } else if (isLmInstruction(ii)) { 693 computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); 694 } 695 } 696 697 // ---- Update Vector ALU pipeline and other resources ------------------ // 698 // Single precision ALU or Branch or Return or Special instruction 699 if (ii->isALU() \|\| ii->isSpecialOp() \|\| 700 ii->isBranch() \|\| 701 // FIXME: Kernel argument loads are currently treated as ALU operations 702 // since we don't send memory packets at execution. If we fix that then 703 // we should map them to one of the memory pipelines 704 (ii->isKernArgSeg() && ii->isLoad()) \|\| 705 ii->isArgSeg() \|\| 706 ii->isReturn()) { 707 computeUnit->aluPipe[simdId].set(computeUnit->shader-> 708 ticks(computeUnit->spBypassLength())); 709 710 // this is to enforce a fixed number of cycles per issue slot per SIMD 711 computeUnit->wfWait[simdId].set(computeUnit->shader-> 712 ticks(computeUnit->issuePeriod)); 713 } else if (ii->isBarrier()) { 714 computeUnit->wfWait[simdId].set(computeUnit->shader-> 715 ticks(computeUnit->issuePeriod)); 716 } else if (ii->isLoad() && ii->isFlat()) { 717 assert(Enums::SC_NONE != ii->executedAs()); 718 719 if (Enums::SC_SHARED == ii->executedAs()) { 720 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 721 set(computeUnit->shader->ticks(4)); 722 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 723 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 724 } else { 725 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 726 set(computeUnit->shader->ticks(4)); 727 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 728 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 729 } 730 } else if (ii->isStore() && ii->isFlat()) { 731 assert(Enums::SC_NONE != ii->executedAs()); 732 if (Enums::SC_SHARED == ii->executedAs()) { 733 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 734 set(computeUnit->shader->ticks(8)); 735 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 736 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 737 } else { 738 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 739 set(computeUnit->shader->ticks(8)); 740 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 741 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 742 } 743 } else if (ii->isLoad() && ii->isGlobalMem()) { 744 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 745 set(computeUnit->shader->ticks(4)); 746 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 747 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 748 } else if (ii->isStore() && ii->isGlobalMem()) { 749 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 750 set(computeUnit->shader->ticks(8)); 751 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 752 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 753 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isGlobalMem()) { 754 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 755 set(computeUnit->shader->ticks(8)); 756 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 757 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 758 } else if (ii->isLoad() && ii->isLocalMem()) { 759 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 760 set(computeUnit->shader->ticks(4)); 761 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 762 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 763 } else if (ii->isStore() && ii->isLocalMem()) { 764 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 765 set(computeUnit->shader->ticks(8)); 766 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 767 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 768 } else if ((ii->isAtomic() \|\| ii->isMemFence()) && ii->isLocalMem()) { 769 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 770 set(computeUnit->shader->ticks(8)); 771 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 772 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 773 } 774} 775 776bool 777Wavefront::waitingAtBarrier(int lane) 778{ 779 return barCnt[lane] < maxBarCnt; 780} 781 782void 783Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 784 const VectorMask& mask) 785{ 786 assert(mask.count()); 787 reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask}); 788} 789 790void 791Wavefront::popFromReconvergenceStack() 792{ 793 assert(!reconvergenceStack.empty()); 794 795 DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", 796 computeUnit->cu_id, simdId, wfSlotId, wfDynId, 797 execMask().to_string<char, std::string::traits_type, 798 std::string::allocator_type>().c_str(), pc()); 799 800 reconvergenceStack.pop_back(); 801 802 DPRINTF(WavefrontStack, "%3i %s\n", pc(), 803 execMask().to_string<char, std::string::traits_type, 804 std::string::allocator_type>().c_str()); 805 806} 807 808void 809Wavefront::discardFetch() 810{ 811 instructionBuffer.clear(); 812 dropFetch \|=pendingFetch; 813} 814 815uint32_t 816Wavefront::pc() const 817{ 818 return reconvergenceStack.back()->pc; 819} 820 821uint32_t 822Wavefront::rpc() const 823{ 824 return reconvergenceStack.back()->rpc; 825} 826 827VectorMask 828Wavefront::execMask() const 829{ 830 return reconvergenceStack.back()->execMask; 831} 832 833bool 834Wavefront::execMask(int lane) const 835{ 836 return reconvergenceStack.back()->execMask[lane]; 837} 838 839 840void 841Wavefront::pc(uint32_t new_pc) 842{ 843 reconvergenceStack.back()->pc = new_pc; 844} 845 846uint32_t 847Wavefront::getStaticContextSize() const 848{ 849 return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) + 850 sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) + 851 sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) + 852 sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) + 853 computeUnit->wfSize() * sizeof(ReconvergenceStackEntry); 854} 855 856void 857Wavefront::getContext(const void out) 858{ 859* uint8_t iter = (uint8_t )out; 860 for (int i = 0; i < barCnt.size(); i++) { 861 (int )iter = barCnt[i]; iter += sizeof(barCnt[i]); 862 } 863 (int )iter = wfId; iter += sizeof(wfId); 864 (int )iter = maxBarCnt; iter += sizeof(maxBarCnt); 865 (int )iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt); 866 (int )iter = barrierCnt; iter += sizeof(barrierCnt); 867 (int )iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id); 868 (uint32_t )iter = wgId; iter += sizeof(wgId); 869 (uint32_t )iter = barrierId; iter += sizeof(barrierId); 870 (uint64_t )iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong()); 871 (Addr )iter = privBase; iter += sizeof(privBase); 872 (Addr )iter = spillBase; iter += sizeof(spillBase); 873 874 int stackSize = reconvergenceStack.size(); 875 ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(), 876 std::numeric_limits<uint32_t>::max(), 877 std::numeric_limits<uint64_t>::max()}; 878 for (int i = 0; i < workItemId[0].size(); i++) { 879 if (i < stackSize) { 880 (ReconvergenceStackEntry )iter = reconvergenceStack.back(); 881* iter += sizeof(ReconvergenceStackEntry); 882 reconvergenceStack.pop_back(); 883 } else { 884 (ReconvergenceStackEntry )iter = empty; 885 iter += sizeof(ReconvergenceStackEntry); 886 } 887 } 888 889 int wf_size = computeUnit->wfSize(); 890 for (int i = 0; i < maxSpVgprs; i++) { 891 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); 892 for (int lane = 0; lane < wf_size; lane++) { 893 uint32_t regVal = computeUnit->vrf[simdId]-> 894 read<uint32_t>(vgprIdx,lane); 895 (uint32_t )iter = regVal; iter += sizeof(regVal); 896 } 897 } 898 899 for (int i = 0; i < maxDpVgprs; i++) { 900 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); 901 for (int lane = 0; lane < wf_size; lane++) { 902 uint64_t regVal = computeUnit->vrf[simdId]-> 903 read<uint64_t>(vgprIdx,lane); 904 (uint64_t )iter = regVal; iter += sizeof(regVal); 905 } 906 } 907 908 for (int i = 0; i < condRegState->numRegs(); i++) { 909 for (int lane = 0; lane < wf_size; lane++) { 910 uint64_t regVal = condRegState->read<uint64_t>(i, lane); 911 (uint64_t )iter = regVal; iter += sizeof(regVal); 912 } 913 } 914 915 /* saving LDS content / 916* if (ldsChunk) 917 for (int i = 0; i < ldsChunk->size(); i++) { 918 char val = ldsChunk->read<char>(i); 919 (char ) iter = val; iter += sizeof(val); 920 } 921} 922 923void 924Wavefront::setContext(const void in) 925{ 926* uint8_t iter = (uint8_t )in; 927 for (int i = 0; i < barCnt.size(); i++) { 928 barCnt[i] = (int )iter; iter += sizeof(barCnt[i]); 929 } 930 wfId = (int )iter; iter += sizeof(wfId); 931 maxBarCnt = (int )iter; iter += sizeof(maxBarCnt); 932 oldBarrierCnt = (int )iter; iter += sizeof(oldBarrierCnt); 933 barrierCnt = (int )iter; iter += sizeof(barrierCnt); 934 computeUnit->cu_id = (int )iter; iter += sizeof(computeUnit->cu_id); 935 wgId = (uint32_t )iter; iter += sizeof(wgId); 936 barrierId = (uint32_t )iter; iter += sizeof(barrierId); 937 initMask = VectorMask((uint64_t )iter); iter += sizeof(initMask); 938 privBase = (Addr )iter; iter += sizeof(privBase); 939 spillBase = (Addr )iter; iter += sizeof(spillBase); 940 941 for (int i = 0; i < workItemId[0].size(); i++) { 942 ReconvergenceStackEntry newEntry = (ReconvergenceStackEntry )iter; 943 iter += sizeof(ReconvergenceStackEntry); 944 if (newEntry.pc != std::numeric_limits<uint32_t>::max()) { 945 pushToReconvergenceStack(newEntry.pc, newEntry.rpc, 946 newEntry.execMask); 947 } 948 } 949 int wf_size = computeUnit->wfSize(); 950 951 for (int i = 0; i < maxSpVgprs; i++) { 952 uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1); 953 for (int lane = 0; lane < wf_size; lane++) { 954 uint32_t regVal = (uint32_t )iter; iter += sizeof(regVal); 955 computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane); 956 } 957 } 958 959 for (int i = 0; i < maxDpVgprs; i++) { 960 uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1); 961 for (int lane = 0; lane < wf_size; lane++) { 962 uint64_t regVal = (uint64_t )iter; iter += sizeof(regVal); 963 computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane); 964 } 965 } 966 967 for (int i = 0; i < condRegState->numRegs(); i++) { 968 for (int lane = 0; lane < wf_size; lane++) { 969 uint64_t regVal = (uint64_t )iter; iter += sizeof(regVal); 970 condRegState->write<uint64_t>(i, lane, regVal); 971 } 972 } 973 /** Restoring LDS contents / 974* if (ldsChunk) 975 for (int i = 0; i < ldsChunk->size(); i++) { 976 char val = (char ) iter; iter += sizeof(val); 977 ldsChunk->write<char>(i, val); 978 } 979} 980 981void 982Wavefront::computeActualWgSz(NDRange ndr) 983{ 984* actualWgSzTotal = 1; 985 for (int d = 0; d < 3; ++d) { 986 actualWgSz[d] = std::min(workGroupSz[d], 987 gridSz[d] - ndr->wgId[d] * workGroupSz[d]); 988 actualWgSzTotal = actualWgSz[d]; 989* } 990}