// wavefront.cc -- gem5 GPU-compute wavefront model (revision 11308)
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: Lisa Hsu 34 */ 35 36#include "gpu-compute/wavefront.hh" 37 38#include "debug/GPUExec.hh" 39#include "debug/WavefrontStack.hh" 40#include "gpu-compute/code_enums.hh" 41#include "gpu-compute/compute_unit.hh" 42#include "gpu-compute/gpu_dyn_inst.hh" 43#include "gpu-compute/shader.hh" 44#include "gpu-compute/vector_register_file.hh" 45 46Wavefront* 47WavefrontParams::create() 48{ 49 return new Wavefront(this); 50} 51 52Wavefront::Wavefront(const Params *p) 53 : SimObject(p), callArgMem(nullptr) 54{ 55 last_trace = 0; 56 simdId = p->simdId; 57 wfSlotId = p->wf_slot_id; 58 59 status = S_STOPPED; 60 reservedVectorRegs = 0; 61 startVgprIndex = 0; 62 outstanding_reqs = 0; 63 mem_reqs_in_pipe = 0; 64 outstanding_reqs_wr_gm = 0; 65 outstanding_reqs_wr_lm = 0; 66 outstanding_reqs_rd_gm = 0; 67 outstanding_reqs_rd_lm = 0; 68 rd_lm_reqs_in_pipe = 0; 69 rd_gm_reqs_in_pipe = 0; 70 wr_lm_reqs_in_pipe = 0; 71 wr_gm_reqs_in_pipe = 0; 72 73 barrier_cnt = 0; 74 old_barrier_cnt = 0; 75 stalledAtBarrier = false; 76 77 mem_trace_busy = 0; 78 old_vgpr_tcnt = 0xffffffffffffffffll; 79 old_dgpr_tcnt = 0xffffffffffffffffll; 80 81 pendingFetch = false; 82 dropFetch = false; 83 condRegState = new ConditionRegisterState(); 84 maxSpVgprs = 0; 85 maxDpVgprs = 0; 86} 87 88void 89Wavefront::regStats() 90{ 91 srcRegOpDist 92 .init(0, 4, 2) 93 .name(name() + ".src_reg_operand_dist") 94 .desc("number of executed instructions with N source register operands") 95 ; 96 97 dstRegOpDist 98 .init(0, 3, 2) 99 .name(name() + ".dst_reg_operand_dist") 100 .desc("number of executed instructions with N destination register " 101 "operands") 102 ; 103 104 // FIXME: the name of the WF needs to be unique 105 numTimesBlockedDueWAXDependencies 106 .name(name() + ".timesBlockedDueWAXDependencies") 107 .desc("number of times the wf's instructions are blocked due to WAW " 108 "or WAR dependencies") 109 ; 110 111 // FIXME: the name of the WF needs to be unique 112 numTimesBlockedDueRAWDependencies 113 
.name(name() + ".timesBlockedDueRAWDependencies") 114 .desc("number of times the wf's instructions are blocked due to RAW " 115 "dependencies") 116 ; 117 118 // FIXME: the name of the WF needs to be unique 119 numTimesBlockedDueVrfPortAvail 120 .name(name() + ".timesBlockedDueVrfPortAvail") 121 .desc("number of times instructions are blocked due to VRF port " 122 "availability") 123 ; 124} 125 126void 127Wavefront::init() 128{ 129 reservedVectorRegs = 0; 130 startVgprIndex = 0; 131} 132 133void 134Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) 135{ 136 condRegState->init(num_cregs); 137 maxSpVgprs = num_sregs; 138 maxDpVgprs = num_dregs; 139} 140 141Wavefront::~Wavefront() 142{ 143 if (callArgMem) 144 delete callArgMem; 145} 146 147void 148Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr) 149{ 150 wfDynId = _wfDynId; 151 base_ptr = _base_ptr; 152 status = S_RUNNING; 153} 154 155bool 156Wavefront::isGmInstruction(GPUDynInstPtr ii) 157{ 158 if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || 159 IS_OT_ATOMIC_PM(ii->opType())) { 160 return true; 161 } 162 163 if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || 164 IS_OT_ATOMIC_GM(ii->opType())) { 165 166 return true; 167 } 168 169 if (IS_OT_FLAT(ii->opType())) { 170 return true; 171 } 172 173 return false; 174} 175 176bool 177Wavefront::isLmInstruction(GPUDynInstPtr ii) 178{ 179 if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || 180 IS_OT_ATOMIC_LM(ii->opType())) { 181 return true; 182 } 183 184 return false; 185} 186 187bool 188Wavefront::isOldestInstALU() 189{ 190 assert(!instructionBuffer.empty()); 191 GPUDynInstPtr ii = instructionBuffer.front(); 192 193 if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || 194 ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || 195 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || 196 ii->opType() == Enums::OT_KERN_READ)) { 197 return true; 198 } 199 200 return false; 
201} 202 203bool 204Wavefront::isOldestInstBarrier() 205{ 206 assert(!instructionBuffer.empty()); 207 GPUDynInstPtr ii = instructionBuffer.front(); 208 209 if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { 210 return true; 211 } 212 213 return false; 214} 215 216bool 217Wavefront::isOldestInstGMem() 218{ 219 assert(!instructionBuffer.empty()); 220 GPUDynInstPtr ii = instructionBuffer.front(); 221 222 if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || 223 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { 224 225 return true; 226 } 227 228 return false; 229} 230 231bool 232Wavefront::isOldestInstLMem() 233{ 234 assert(!instructionBuffer.empty()); 235 GPUDynInstPtr ii = instructionBuffer.front(); 236 237 if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || 238 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { 239 240 return true; 241 } 242 243 return false; 244} 245 246bool 247Wavefront::isOldestInstPrivMem() 248{ 249 assert(!instructionBuffer.empty()); 250 GPUDynInstPtr ii = instructionBuffer.front(); 251 252 if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || 253 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { 254 255 return true; 256 } 257 258 return false; 259} 260 261bool 262Wavefront::isOldestInstFlatMem() 263{ 264 assert(!instructionBuffer.empty()); 265 GPUDynInstPtr ii = instructionBuffer.front(); 266 267 if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { 268 269 return true; 270 } 271 272 return false; 273} 274 275// Return true if the Wavefront's instruction 276// buffer has branch instruction. 277bool 278Wavefront::instructionBufferHasBranch() 279{ 280 for (auto it : instructionBuffer) { 281 GPUDynInstPtr ii = it; 282 283 if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { 284 return true; 285 } 286 } 287 288 return false; 289} 290 291// Remap HSAIL register to physical VGPR. 
// HSAIL register = virtual register assigned to an operand by HLC compiler
// @param vgprIndex  virtual register index within this wavefront
// @param size       operand size in bytes; >4 with mode==1 selects the DP
//                   name space (presumably 8-byte doubles -- TODO confirm)
// @param mode       1 = HSAIL mode with split SP/DP name spaces
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index (VRF is treated as circular)
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return 1 if this wavefront is ready to execute an instruction of the
// specified type, 0 otherwise. "Ready" means: the wave is running, it is
// not stalled at a barrier, the head instruction matches the requested
// issue type, and all required pipeline/bus/FIFO/VRF resources are free.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
                        computeUnit->getRefCounter(dispatchid, wg_id))) {
            // Are all threads at barrier?
            return 0;
        }
        // barrier released: remember the count we passed and resume
        old_barrier_cnt = barrier_cnt;
        stalledAtBarrier = false;
    }

    // Read instruction at the head of the buffer
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;

    // Pre-compute global-memory resource availability: any VRF->GM bus
    // and any wave issue slot in [0, numGlbMemUnits) being free suffices.
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }

    // Same for local (LDS) memory resources.
    // NOTE(review): wfWait is indexed with the same j range as the global
    // check above -- presumably local-mem wave slots share those indices;
    // confirm against ComputeUnit's unit-id layout.
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        // A barrier must not issue until all of this wave's memory ops drain.
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        // A return must also wait for all memory traffic to drain.
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        // are RAW/WAW/WAR dependencies on the operands satisfied?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here Global memory instruction
        // Reads wait on pending writes (and vice versa below) to preserve
        // same-address ordering within the wavefront.
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for Shared memory instruction
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            // reads wait on pending local writes
            if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            // writes wait on pending local reads
            if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for Private memory instruction ------------------------ //
        // Private segment traffic shares the global-memory counters and
        // pipeline resources.
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        // Flat ops may resolve to either segment, so BOTH global and local
        // resources must be available before issue.
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // head instruction does not match the requested issue type
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());

    return 1;
}

// Reserve (preset) pipeline, bus and issue-slot resources for the head
// instruction, and bump the in-pipe request counters for memory ops.
// Mirrors the set() calls performed later in exec().
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType()==Enums::OT_KERN_READ ||
        ii->opType()==Enums::OT_ARG ||
        ii->opType()==Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        // flat op has been resolved to a segment by now
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        // atomics count as both a read and a write
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        // private segment shares the global memory pipeline and counters
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

// Execute the instruction at the head of the buffer: run it, access the
// VRF, update statistics, advance (or pop) the PC, and commit (set) the
// pipeline resources that updateResources() presetted.
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point: pop the divergence frame and
            // discard any fetched-ahead instructions
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
    // NOTE(review): unlike updateResources(), this chain has no
    // IS_OT_*_PM branches, so private-segment ops update no pipeline
    // resources here -- confirm this asymmetry is intentional.
}

// True if the given lane is still waiting for others to reach the barrier.
bool
Wavefront::waitingAtBarrier(int lane)
{
    return bar_cnt[lane] < max_bar_cnt;
}

// Push a new divergence frame (pc, reconvergence pc, active-lane mask)
// onto the reconvergence stack. The mask must have at least one lane set.
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
}

// Pop the top divergence frame, tracing the PC/exec-mask transition.
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}

// Drop all buffered instructions and flag any in-flight fetch for discard.
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |=pendingFetch;
}

// Current PC comes from the top reconvergence-stack frame.
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.top()->pc;
}

// Reconvergence PC of the current (top) frame.
uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.top()->rpc;
}

// Active-lane mask of the current (top) frame.
VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.top()->execMask;
}

// Active bit of a single lane in the current frame.
bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.top()->execMask[lane];
}


// Overwrite the PC of the current (top) frame.
void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.top()->pc = new_pc;
}