// wavefront.cc revision 11523
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
32 * 33 * Author: Lisa Hsu 34 */ 35 36#include "gpu-compute/wavefront.hh" 37 38#include "debug/GPUExec.hh" 39#include "debug/WavefrontStack.hh" 40#include "gpu-compute/code_enums.hh" 41#include "gpu-compute/compute_unit.hh" 42#include "gpu-compute/gpu_dyn_inst.hh" 43#include "gpu-compute/shader.hh" 44#include "gpu-compute/vector_register_file.hh" 45 46Wavefront* 47WavefrontParams::create() 48{ 49 return new Wavefront(this); 50} 51 52Wavefront::Wavefront(const Params *p) 53 : SimObject(p), callArgMem(nullptr) 54{ 55 last_trace = 0; 56 simdId = p->simdId; 57 wfSlotId = p->wf_slot_id; 58 59 status = S_STOPPED; 60 reservedVectorRegs = 0; 61 startVgprIndex = 0; 62 outstanding_reqs = 0; 63 mem_reqs_in_pipe = 0; 64 outstanding_reqs_wr_gm = 0; 65 outstanding_reqs_wr_lm = 0; 66 outstanding_reqs_rd_gm = 0; 67 outstanding_reqs_rd_lm = 0; 68 rd_lm_reqs_in_pipe = 0; 69 rd_gm_reqs_in_pipe = 0; 70 wr_lm_reqs_in_pipe = 0; 71 wr_gm_reqs_in_pipe = 0; 72 73 barrier_cnt = 0; 74 old_barrier_cnt = 0; 75 stalledAtBarrier = false; 76 77 mem_trace_busy = 0; 78 old_vgpr_tcnt = 0xffffffffffffffffll; 79 old_dgpr_tcnt = 0xffffffffffffffffll; 80 81 pendingFetch = false; 82 dropFetch = false; 83 condRegState = new ConditionRegisterState(); 84 maxSpVgprs = 0; 85 maxDpVgprs = 0; 86} 87 88void 89Wavefront::regStats() 90{ 91 SimObject::regStats(); 92 93 srcRegOpDist 94 .init(0, 4, 2) 95 .name(name() + ".src_reg_operand_dist") 96 .desc("number of executed instructions with N source register operands") 97 ; 98 99 dstRegOpDist 100 .init(0, 3, 2) 101 .name(name() + ".dst_reg_operand_dist") 102 .desc("number of executed instructions with N destination register " 103 "operands") 104 ; 105 106 // FIXME: the name of the WF needs to be unique 107 numTimesBlockedDueWAXDependencies 108 .name(name() + ".timesBlockedDueWAXDependencies") 109 .desc("number of times the wf's instructions are blocked due to WAW " 110 "or WAR dependencies") 111 ; 112 113 // FIXME: the name of the WF needs to be unique 114 
numTimesBlockedDueRAWDependencies 115 .name(name() + ".timesBlockedDueRAWDependencies") 116 .desc("number of times the wf's instructions are blocked due to RAW " 117 "dependencies") 118 ; 119 120 // FIXME: the name of the WF needs to be unique 121 numTimesBlockedDueVrfPortAvail 122 .name(name() + ".timesBlockedDueVrfPortAvail") 123 .desc("number of times instructions are blocked due to VRF port " 124 "availability") 125 ; 126} 127 128void 129Wavefront::init() 130{ 131 reservedVectorRegs = 0; 132 startVgprIndex = 0; 133} 134 135void 136Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs) 137{ 138 condRegState->init(num_cregs); 139 maxSpVgprs = num_sregs; 140 maxDpVgprs = num_dregs; 141} 142 143Wavefront::~Wavefront() 144{ 145 if (callArgMem) 146 delete callArgMem; 147} 148 149void 150Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr) 151{ 152 wfDynId = _wfDynId; 153 base_ptr = _base_ptr; 154 status = S_RUNNING; 155} 156 157bool 158Wavefront::isGmInstruction(GPUDynInstPtr ii) 159{ 160 if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || 161 IS_OT_ATOMIC_PM(ii->opType())) { 162 return true; 163 } 164 165 if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || 166 IS_OT_ATOMIC_GM(ii->opType())) { 167 return true; 168 } 169 170 if (IS_OT_FLAT(ii->opType())) { 171 return true; 172 } 173 174 return false; 175} 176 177bool 178Wavefront::isLmInstruction(GPUDynInstPtr ii) 179{ 180 if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || 181 IS_OT_ATOMIC_LM(ii->opType())) { 182 return true; 183 } 184 185 return false; 186} 187 188bool 189Wavefront::isOldestInstALU() 190{ 191 assert(!instructionBuffer.empty()); 192 GPUDynInstPtr ii = instructionBuffer.front(); 193 194 if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || 195 ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || 196 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || 197 ii->opType() == Enums::OT_KERN_READ)) { 198 return 
true; 199 } 200 201 return false; 202} 203 204bool 205Wavefront::isOldestInstBarrier() 206{ 207 assert(!instructionBuffer.empty()); 208 GPUDynInstPtr ii = instructionBuffer.front(); 209 210 if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { 211 return true; 212 } 213 214 return false; 215} 216 217bool 218Wavefront::isOldestInstGMem() 219{ 220 assert(!instructionBuffer.empty()); 221 GPUDynInstPtr ii = instructionBuffer.front(); 222 223 if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || 224 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { 225 226 return true; 227 } 228 229 return false; 230} 231 232bool 233Wavefront::isOldestInstLMem() 234{ 235 assert(!instructionBuffer.empty()); 236 GPUDynInstPtr ii = instructionBuffer.front(); 237 238 if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || 239 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { 240 241 return true; 242 } 243 244 return false; 245} 246 247bool 248Wavefront::isOldestInstPrivMem() 249{ 250 assert(!instructionBuffer.empty()); 251 GPUDynInstPtr ii = instructionBuffer.front(); 252 253 if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || 254 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { 255 256 return true; 257 } 258 259 return false; 260} 261 262bool 263Wavefront::isOldestInstFlatMem() 264{ 265 assert(!instructionBuffer.empty()); 266 GPUDynInstPtr ii = instructionBuffer.front(); 267 268 if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { 269 270 return true; 271 } 272 273 return false; 274} 275 276// Return true if the Wavefront's instruction 277// buffer has branch instruction. 278bool 279Wavefront::instructionBufferHasBranch() 280{ 281 for (auto it : instructionBuffer) { 282 GPUDynInstPtr ii = it; 283 284 if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { 285 return true; 286 } 287 } 288 289 return false; 290} 291 292// Remap HSAIL register to physical VGPR. 
293// HSAIL register = virtual register assigned to an operand by HLC compiler 294uint32_t 295Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode) 296{ 297 assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0)); 298 // add the offset from where the VGPRs of the wavefront have been assigned 299 uint32_t physicalVgprIndex = startVgprIndex + vgprIndex; 300 // HSAIL double precision (DP) register: calculate the physical VGPR index 301 // assuming that DP registers are placed after SP ones in the VRF. The DP 302 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust 303 // the DP VGPR index before mapping it to the physical VRF address space 304 if (mode == 1 && size > 4) { 305 physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex); 306 } 307 308 assert((startVgprIndex <= physicalVgprIndex) && 309 (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex); 310 311 // calculate absolute physical VGPR index 312 return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs(); 313} 314 315// Return true if this wavefront is ready 316// to execute an instruction of the specified type. 317int 318Wavefront::ready(itype_e type) 319{ 320 // Check to make sure wave is running 321 if (status == S_STOPPED || status == S_RETURNING || 322 instructionBuffer.empty()) { 323 return 0; 324 } 325 326 // Is the wave waiting at a barrier 327 if (stalledAtBarrier) { 328 if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt, 329 computeUnit->getRefCounter(dispatchid, wg_id))) { 330 // Are all threads at barrier? 
331 return 0; 332 } 333 old_barrier_cnt = barrier_cnt; 334 stalledAtBarrier = false; 335 } 336 337 // Read instruction 338 GPUDynInstPtr ii = instructionBuffer.front(); 339 340 bool ready_inst M5_VAR_USED = false; 341 bool glbMemBusRdy = false; 342 bool glbMemIssueRdy = false; 343 if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) { 344 for (int j=0; j < computeUnit->numGlbMemUnits; ++j) { 345 if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy()) 346 glbMemBusRdy = true; 347 if (computeUnit->wfWait[j].prerdy()) 348 glbMemIssueRdy = true; 349 } 350 } 351 bool locMemBusRdy = false; 352 bool locMemIssueRdy = false; 353 if (type == I_SHARED || type == I_FLAT) { 354 for (int j=0; j < computeUnit->numLocMemUnits; ++j) { 355 if (computeUnit->vrfToLocalMemPipeBus[j].prerdy()) 356 locMemBusRdy = true; 357 if (computeUnit->wfWait[j].prerdy()) 358 locMemIssueRdy = true; 359 } 360 } 361 362 // The following code is very error prone and the entire process for 363 // checking readiness will be fixed eventually. In the meantime, let's 364 // make sure that we do not silently let an instruction type slip 365 // through this logic and always return not ready. 
366 if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || 367 ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || 368 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || 369 ii->opType() == Enums::OT_KERN_READ || 370 ii->opType() == Enums::OT_ARG || 371 IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || 372 IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || 373 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || 374 IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || 375 IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { 376 panic("next instruction: %s is of unknown type\n", ii->disassemble()); 377 } 378 379 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", 380 computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); 381 382 if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { 383 // Here for ALU instruction (barrier) 384 if (!computeUnit->wfWait[simdId].prerdy()) { 385 // Is wave slot free? 386 return 0; 387 } 388 389 // Are there in pipe or outstanding memory requests? 390 if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { 391 return 0; 392 } 393 394 ready_inst = true; 395 } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { 396 // Here for ALU instruction (nop) 397 if (!computeUnit->wfWait[simdId].prerdy()) { 398 // Is wave slot free? 399 return 0; 400 } 401 402 ready_inst = true; 403 } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { 404 // Here for ALU instruction (return) 405 if (!computeUnit->wfWait[simdId].prerdy()) { 406 // Is wave slot free? 407 return 0; 408 } 409 410 // Are there in pipe or outstanding memory requests? 
411 if ((outstanding_reqs + mem_reqs_in_pipe) > 0) { 412 return 0; 413 } 414 415 ready_inst = true; 416 } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || 417 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || 418 ii->opType() == Enums::OT_KERN_READ || 419 ii->opType() == Enums::OT_ARG)) { 420 // Here for ALU instruction (all others) 421 if (!computeUnit->wfWait[simdId].prerdy()) { 422 // Is alu slot free? 423 return 0; 424 } 425 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 426 VrfAccessType::RD_WR)) { 427 return 0; 428 } 429 430 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 431 return 0; 432 } 433 ready_inst = true; 434 } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || 435 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { 436 // Here Global memory instruction 437 if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { 438 // Are there in pipe or outstanding global memory write requests? 439 if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { 440 return 0; 441 } 442 } 443 444 if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || 445 IS_OT_HIST_GM(ii->opType())) { 446 // Are there in pipe or outstanding global memory read requests? 447 if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) 448 return 0; 449 } 450 451 if (!glbMemIssueRdy) { 452 // Is WV issue slot free? 453 return 0; 454 } 455 456 if (!glbMemBusRdy) { 457 // Is there an available VRF->Global memory read bus? 458 return 0; 459 } 460 461 if (!computeUnit->globalMemoryPipe. 462 isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { 463 // Can we insert a new request to the Global Mem Request FIFO? 464 return 0; 465 } 466 // can we schedule source & destination operands on the VRF? 
467 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 468 VrfAccessType::RD_WR)) { 469 return 0; 470 } 471 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 472 return 0; 473 } 474 ready_inst = true; 475 } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || 476 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { 477 // Here for Shared memory instruction 478 if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { 479 if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) { 480 return 0; 481 } 482 } 483 484 if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || 485 IS_OT_HIST_LM(ii->opType())) { 486 if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) { 487 return 0; 488 } 489 } 490 491 if (!locMemBusRdy) { 492 // Is there an available VRF->LDS read bus? 493 return 0; 494 } 495 if (!locMemIssueRdy) { 496 // Is wave slot free? 497 return 0; 498 } 499 500 if (!computeUnit->localMemoryPipe. 501 isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { 502 // Can we insert a new request to the LDS Request FIFO? 503 return 0; 504 } 505 // can we schedule source & destination operands on the VRF? 
506 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 507 VrfAccessType::RD_WR)) { 508 return 0; 509 } 510 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 511 return 0; 512 } 513 ready_inst = true; 514 } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || 515 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { 516 // Here for Private memory instruction ------------------------ // 517 if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { 518 if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) { 519 return 0; 520 } 521 } 522 523 if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || 524 IS_OT_HIST_PM(ii->opType())) { 525 if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) { 526 return 0; 527 } 528 } 529 530 if (!glbMemBusRdy) { 531 // Is there an available VRF->Global memory read bus? 532 return 0; 533 } 534 535 if (!glbMemIssueRdy) { 536 // Is wave slot free? 537 return 0; 538 } 539 540 if (!computeUnit->globalMemoryPipe. 541 isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { 542 // Can we insert a new request to the Global Mem Request FIFO? 543 return 0; 544 } 545 // can we schedule source & destination operands on the VRF? 546 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 547 VrfAccessType::RD_WR)) { 548 return 0; 549 } 550 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 551 return 0; 552 } 553 ready_inst = true; 554 } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { 555 if (!glbMemBusRdy) { 556 // Is there an available VRF->Global memory read bus? 557 return 0; 558 } 559 560 if (!locMemBusRdy) { 561 // Is there an available VRF->LDS read bus? 562 return 0; 563 } 564 565 if (!glbMemIssueRdy) { 566 // Is wave slot free? 567 return 0; 568 } 569 570 if (!locMemIssueRdy) { 571 return 0; 572 } 573 if (!computeUnit->globalMemoryPipe. 
574 isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) { 575 // Can we insert a new request to the Global Mem Request FIFO? 576 return 0; 577 } 578 579 if (!computeUnit->localMemoryPipe. 580 isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) { 581 // Can we insert a new request to the LDS Request FIFO? 582 return 0; 583 } 584 // can we schedule source & destination operands on the VRF? 585 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, 586 VrfAccessType::RD_WR)) { 587 return 0; 588 } 589 // are all the operands ready? (RAW, WAW and WAR depedencies met?) 590 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { 591 return 0; 592 } 593 ready_inst = true; 594 } else { 595 return 0; 596 } 597 598 assert(ready_inst); 599 600 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id, 601 simdId, wfSlotId, ii->disassemble()); 602 return 1; 603} 604 605void 606Wavefront::updateResources() 607{ 608 // Get current instruction 609 GPUDynInstPtr ii = instructionBuffer.front(); 610 assert(ii); 611 computeUnit->vrf[simdId]->updateResources(this, ii); 612 // Single precision ALU or Branch or Return or Special instruction 613 if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || 614 ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || 615 // FIXME: Kernel argument loads are currently treated as ALU operations 616 // since we don't send memory packets at execution. 
If we fix that then 617 // we should map them to one of the memory pipelines 618 ii->opType()==Enums::OT_KERN_READ || 619 ii->opType()==Enums::OT_ARG || 620 ii->opType()==Enums::OT_RET) { 621 computeUnit->aluPipe[simdId].preset(computeUnit->shader-> 622 ticks(computeUnit->spBypassLength())); 623 // this is to enforce a fixed number of cycles per issue slot per SIMD 624 computeUnit->wfWait[simdId].preset(computeUnit->shader-> 625 ticks(computeUnit->issuePeriod)); 626 } else if (ii->opType() == Enums::OT_BARRIER) { 627 computeUnit->wfWait[simdId].preset(computeUnit->shader-> 628 ticks(computeUnit->issuePeriod)); 629 } else if (ii->opType() == Enums::OT_FLAT_READ) { 630 assert(Enums::SC_NONE != ii->executedAs()); 631 mem_reqs_in_pipe++; 632 rd_gm_reqs_in_pipe++; 633 if ( Enums::SC_SHARED == ii->executedAs() ) { 634 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 635 preset(computeUnit->shader->ticks(4)); 636 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 637 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 638 } else { 639 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 640 preset(computeUnit->shader->ticks(4)); 641 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 642 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 643 } 644 } else if (ii->opType() == Enums::OT_FLAT_WRITE) { 645 assert(Enums::SC_NONE != ii->executedAs()); 646 mem_reqs_in_pipe++; 647 wr_gm_reqs_in_pipe++; 648 if (Enums::SC_SHARED == ii->executedAs()) { 649 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 650 preset(computeUnit->shader->ticks(8)); 651 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 652 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 653 } else { 654 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 655 preset(computeUnit->shader->ticks(8)); 656 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
657 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 658 } 659 } else if (IS_OT_READ_GM(ii->opType())) { 660 mem_reqs_in_pipe++; 661 rd_gm_reqs_in_pipe++; 662 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 663 preset(computeUnit->shader->ticks(4)); 664 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 665 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 666 } else if (IS_OT_WRITE_GM(ii->opType())) { 667 mem_reqs_in_pipe++; 668 wr_gm_reqs_in_pipe++; 669 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 670 preset(computeUnit->shader->ticks(8)); 671 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 672 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 673 } else if (IS_OT_ATOMIC_GM(ii->opType())) { 674 mem_reqs_in_pipe++; 675 wr_gm_reqs_in_pipe++; 676 rd_gm_reqs_in_pipe++; 677 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 678 preset(computeUnit->shader->ticks(8)); 679 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 680 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 681 } else if (IS_OT_READ_LM(ii->opType())) { 682 mem_reqs_in_pipe++; 683 rd_lm_reqs_in_pipe++; 684 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 685 preset(computeUnit->shader->ticks(4)); 686 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 687 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 688 } else if (IS_OT_WRITE_LM(ii->opType())) { 689 mem_reqs_in_pipe++; 690 wr_lm_reqs_in_pipe++; 691 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 692 preset(computeUnit->shader->ticks(8)); 693 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 694 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 695 } else if (IS_OT_ATOMIC_LM(ii->opType())) { 696 mem_reqs_in_pipe++; 697 wr_lm_reqs_in_pipe++; 698 rd_lm_reqs_in_pipe++; 699 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 
700 preset(computeUnit->shader->ticks(8)); 701 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 702 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 703 } else if (IS_OT_READ_PM(ii->opType())) { 704 mem_reqs_in_pipe++; 705 rd_gm_reqs_in_pipe++; 706 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 707 preset(computeUnit->shader->ticks(4)); 708 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 709 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 710 } else if (IS_OT_WRITE_PM(ii->opType())) { 711 mem_reqs_in_pipe++; 712 wr_gm_reqs_in_pipe++; 713 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 714 preset(computeUnit->shader->ticks(8)); 715 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 716 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 717 } else if (IS_OT_ATOMIC_PM(ii->opType())) { 718 mem_reqs_in_pipe++; 719 wr_gm_reqs_in_pipe++; 720 rd_gm_reqs_in_pipe++; 721 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 722 preset(computeUnit->shader->ticks(8)); 723 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
724 preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); 725 } 726} 727 728void 729Wavefront::exec() 730{ 731 // ---- Exit if wavefront is inactive ----------------------------- // 732 733 if (status == S_STOPPED || status == S_RETURNING || 734 instructionBuffer.empty()) { 735 return; 736 } 737 738 // Get current instruction 739 740 GPUDynInstPtr ii = instructionBuffer.front(); 741 742 const uint32_t old_pc = pc(); 743 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " 744 "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, 745 ii->disassemble(), old_pc); 746 ii->execute(); 747 // access the VRF 748 computeUnit->vrf[simdId]->exec(ii, this); 749 srcRegOpDist.sample(ii->numSrcRegOperands()); 750 dstRegOpDist.sample(ii->numDstRegOperands()); 751 computeUnit->numInstrExecuted++; 752 computeUnit->execRateDist.sample(computeUnit->totalCycles.value() - 753 computeUnit->lastExecCycle[simdId]); 754 computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value(); 755 if (pc() == old_pc) { 756 uint32_t new_pc = old_pc + 1; 757 // PC not modified by instruction, proceed to next or pop frame 758 pc(new_pc); 759 if (new_pc == rpc()) { 760 popFromReconvergenceStack(); 761 discardFetch(); 762 } else { 763 instructionBuffer.pop_front(); 764 } 765 } 766 767 if (computeUnit->shader->hsail_mode==Shader::SIMT) { 768 const int num_active_lanes = execMask().count(); 769 computeUnit->controlFlowDivergenceDist.sample(num_active_lanes); 770 computeUnit->numVecOpsExecuted += num_active_lanes; 771 if (isGmInstruction(ii)) { 772 computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes); 773 } else if (isLmInstruction(ii)) { 774 computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes); 775 } 776 } 777 778 // ---- Update Vector ALU pipeline and other resources ------------------ // 779 // Single precision ALU or Branch or Return or Special instruction 780 if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || 781 
ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || 782 // FIXME: Kernel argument loads are currently treated as ALU operations 783 // since we don't send memory packets at execution. If we fix that then 784 // we should map them to one of the memory pipelines 785 ii->opType() == Enums::OT_KERN_READ || 786 ii->opType() == Enums::OT_ARG || 787 ii->opType() == Enums::OT_RET) { 788 computeUnit->aluPipe[simdId].set(computeUnit->shader-> 789 ticks(computeUnit->spBypassLength())); 790 791 // this is to enforce a fixed number of cycles per issue slot per SIMD 792 computeUnit->wfWait[simdId].set(computeUnit->shader-> 793 ticks(computeUnit->issuePeriod)); 794 } else if (ii->opType() == Enums::OT_BARRIER) { 795 computeUnit->wfWait[simdId].set(computeUnit->shader-> 796 ticks(computeUnit->issuePeriod)); 797 } else if (ii->opType() == Enums::OT_FLAT_READ) { 798 assert(Enums::SC_NONE != ii->executedAs()); 799 800 if (Enums::SC_SHARED == ii->executedAs()) { 801 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 802 set(computeUnit->shader->ticks(4)); 803 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 804 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 805 } else { 806 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 807 set(computeUnit->shader->ticks(4)); 808 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 809 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 810 } 811 } else if (ii->opType() == Enums::OT_FLAT_WRITE) { 812 assert(Enums::SC_NONE != ii->executedAs()); 813 if (Enums::SC_SHARED == ii->executedAs()) { 814 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 815 set(computeUnit->shader->ticks(8)); 816 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 817 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 818 } else { 819 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 820 set(computeUnit->shader->ticks(8)); 821 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
822 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 823 } 824 } else if (IS_OT_READ_GM(ii->opType())) { 825 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 826 set(computeUnit->shader->ticks(4)); 827 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 828 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 829 } else if (IS_OT_WRITE_GM(ii->opType())) { 830 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 831 set(computeUnit->shader->ticks(8)); 832 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 833 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 834 } else if (IS_OT_ATOMIC_GM(ii->opType())) { 835 computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. 836 set(computeUnit->shader->ticks(8)); 837 computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 838 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 839 } else if (IS_OT_READ_LM(ii->opType())) { 840 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 841 set(computeUnit->shader->ticks(4)); 842 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 843 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 844 } else if (IS_OT_WRITE_LM(ii->opType())) { 845 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 846 set(computeUnit->shader->ticks(8)); 847 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 848 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 849 } else if (IS_OT_ATOMIC_LM(ii->opType())) { 850 computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. 851 set(computeUnit->shader->ticks(8)); 852 computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
853 set(computeUnit->shader->ticks(computeUnit->issuePeriod)); 854 } 855} 856 857bool 858Wavefront::waitingAtBarrier(int lane) 859{ 860 return bar_cnt[lane] < max_bar_cnt; 861} 862 863void 864Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 865 const VectorMask& mask) 866{ 867 assert(mask.count()); 868 reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask)); 869} 870 871void 872Wavefront::popFromReconvergenceStack() 873{ 874 assert(!reconvergenceStack.empty()); 875 876 DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ", 877 computeUnit->cu_id, simdId, wfSlotId, wfDynId, 878 execMask().to_string<char, std::string::traits_type, 879 std::string::allocator_type>().c_str(), pc()); 880 881 reconvergenceStack.pop(); 882 883 DPRINTF(WavefrontStack, "%3i %s\n", pc(), 884 execMask().to_string<char, std::string::traits_type, 885 std::string::allocator_type>().c_str()); 886 887} 888 889void 890Wavefront::discardFetch() 891{ 892 instructionBuffer.clear(); 893 dropFetch |=pendingFetch; 894} 895 896uint32_t 897Wavefront::pc() const 898{ 899 return reconvergenceStack.top()->pc; 900} 901 902uint32_t 903Wavefront::rpc() const 904{ 905 return reconvergenceStack.top()->rpc; 906} 907 908VectorMask 909Wavefront::execMask() const 910{ 911 return reconvergenceStack.top()->execMask; 912} 913 914bool 915Wavefront::execMask(int lane) const 916{ 917 return reconvergenceStack.top()->execMask[lane]; 918} 919 920 921void 922Wavefront::pc(uint32_t new_pc) 923{ 924 reconvergenceStack.top()->pc = new_pc; 925} 926