// wavefront.cc revision 11345:b6a66a90e0a1
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
 *
 * Author: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/code_enums.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

// Param-object factory hook: the Python config system calls this to
// instantiate a Wavefront SimObject from its generated params.
Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

// Construct an idle wavefront: all request counters zeroed, no barrier
// state, and no vector registers reserved until a work-group is mapped.
Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr)
{
    last_trace = 0;
    // static placement of this wavefront within its compute unit
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;

    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    // outstanding_* track requests issued to memory; *_in_pipe track
    // requests still inside the CU pipelines, split by read/write and
    // by global (gm) vs. local (lm) memory
    outstanding_reqs = 0;
    mem_reqs_in_pipe = 0;
    outstanding_reqs_wr_gm = 0;
    outstanding_reqs_wr_lm = 0;
    outstanding_reqs_rd_gm = 0;
    outstanding_reqs_rd_lm = 0;
    rd_lm_reqs_in_pipe = 0;
    rd_gm_reqs_in_pipe = 0;
    wr_lm_reqs_in_pipe = 0;
    wr_gm_reqs_in_pipe = 0;

    barrier_cnt = 0;
    old_barrier_cnt = 0;
    stalledAtBarrier = false;

    mem_trace_busy = 0;
    // sentinel "never seen" values for the old register timestamp counters
    old_vgpr_tcnt = 0xffffffffffffffffll;
    old_dgpr_tcnt = 0xffffffffffffffffll;

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
}

// Register per-wavefront statistics with the gem5 stats framework.
void
Wavefront::regStats()
{
    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

// SimObject init: reset the VGPR allocation bookkeeping.
void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

// Record the register-file sizes for this wavefront: number of condition
// registers, single-precision VGPRs and double-precision VGPRs.
void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
}

// Activate the wavefront with its dynamic id and kernel code base pointer.
void
Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr)
{
    wfDynId = _wfDynId;
    base_ptr = _base_ptr;
    status = S_RUNNING;
}

// True if the instruction targets global memory: private (PM) and flat
// accesses are routed through the global memory pipeline as well.
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
        IS_OT_ATOMIC_PM(ii->opType())) {
        return true;
    }

    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
        IS_OT_ATOMIC_GM(ii->opType())) {
        return true;
    }

    if (IS_OT_FLAT(ii->opType())) {
        return true;
    }

    return false;
}

// True if the instruction targets local (LDS/shared) memory.
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
        IS_OT_ATOMIC_LM(ii->opType())) {
        return true;
    }

    return false;
}

// True if the oldest (front) buffered instruction is handled by the ALU
// pipeline (nop/ret/branch/alu/ldas/kernel-arg read) and the wave is active.
bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
        ii->opType() == Enums::OT_KERN_READ)) {
        return true;
    }

    return false;
}
201 202bool 203Wavefront::isOldestInstBarrier() 204{ 205 assert(!instructionBuffer.empty()); 206 GPUDynInstPtr ii = instructionBuffer.front(); 207 208 if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { 209 return true; 210 } 211 212 return false; 213} 214 215bool 216Wavefront::isOldestInstGMem() 217{ 218 assert(!instructionBuffer.empty()); 219 GPUDynInstPtr ii = instructionBuffer.front(); 220 221 if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || 222 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { 223 224 return true; 225 } 226 227 return false; 228} 229 230bool 231Wavefront::isOldestInstLMem() 232{ 233 assert(!instructionBuffer.empty()); 234 GPUDynInstPtr ii = instructionBuffer.front(); 235 236 if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || 237 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { 238 239 return true; 240 } 241 242 return false; 243} 244 245bool 246Wavefront::isOldestInstPrivMem() 247{ 248 assert(!instructionBuffer.empty()); 249 GPUDynInstPtr ii = instructionBuffer.front(); 250 251 if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || 252 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { 253 254 return true; 255 } 256 257 return false; 258} 259 260bool 261Wavefront::isOldestInstFlatMem() 262{ 263 assert(!instructionBuffer.empty()); 264 GPUDynInstPtr ii = instructionBuffer.front(); 265 266 if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { 267 268 return true; 269 } 270 271 return false; 272} 273 274// Return true if the Wavefront's instruction 275// buffer has branch instruction. 276bool 277Wavefront::instructionBufferHasBranch() 278{ 279 for (auto it : instructionBuffer) { 280 GPUDynInstPtr ii = it; 281 282 if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { 283 return true; 284 } 285 } 286 287 return false; 288} 289 290// Remap HSAIL register to physical VGPR. 
// HSAIL register = virtual register assigned to an operand by HLC compiler
//
// Map a virtual (HSAIL) VGPR index to a physical slot in this SIMD's VRF.
// mode == 1 with size > 4 selects the double-precision name space, which
// is laid out after the single-precision registers.
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return true if this wavefront is ready
// to execute an instruction of the specified type.
//
// Returns 1 (ready) or 0 (not ready). The checks are ordered: wave
// liveness, barrier release, then per-instruction-class structural
// hazards (issue slot, VRF->memory bus, request FIFO space) and finally
// VRF operand scheduling/dependence checks.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
                    computeUnit->getRefCounter(dispatchid, wg_id))) {
            // Are all threads at barrier?
            return 0;
        }
        // barrier released: remember the count we were released at
        old_barrier_cnt = barrier_cnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    // Pre-compute resource availability for the memory pipelines this
    // instruction class may need. "Bus" = a free VRF->memory pipe bus,
    // "Issue" = a free wave issue slot on some memory unit.
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        // Barriers must wait for all of this wave's memory traffic to drain.
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here Global memory instruction
        // Reads are ordered behind writes and vice versa (simple
        // per-wave memory ordering enforcement).
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for Shared memory instruction
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for Private memory instruction ------------------------ //
        // Private memory shares the global memory pipeline, hence the
        // *_gm counters and global resources below.
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        // Flat accesses may resolve to either global or LDS, so both
        // pipelines' resources must be available before issue.
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // instruction class does not match the requested type
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}

// Reserve (preset) the pipeline resources the front instruction will
// consume when it issues, and bump the *_in_pipe request counters for
// memory operations. Mirrors the resource "set" logic in exec(); the
// magic ticks(4)/ticks(8) values are the modeled VRF read latencies for
// loads vs. stores/atomics (stores move twice the operand data) --
// presumably; confirm against the pipeline model.
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType()==Enums::OT_KERN_READ ||
        ii->opType()==Enums::OT_ARG ||
        ii->opType()==Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        // flat access: resolved target (shared vs. global) selects the pipe
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        // atomics count as both a read and a write in flight
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_lm_reqs_in_pipe++;
        rd_lm_reqs_in_pipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        // private memory uses the global pipeline and counters
        mem_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        mem_reqs_in_pipe++;
        wr_gm_reqs_in_pipe++;
        rd_gm_reqs_in_pipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

// Execute the front instruction: run its functional behavior, update
// stats, advance the PC / reconvergence stack, and charge the pipeline
// resources it consumes (the "set" counterpart of updateResources()).
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point of the current stack frame
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

// True if the given lane is still waiting at a barrier (its per-lane
// barrier count has not yet caught up with the maximum).
bool
Wavefront::waitingAtBarrier(int lane)
{
    return bar_cnt[lane] < max_bar_cnt;
}

// Push a new frame (pc, reconvergence pc, active-lane mask) onto the
// reconvergence stack; the mask must have at least one active lane.
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
}

// Pop the top reconvergence frame, tracing the exec mask / pc before
// and after for debugging.
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}

// Drop all buffered instructions and mark any in-flight fetch to be
// discarded when it returns.
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |=pendingFetch;
}

// Current PC of the top reconvergence frame.
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.top()->pc;
}

// Reconvergence PC of the top frame.
uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.top()->rpc;
}

// Active-lane mask of the top frame.
VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.top()->execMask;
}

// Whether a single lane is active in the top frame.
bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.top()->execMask[lane];
}


// Set the PC of the top reconvergence frame.
void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.top()->pc = new_pc;
}