// wavefront.cc (revisions 11644:d426728892fe and 11657:5fad5a37d6fc)
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#include "gpu-compute/wavefront.hh"

#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/code_enums.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront*
WavefrontParams::create()
{
    return new Wavefront(this);
}

Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr)
{
    lastTrace = 0;
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}

void
Wavefront::regStats()
{
    SimObject::regStats();

    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register "
              "operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    startVgprIndex = 0;
}

void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    condRegState->init(num_cregs);
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
}

Wavefront::~Wavefront()
{
    if (callArgMem)
        delete callArgMem;
    delete condRegState;
}

void
Wavefront::start(uint64_t _wf_dyn_id, uint64_t _base_ptr)
{
    wfDynId = _wf_dyn_id;
    basePtr = _base_ptr;
    status = S_RUNNING;
}

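// Note: FLAT accesses are classified as global-memory instructions here,
// since the aperture they actually hit (shared vs. global) is only resolved
// at execution time via executedAs().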
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
        IS_OT_ATOMIC_PM(ii->opType())) {
        return true;
    }

    if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
        IS_OT_ATOMIC_GM(ii->opType())) {
        return true;
    }

    if (IS_OT_FLAT(ii->opType())) {
        return true;
    }

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
        IS_OT_ATOMIC_LM(ii->opType())) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
        ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
        ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
        ii->opType() == Enums::OT_KERN_READ)) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
        IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
        IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
        IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
        return true;
    }

    return false;
}

// Return true if the wavefront's instruction
// buffer contains a branch instruction.
bool
Wavefront::instructionBufferHasBranch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;

        if (ii->opType() == Enums::OT_RET ||
            ii->opType() == Enums::OT_BRANCH) {
            return true;
        }
    }

    return false;
}

// Remap an HSAIL register to a physical VGPR.
// An HSAIL register is a virtual register assigned to an operand by the
// HLC compiler.
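//
// Illustrative example (assumed values, for exposition only): with
// startVgprIndex = 32, maxSpVgprs = 16, and reservedVectorRegs = 48,
// SP register $s3 maps to physical VGPR 32 + 3 = 35, while DP register
// $d3 (mode == 1, size == 8) maps to 32 + 16 + 2 * 3 = 54, before the
// final wrap-around modulo the VRF size.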
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    assert((startVgprIndex <= physicalVgprIndex) &&
           (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}

// Return 1 if this wavefront is ready to issue an instruction
// of the specified type, 0 otherwise.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId, barrierCnt,
                    computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j = 0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j = 0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }
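    // Note: the *BusRdy flags track whether a VRF->memory pipe bus can
    // accept the operand traffic, while the *IssueRdy flags track whether
    // a memory unit's wave slot is free; both must hold for a memory
    // instruction to issue (checked per instruction type below).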

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking readiness of inst: %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is ALU slot free?
            return 0;
        }
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) ||
               IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here for global memory instruction
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) ||
               IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for shared (local) memory instruction
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) ||
               IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for private memory instruction
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR dependencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
    return 1;
}

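// updateResources() is called at schedule time to pre-reserve (preset) the
// pipeline resources an instruction will occupy and to bump the in-pipe
// request counters; exec() later commits the same reservations with set().
// The 4- and 8-tick bus occupancies used below for reads vs. writes/atomics
// are hard-coded model parameters, not derived values.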
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

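// exec() retires the oldest instruction in the buffer: it executes the
// instruction, updates execution statistics, advances the PC (popping a
// reconvergence frame when the new PC reaches the current reconvergence
// point, rpc()), and then charges the pipeline resources that
// updateResources() reserved at schedule time.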
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ------------------------------ //
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------- //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

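// A lane is considered to be waiting at the barrier while its per-lane
// barrier count still trails maxBarCnt, i.e. while it has reached the
// barrier fewer times than the most-advanced lane in the wavefront.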
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

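// Reconvergence stack discipline: a frame (new PC, reconvergence PC, and
// active-lane mask) is pushed when control flow diverges, and exec() pops
// the top frame once the advancing PC reaches its RPC, restoring the parent
// frame's execution mask.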
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc,
                                                                mask});
}

void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());
}

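// Flush the instruction buffer; if a fetch is still in flight, flag it so
// the stale response is dropped when it returns rather than being decoded.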
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}

void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

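// getContext() serializes the wavefront's architectural state into the
// caller-provided buffer: scalar bookkeeping first, then the reconvergence
// stack (padded to wfSize entries), then SP and DP VGPRs, condition
// registers, and finally any LDS chunk. setContext() reads the same layout
// back. Note that although the destination buffer arrives as const void*,
// it is written through after a cast.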
void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong();
    iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
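    // Drain the live reconvergence stack top-first, then pad the remaining
    // slots (one per lane, up to wfSize) with sentinel entries whose PC is
    // uint32_t max, so the saved image always has a fixed size.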
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx, lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *)iter = val; iter += sizeof(val);
        }
    }
}

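// setContext() consumes the byte stream produced by getContext() in the
// same order. One caveat worth flagging: getContext() saves stack entries
// top-first and this function re-pushes them in the order read, so a stack
// with more than one live frame would be restored inverted; this appears
// to rely on context switches only occurring with a single live frame.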
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    // advance by the 8 bytes getContext() wrote for the mask, independent
    // of sizeof(initMask)
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(uint64_t);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /* restoring LDS content */
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *)iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
    }
}
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#include "gpu-compute/wavefront.hh"
37
38#include "debug/GPUExec.hh"
39#include "debug/WavefrontStack.hh"
40#include "gpu-compute/code_enums.hh"
41#include "gpu-compute/compute_unit.hh"
42#include "gpu-compute/gpu_dyn_inst.hh"
43#include "gpu-compute/shader.hh"
44#include "gpu-compute/vector_register_file.hh"
45
46Wavefront*
47WavefrontParams::create()
48{
49 return new Wavefront(this);
50}
51
52Wavefront::Wavefront(const Params *p)
53 : SimObject(p), callArgMem(nullptr)
54{
55 lastTrace = 0;
56 simdId = p->simdId;
57 wfSlotId = p->wf_slot_id;
58 status = S_STOPPED;
59 reservedVectorRegs = 0;
60 startVgprIndex = 0;
61 outstandingReqs = 0;
62 memReqsInPipe = 0;
63 outstandingReqsWrGm = 0;
64 outstandingReqsWrLm = 0;
65 outstandingReqsRdGm = 0;
66 outstandingReqsRdLm = 0;
67 rdLmReqsInPipe = 0;
68 rdGmReqsInPipe = 0;
69 wrLmReqsInPipe = 0;
70 wrGmReqsInPipe = 0;
71
72 barrierCnt = 0;
73 oldBarrierCnt = 0;
74 stalledAtBarrier = false;
75
76 memTraceBusy = 0;
77 oldVgprTcnt = 0xffffffffffffffffll;
78 oldDgprTcnt = 0xffffffffffffffffll;
79 oldVgpr.resize(p->wfSize);
80
81 pendingFetch = false;
82 dropFetch = false;
83 condRegState = new ConditionRegisterState();
84 maxSpVgprs = 0;
85 maxDpVgprs = 0;
86 lastAddr.resize(p->wfSize);
87 workItemFlatId.resize(p->wfSize);
88 oldDgpr.resize(p->wfSize);
89 barCnt.resize(p->wfSize);
90 for (int i = 0; i < 3; ++i) {
91 workItemId[i].resize(p->wfSize);
92 }
93}
94
95void
96Wavefront::regStats()
97{
98 SimObject::regStats();
99
100 srcRegOpDist
101 .init(0, 4, 2)
102 .name(name() + ".src_reg_operand_dist")
103 .desc("number of executed instructions with N source register operands")
104 ;
105
106 dstRegOpDist
107 .init(0, 3, 2)
108 .name(name() + ".dst_reg_operand_dist")
109 .desc("number of executed instructions with N destination register "
110 "operands")
111 ;
112
113 // FIXME: the name of the WF needs to be unique
114 numTimesBlockedDueWAXDependencies
115 .name(name() + ".timesBlockedDueWAXDependencies")
116 .desc("number of times the wf's instructions are blocked due to WAW "
117 "or WAR dependencies")
118 ;
119
120 // FIXME: the name of the WF needs to be unique
121 numTimesBlockedDueRAWDependencies
122 .name(name() + ".timesBlockedDueRAWDependencies")
123 .desc("number of times the wf's instructions are blocked due to RAW "
124 "dependencies")
125 ;
126
127 // FIXME: the name of the WF needs to be unique
128 numTimesBlockedDueVrfPortAvail
129 .name(name() + ".timesBlockedDueVrfPortAvail")
130 .desc("number of times instructions are blocked due to VRF port "
131 "availability")
132 ;
133}
134
135void
136Wavefront::init()
137{
138 reservedVectorRegs = 0;
139 startVgprIndex = 0;
140}
141
142void
143Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
144{
145 condRegState->init(num_cregs);
146 maxSpVgprs = num_sregs;
147 maxDpVgprs = num_dregs;
148}
149
150Wavefront::~Wavefront()
151{
152 if (callArgMem)
153 delete callArgMem;
154 delete condRegState;
155}
156
157void
158Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
159{
160 wfDynId = _wf_dyn_id;
161 basePtr = _base_ptr;
162 status = S_RUNNING;
163}
164
165bool
166Wavefront::isGmInstruction(GPUDynInstPtr ii)
167{
168 if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
169 IS_OT_ATOMIC_PM(ii->opType())) {
170 return true;
171 }
172
173 if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
174 IS_OT_ATOMIC_GM(ii->opType())) {
175 return true;
176 }
177
178 if (IS_OT_FLAT(ii->opType())) {
179 return true;
180 }
181
182 return false;
183}
184
185bool
186Wavefront::isLmInstruction(GPUDynInstPtr ii)
187{
188 if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
189 IS_OT_ATOMIC_LM(ii->opType())) {
190 return true;
191 }
192
193 return false;
194}
195
196bool
197Wavefront::isOldestInstALU()
198{
199 assert(!instructionBuffer.empty());
200 GPUDynInstPtr ii = instructionBuffer.front();
201
202 if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
203 ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
204 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
205 ii->opType() == Enums::OT_KERN_READ)) {
206 return true;
207 }
208
209 return false;
210}
211
212bool
213Wavefront::isOldestInstBarrier()
214{
215 assert(!instructionBuffer.empty());
216 GPUDynInstPtr ii = instructionBuffer.front();
217
218 if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
219 return true;
220 }
221
222 return false;
223}
224
225bool
226Wavefront::isOldestInstGMem()
227{
228 assert(!instructionBuffer.empty());
229 GPUDynInstPtr ii = instructionBuffer.front();
230
231 if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
232 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
233
234 return true;
235 }
236
237 return false;
238}
239
240bool
241Wavefront::isOldestInstLMem()
242{
243 assert(!instructionBuffer.empty());
244 GPUDynInstPtr ii = instructionBuffer.front();
245
246 if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
247 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
248
249 return true;
250 }
251
252 return false;
253}
254
255bool
256Wavefront::isOldestInstPrivMem()
257{
258 assert(!instructionBuffer.empty());
259 GPUDynInstPtr ii = instructionBuffer.front();
260
261 if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
262 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
263
264 return true;
265 }
266
267 return false;
268}
269
270bool
271Wavefront::isOldestInstFlatMem()
272{
273 assert(!instructionBuffer.empty());
274 GPUDynInstPtr ii = instructionBuffer.front();
275
276 if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
277
278 return true;
279 }
280
281 return false;
282}
283
284// Return true if the Wavefront's instruction
285// buffer has branch instruction.
286bool
287Wavefront::instructionBufferHasBranch()
288{
289 for (auto it : instructionBuffer) {
290 GPUDynInstPtr ii = it;
291
292 if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
293 return true;
294 }
295 }
296
297 return false;
298}
299
300// Remap HSAIL register to physical VGPR.
301// HSAIL register = virtual register assigned to an operand by HLC compiler
302uint32_t
303Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
304{
305 assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
306 // add the offset from where the VGPRs of the wavefront have been assigned
307 uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
308 // HSAIL double precision (DP) register: calculate the physical VGPR index
309 // assuming that DP registers are placed after SP ones in the VRF. The DP
310 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
311 // the DP VGPR index before mapping it to the physical VRF address space
312 if (mode == 1 && size > 4) {
313 physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
314 }
315
316 assert((startVgprIndex <= physicalVgprIndex) &&
317 (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
318
319 // calculate absolute physical VGPR index
320 return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
321}
322
323// Return true if this wavefront is ready
324// to execute an instruction of the specified type.
325int
326Wavefront::ready(itype_e type)
327{
328 // Check to make sure wave is running
329 if (status == S_STOPPED || status == S_RETURNING ||
330 instructionBuffer.empty()) {
331 return 0;
332 }
333
334 // Is the wave waiting at a barrier
335 if (stalledAtBarrier) {
336 if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
337 computeUnit->getRefCounter(dispatchId, wgId))) {
338 // Are all threads at barrier?
339 return 0;
340 }
341 oldBarrierCnt = barrierCnt;
342 stalledAtBarrier = false;
343 }
344
345 // Read instruction
346 GPUDynInstPtr ii = instructionBuffer.front();
347
348 bool ready_inst M5_VAR_USED = false;
349 bool glbMemBusRdy = false;
350 bool glbMemIssueRdy = false;
351 if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
352 for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
353 if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
354 glbMemBusRdy = true;
355 if (computeUnit->wfWait[j].prerdy())
356 glbMemIssueRdy = true;
357 }
358 }
359 bool locMemBusRdy = false;
360 bool locMemIssueRdy = false;
361 if (type == I_SHARED || type == I_FLAT) {
362 for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
363 if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
364 locMemBusRdy = true;
365 if (computeUnit->wfWait[j].prerdy())
366 locMemIssueRdy = true;
367 }
368 }
369
370 // The following code is very error prone and the entire process for
371 // checking readiness will be fixed eventually. In the meantime, let's
372 // make sure that we do not silently let an instruction type slip
373 // through this logic and always return not ready.
374 if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
375 ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
376 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
377 ii->opType() == Enums::OT_KERN_READ ||
378 ii->opType() == Enums::OT_ARG ||
379 IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
380 IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
381 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
382 IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
383 IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
384 panic("next instruction: %s is of unknown type\n", ii->disassemble());
385 }
386
387 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
388 computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
389
390 if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
391 // Here for ALU instruction (barrier)
392 if (!computeUnit->wfWait[simdId].prerdy()) {
393 // Is wave slot free?
394 return 0;
395 }
396
397 // Are there in pipe or outstanding memory requests?
398 if ((outstandingReqs + memReqsInPipe) > 0) {
399 return 0;
400 }
401
402 ready_inst = true;
403 } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
404 // Here for ALU instruction (nop)
405 if (!computeUnit->wfWait[simdId].prerdy()) {
406 // Is wave slot free?
407 return 0;
408 }
409
410 ready_inst = true;
411 } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
412 // Here for ALU instruction (return)
413 if (!computeUnit->wfWait[simdId].prerdy()) {
414 // Is wave slot free?
415 return 0;
416 }
417
418 // Are there in pipe or outstanding memory requests?
419 if ((outstandingReqs + memReqsInPipe) > 0) {
420 return 0;
421 }
422
423 ready_inst = true;
424 } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
425 ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
426 ii->opType() == Enums::OT_KERN_READ ||
427 ii->opType() == Enums::OT_ARG)) {
428 // Here for ALU instruction (all others)
429 if (!computeUnit->wfWait[simdId].prerdy()) {
430 // Is alu slot free?
431 return 0;
432 }
433 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
434 VrfAccessType::RD_WR)) {
435 return 0;
436 }
437
438 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
439 return 0;
440 }
441 ready_inst = true;
442 } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
443 IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
444 // Here Global memory instruction
445 if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
446 // Are there in pipe or outstanding global memory write requests?
447 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
448 return 0;
449 }
450 }
451
452 if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
453 IS_OT_HIST_GM(ii->opType())) {
454 // Are there in pipe or outstanding global memory read requests?
455 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
456 return 0;
457 }
458
459 if (!glbMemIssueRdy) {
460 // Is WV issue slot free?
461 return 0;
462 }
463
464 if (!glbMemBusRdy) {
465 // Is there an available VRF->Global memory read bus?
466 return 0;
467 }
468
469 if (!computeUnit->globalMemoryPipe.
470 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
471 // Can we insert a new request to the Global Mem Request FIFO?
472 return 0;
473 }
474 // can we schedule source & destination operands on the VRF?
475 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
476 VrfAccessType::RD_WR)) {
477 return 0;
478 }
479 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
480 return 0;
481 }
482 ready_inst = true;
483 } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
484 IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
485 // Here for Shared memory instruction
486 if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
487 if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
488 return 0;
489 }
490 }
491
492 if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
493 IS_OT_HIST_LM(ii->opType())) {
494 if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
495 return 0;
496 }
497 }
498
499 if (!locMemBusRdy) {
500 // Is there an available VRF->LDS read bus?
501 return 0;
502 }
503 if (!locMemIssueRdy) {
504 // Is wave slot free?
505 return 0;
506 }
507
508 if (!computeUnit->localMemoryPipe.
509 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
510 // Can we insert a new request to the LDS Request FIFO?
511 return 0;
512 }
513 // can we schedule source & destination operands on the VRF?
514 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
515 VrfAccessType::RD_WR)) {
516 return 0;
517 }
518 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
519 return 0;
520 }
521 ready_inst = true;
522 } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
523 IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
524 // Here for Private memory instruction ------------------------ //
525 if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
526 if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
527 return 0;
528 }
529 }
530
531 if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
532 IS_OT_HIST_PM(ii->opType())) {
533 if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) {
534 return 0;
535 }
536 }
537
538 if (!glbMemBusRdy) {
539 // Is there an available VRF->Global memory read bus?
540 return 0;
541 }
542
543 if (!glbMemIssueRdy) {
544 // Is wave slot free?
545 return 0;
546 }
547
548 if (!computeUnit->globalMemoryPipe.
549 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
550 // Can we insert a new request to the Global Mem Request FIFO?
551 return 0;
552 }
553 // can we schedule source & destination operands on the VRF?
554 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
555 VrfAccessType::RD_WR)) {
556 return 0;
557 }
558 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
559 return 0;
560 }
561 ready_inst = true;
562 } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
563 if (!glbMemBusRdy) {
564 // Is there an available VRF->Global memory read bus?
565 return 0;
566 }
567
568 if (!locMemBusRdy) {
569 // Is there an available VRF->LDS read bus?
570 return 0;
571 }
572
573 if (!glbMemIssueRdy) {
574 // Is wave slot free?
575 return 0;
576 }
577
578 if (!locMemIssueRdy) {
579 return 0;
580 }
581 if (!computeUnit->globalMemoryPipe.
582 isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
583 // Can we insert a new request to the Global Mem Request FIFO?
584 return 0;
585 }
586
587 if (!computeUnit->localMemoryPipe.
588 isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
589 // Can we insert a new request to the LDS Request FIFO?
590 return 0;
591 }
592 // can we schedule source & destination operands on the VRF?
593 if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
594 VrfAccessType::RD_WR)) {
595 return 0;
596 }
597 // are all the operands ready? (RAW, WAW and WAR depedencies met?)
598 if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
599 return 0;
600 }
601 ready_inst = true;
602 } else {
603 return 0;
604 }
605
606 assert(ready_inst);
607
608 DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
609 simdId, wfSlotId, ii->disassemble());
610 return 1;
611}
612
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_PM(ii->opType())) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_PM(ii->opType())) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

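// Execute the instruction at the head of the instruction buffer for one
// issue slot: run its semantics, advance the PC (popping a reconvergence
// stack frame when the new PC reaches the reconvergence PC), collect
// per-instruction statistics, and claim the ALU/memory pipeline resources
// that updateResources() pre-reserved above.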
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute();
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        // PC not modified by instruction: proceed to the next instruction,
        // or pop this frame if we reached the reconvergence PC
        uint32_t new_pc = old_pc + 1;
        pc(new_pc);
        if (new_pc == rpc()) {
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
        ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        ii->opType() == Enums::OT_KERN_READ ||
        ii->opType() == Enums::OT_ARG ||
        ii->opType() == Enums::OT_RET) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_BARRIER) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->opType() == Enums::OT_FLAT_READ) {
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (IS_OT_READ_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_GM(ii->opType())) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_READ_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_WRITE_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (IS_OT_ATOMIC_LM(ii->opType())) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}

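// A lane is still catching up to the barrier if it has executed fewer
// barrier operations (barCnt) than the wavefront-wide maximum (maxBarCnt).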
bool
Wavefront::waitingAtBarrier(int lane)
{
    return barCnt[lane] < maxBarCnt;
}

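// Push a new entry onto the control-flow (reconvergence) stack. Each entry
// holds the PC to execute next, the reconvergence PC (RPC) at which the
// divergent paths re-join, and the mask of lanes active on that path. For
// example, a divergent branch at PC p whose paths re-join at PC r can be
// modeled by pushing {pc: taken_target, rpc: r, mask: taken_lanes} on top
// of {pc: p + 1, rpc: r, mask: fall_through_lanes}; exec() pops each frame
// once its PC reaches its RPC.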
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());
    reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
}

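// Pop the top-of-stack entry, returning control to the enclosing path;
// the DPRINTFs trace the PC and exec mask before and after the pop.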
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());
}

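// Throw away any instructions already buffered for this wavefront, and
// mark any fetch still in flight to be dropped when it returns.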
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
}

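// The current PC, reconvergence PC, and execution mask all live in the
// top entry of the reconvergence stack; the accessors below read (or, for
// pc(uint32_t), update) that entry.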
uint32_t
Wavefront::pc() const
{
    return reconvergenceStack.back()->pc;
}

uint32_t
Wavefront::rpc() const
{
    return reconvergenceStack.back()->rpc;
}

VectorMask
Wavefront::execMask() const
{
    return reconvergenceStack.back()->execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return reconvergenceStack.back()->execMask[lane];
}

void
Wavefront::pc(uint32_t new_pc)
{
    reconvergenceStack.back()->pc = new_pc;
}

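// Size, in bytes, of the fixed-size portion of the context that
// getContext()/setContext() serialize: the per-lane barrier counts, the
// scalar fields, and a full-depth reconvergence stack (one entry per
// lane). The variable-size portions (VGPRs, condition registers, and the
// LDS chunk contents) are not included here.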
uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}

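// Serialize the wavefront's context into the buffer at 'out' (which is
// written through despite the const-qualified pointer): per-lane barrier
// counts, scalar state, the reconvergence stack, single- and
// double-precision VGPRs, condition registers, and finally the LDS
// contents. setContext() below reads the fields back in exactly this
// order.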
void
Wavefront::getContext(const void *out)
{
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    *(uint64_t *)iter = initMask.to_ullong();
    iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    // drain the reconvergence stack into the buffer, top entry first, and
    // pad the remaining slots (up to one per lane) with all-ones sentinel
    // entries, which setContext() skips when restoring
    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    // save the single-precision VGPRs, one lane at a time
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx, lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // save the double-precision VGPRs
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // save the condition registers
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // save the LDS contents
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *)iter = val; iter += sizeof(val);
        }
    }
}

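// Restore the wavefront context serialized by getContext(). Reconvergence
// stack entries equal to the all-ones sentinel are padding and are
// skipped; the remaining entries are pushed in the order they were saved
// (top-of-stack first), which presumably assumes the stack held a single
// entry, i.e. the wavefront was fully reconverged, when it was saved.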
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    // restore the reconvergence stack, skipping the sentinel padding
    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }

    int wf_size = computeUnit->wfSize();

    // restore the single-precision VGPRs
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    // restore the double-precision VGPRs
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    // restore the condition registers
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }

    // restore the LDS contents
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *)iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
    }
}

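// Compute the actual size of this work-group in each dimension, clipping
// the nominal work-group size against the edge of the grid. For example,
// with workGroupSz[0] == 256, gridSz[0] == 1000, and wgId[0] == 3, the
// last work-group in dimension 0 gets min(256, 1000 - 3 * 256) == 232
// work-items. actualWgSzTotal is the product over all three dimensions.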
void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int d = 0; d < 3; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d],
                                 gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}