compute_unit.hh revision 11698:d1ad31187fa5
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: John Kalamatianos, Anthony Gutierrez 34 */ 35 36#ifndef __COMPUTE_UNIT_HH__ 37#define __COMPUTE_UNIT_HH__ 38 39#include <deque> 40#include <map> 41#include <unordered_map> 42#include <vector> 43 44#include "base/callback.hh" 45#include "base/statistics.hh" 46#include "base/types.hh" 47#include "enums/PrefetchType.hh" 48#include "gpu-compute/exec_stage.hh" 49#include "gpu-compute/fetch_stage.hh" 50#include "gpu-compute/global_memory_pipeline.hh" 51#include "gpu-compute/local_memory_pipeline.hh" 52#include "gpu-compute/qstruct.hh" 53#include "gpu-compute/schedule_stage.hh" 54#include "gpu-compute/scoreboard_check_stage.hh" 55#include "mem/mem_object.hh" 56#include "mem/port.hh" 57 58static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; 59static const int MAX_WIDTH_FOR_MEM_INST = 32; 60 61class NDRange; 62class Shader; 63class VectorRegisterFile; 64 65struct ComputeUnitParams; 66 67enum EXEC_POLICY 68{ 69 OLDEST = 0, 70 RR 71}; 72 73// List of execution units 74enum EXEC_UNIT 75{ 76 SIMD0 = 0, 77 SIMD1, 78 SIMD2, 79 SIMD3, 80 GLBMEM_PIPE, 81 LDSMEM_PIPE, 82 NUM_UNITS 83}; 84 85enum TLB_CACHE 86{ 87 TLB_MISS_CACHE_MISS = 0, 88 TLB_MISS_CACHE_HIT, 89 TLB_HIT_CACHE_MISS, 90 TLB_HIT_CACHE_HIT 91}; 92 93class ComputeUnit : public MemObject 94{ 95 public: 96 FetchStage fetchStage; 97 ScoreboardCheckStage scoreboardCheckStage; 98 ScheduleStage scheduleStage; 99 ExecStage execStage; 100 GlobalMemPipeline globalMemoryPipe; 101 LocalMemPipeline localMemoryPipe; 102 103 // Buffers used to communicate between various pipeline stages 104 105 // List of waves which are ready to be scheduled. 106 // Each execution resource has a ready list. readyList is 107 // used to communicate between scoreboardCheck stage and 108 // schedule stage 109 // TODO: make enum to index readyList 110 std::vector<std::vector<Wavefront*>> readyList; 111 112 // Stores the status of waves. A READY implies the 113 // wave is ready to be scheduled this cycle and 114 // is already present in the readyList. waveStatusList is 115 // used to communicate between scoreboardCheck stage and 116 // schedule stage 117 // TODO: convert std::pair to a class to increase readability 118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; 119 120 // List of waves which will be dispatched to 121 // each execution resource. A FILLED implies 122 // dispatch list is non-empty and 123 // execution unit has something to execute 124 // this cycle. Currently, the dispatch list of 125 // an execution resource can hold only one wave because 126 // an execution resource can execute only one wave in a cycle. 127 // dispatchList is used to communicate between schedule 128 // and exec stage 129 // TODO: convert std::pair to a class to increase readability 130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; 131 132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's 133 int rrNextALUWp; 134 typedef ComputeUnitParams Params; 135 std::vector<std::vector<Wavefront*>> wfList; 136 int cu_id; 137 138 // array of vector register files, one per SIMD 139 std::vector<VectorRegisterFile*> vrf; 140 // Number of vector ALU units (SIMDs) in CU 141 int numSIMDs; 142 // number of pipe stages for bypassing data to next dependent single 143 // precision vector instruction inside the vector ALU pipeline 144 int spBypassPipeLength; 145 // number of pipe stages for bypassing data to next dependent double 146 // precision vector instruction inside the vector ALU pipeline 147 int dpBypassPipeLength; 148 // number of cycles per issue period 149 int issuePeriod; 150 151 // Number of global and local memory execution resources in CU 152 int numGlbMemUnits; 153 int numLocMemUnits; 154 // tracks the last cycle a vector instruction was executed on a SIMD 155 std::vector<uint64_t> lastExecCycle; 156 157 // true if we allow a separate TLB per lane 158 bool perLaneTLB; 159 // if 0, TLB prefetching is off. 160 int prefetchDepth; 161 // if fixed-stride prefetching, this is the stride. 162 int prefetchStride; 163 164 std::vector<Addr> lastVaddrCU; 165 std::vector<std::vector<Addr>> lastVaddrSimd; 166 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; 167 Enums::PrefetchType prefetchType; 168 EXEC_POLICY exec_policy; 169 170 bool xact_cas_mode; 171 bool debugSegFault; 172 bool functionalTLB; 173 bool localMemBarrier; 174 175 /* 176 * for Counting page accesses 177 * 178 * cuExitCallback inherits from Callback. When you register a callback 179 * function as an exit callback, it will get added to an exit callback 180 * queue, such that on simulation exit, all callbacks in the callback 181 * queue will have their process() function called. 182 */ 183 bool countPages; 184 185 Shader *shader; 186 uint32_t barrier_id; 187 // vector of Vector ALU (MACC) pipelines 188 std::vector<WaitClass> aluPipe; 189 // minimum issue period per SIMD unit (in cycles) 190 std::vector<WaitClass> wfWait; 191 192 // Resource control for Vector Register File->Global Memory pipe buses 193 std::vector<WaitClass> vrfToGlobalMemPipeBus; 194 // Resource control for Vector Register File->Local Memory pipe buses 195 std::vector<WaitClass> vrfToLocalMemPipeBus; 196 int nextGlbMemBus; 197 int nextLocMemBus; 198 // Resource control for global memory to VRF data/address bus 199 WaitClass glbMemToVrfBus; 200 // Resource control for local memory to VRF data/address bus 201 WaitClass locMemToVrfBus; 202 203 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes 204 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes 205 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store 206 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load 207 208 Tick req_tick_latency; 209 Tick resp_tick_latency; 210 211 // number of vector registers being reserved for each SIMD unit 212 std::vector<int> vectorRegsReserved; 213 // number of vector registers per SIMD unit 214 uint32_t numVecRegsPerSimd; 215 // Support for scheduling VGPR status update events 216 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; 217 std::vector<uint64_t> timestampVec; 218 std::vector<uint8_t> statusVec; 219 220 void 221 registerEvent(uint32_t simdId, 222 uint32_t regIdx, 223 uint32_t operandSize, 224 uint64_t when, 225 uint8_t newStatus) { 226 regIdxVec.push_back(std::make_pair(simdId, regIdx)); 227 timestampVec.push_back(when); 228 statusVec.push_back(newStatus); 229 if (operandSize > 4) { 230 regIdxVec.push_back(std::make_pair(simdId, 231 ((regIdx + 1) % 232 numVecRegsPerSimd))); 233 timestampVec.push_back(when); 234 statusVec.push_back(newStatus); 235 } 236 } 237 238 void updateEvents(); 239 240 // this hash map will keep track of page divergence 241 // per memory instruction per wavefront. The hash map 242 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 243 std::map<Addr, int> pagesTouched; 244 245 ComputeUnit(const Params *p); 246 ~ComputeUnit(); 247 int spBypassLength() { return spBypassPipeLength; }; 248 int dpBypassLength() { return dpBypassPipeLength; }; 249 int storeBusLength() { return numCyclesPerStoreTransfer; }; 250 int loadBusLength() { return numCyclesPerLoadTransfer; }; 251 int wfSize() const { return wavefrontSize; }; 252 253 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 254 void exec(); 255 void initiateFetch(Wavefront *wavefront); 256 void fetch(PacketPtr pkt, Wavefront *wavefront); 257 void fillKernelState(Wavefront *w, NDRange *ndr); 258 259 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, 260 NDRange *ndr); 261 262 void StartWorkgroup(NDRange *ndr); 263 int ReadyWorkgroup(NDRange *ndr); 264 265 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } 266 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } 267 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } 268 int GlbMemUnitId() { return GLBMEM_PIPE; } 269 int ShrMemUnitId() { return LDSMEM_PIPE; } 270 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } 271 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } 272 /* This function cycles through all the wavefronts in all the phases to see 273 * if all of the wavefronts which should be associated with one barrier 274 * (denoted with _barrier_id), are all at the same barrier in the program 275 * (denoted by bcnt). When the number at the barrier matches bslots, then 276 * return true. 277 */ 278 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); 279 bool cedeSIMD(int simdId, int wfSlotId); 280 281 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); 282 virtual void init(); 283 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 284 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 285 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, 286 bool kernelLaunch=true, 287 RequestPtr req=nullptr); 288 void handleMemPacket(PacketPtr pkt, int memport_index); 289 bool processTimingPacket(PacketPtr pkt); 290 void processFetchReturn(PacketPtr pkt); 291 void updatePageDivergenceDist(Addr addr); 292 293 MasterID masterId() { return _masterId; } 294 295 bool isDone() const; 296 bool isSimdDone(uint32_t) const; 297 298 protected: 299 MasterID _masterId; 300 301 LdsState &lds; 302 303 public: 304 Stats::Scalar vALUInsts; 305 Stats::Formula vALUInstsPerWF; 306 Stats::Scalar sALUInsts; 307 Stats::Formula sALUInstsPerWF; 308 Stats::Scalar instCyclesVALU; 309 Stats::Scalar instCyclesSALU; 310 Stats::Scalar threadCyclesVALU; 311 Stats::Formula vALUUtilization; 312 Stats::Scalar ldsNoFlatInsts; 313 Stats::Formula ldsNoFlatInstsPerWF; 314 Stats::Scalar flatVMemInsts; 315 Stats::Formula flatVMemInstsPerWF; 316 Stats::Scalar flatLDSInsts; 317 Stats::Formula flatLDSInstsPerWF; 318 Stats::Scalar vectorMemWrites; 319 Stats::Formula vectorMemWritesPerWF; 320 Stats::Scalar vectorMemReads; 321 Stats::Formula vectorMemReadsPerWF; 322 Stats::Scalar scalarMemWrites; 323 Stats::Formula scalarMemWritesPerWF; 324 Stats::Scalar scalarMemReads; 325 Stats::Formula scalarMemReadsPerWF; 326 327 void updateInstStats(GPUDynInstPtr gpuDynInst); 328 329 // the following stats compute the avg. TLB accesslatency per 330 // uncoalesced request (only for data) 331 Stats::Scalar tlbRequests; 332 Stats::Scalar tlbCycles; 333 Stats::Formula tlbLatency; 334 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. 335 Stats::Vector hitsPerTLBLevel; 336 337 Stats::Scalar ldsBankAccesses; 338 Stats::Distribution ldsBankConflictDist; 339 340 // over all memory instructions executed over all wavefronts 341 // how many touched 0-4 pages, 4-8, ..., 60-64 pages 342 Stats::Distribution pageDivergenceDist; 343 Stats::Scalar dynamicGMemInstrCnt; 344 Stats::Scalar dynamicLMemInstrCnt; 345 346 Stats::Scalar wgBlockedDueLdsAllocation; 347 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active 348 // when the instruction is committed, this number is still incremented by 1 349 Stats::Scalar numInstrExecuted; 350 // Number of cycles among successive instruction executions across all 351 // wavefronts of the same CU 352 Stats::Distribution execRateDist; 353 // number of individual vector operations executed 354 Stats::Scalar numVecOpsExecuted; 355 // Total cycles that something is running on the GPU 356 Stats::Scalar totalCycles; 357 Stats::Formula vpc; // vector ops per cycle 358 Stats::Formula ipc; // vector instructions per cycle 359 Stats::Distribution controlFlowDivergenceDist; 360 Stats::Distribution activeLanesPerGMemInstrDist; 361 Stats::Distribution activeLanesPerLMemInstrDist; 362 // number of vector ALU instructions received 363 Stats::Formula numALUInstsExecuted; 364 // number of times a WG can not start due to lack of free VGPRs in SIMDs 365 Stats::Scalar numTimesWgBlockedDueVgprAlloc; 366 Stats::Scalar numCASOps; 367 Stats::Scalar numFailedCASOps; 368 Stats::Scalar completedWfs; 369 // flag per vector SIMD unit that is set when there is at least one 370 // WV that has a vector ALU instruction as the oldest in its 371 // Instruction Buffer: Defined in the Scoreboard stage, consumed 372 // by the Execute stage. 373 std::vector<bool> vectorAluInstAvail; 374 // number of available (oldest) LDS instructions that could have 375 // been issued to the LDS at a specific issue slot 376 int shrMemInstAvail; 377 // number of available Global memory instructions that could have 378 // been issued to TCP at a specific issue slot 379 int glbMemInstAvail; 380 381 void 382 regStats(); 383 384 LdsState & 385 getLds() const 386 { 387 return lds; 388 } 389 390 int32_t 391 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; 392 393 int cacheLineSize() const { return _cacheLineSize; } 394 395 bool 396 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); 397 398 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; 399 pageDataStruct pageAccesses; 400 401 class CUExitCallback : public Callback 402 { 403 private: 404 ComputeUnit *computeUnit; 405 406 public: 407 virtual ~CUExitCallback() { } 408 409 CUExitCallback(ComputeUnit *_cu) 410 { 411 computeUnit = _cu; 412 } 413 414 virtual void 415 process(); 416 }; 417 418 CUExitCallback *cuExitCallback; 419 420 /** Data access Port **/ 421 class DataPort : public MasterPort 422 { 423 public: 424 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 425 : MasterPort(_name, _cu), computeUnit(_cu), 426 index(_index) { } 427 428 bool snoopRangeSent; 429 430 struct SenderState : public Packet::SenderState 431 { 432 GPUDynInstPtr _gpuDynInst; 433 int port_index; 434 Packet::SenderState *saved; 435 436 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, 437 Packet::SenderState *sender_state=nullptr) 438 : _gpuDynInst(gpuDynInst), 439 port_index(_port_index), 440 saved(sender_state) { } 441 }; 442 443 class MemReqEvent : public Event 444 { 445 private: 446 DataPort *dataPort; 447 PacketPtr pkt; 448 449 public: 450 MemReqEvent(DataPort *_data_port, PacketPtr _pkt) 451 : Event(), dataPort(_data_port), pkt(_pkt) 452 { 453 setFlags(Event::AutoDelete); 454 } 455 456 void process(); 457 const char *description() const; 458 }; 459 460 class MemRespEvent : public Event 461 { 462 private: 463 DataPort *dataPort; 464 PacketPtr pkt; 465 466 public: 467 MemRespEvent(DataPort *_data_port, PacketPtr _pkt) 468 : Event(), dataPort(_data_port), pkt(_pkt) 469 { 470 setFlags(Event::AutoDelete); 471 } 472 473 void process(); 474 const char *description() const; 475 }; 476 477 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; 478 479 protected: 480 ComputeUnit *computeUnit; 481 int index; 482 483 virtual bool recvTimingResp(PacketPtr pkt); 484 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 485 virtual void recvFunctional(PacketPtr pkt) { } 486 virtual void recvRangeChange() { } 487 virtual void recvReqRetry(); 488 489 virtual void 490 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 491 { 492 resp.clear(); 493 snoop = true; 494 } 495 496 }; 497 498 // Instruction cache access port 499 class SQCPort : public MasterPort 500 { 501 public: 502 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 503 : MasterPort(_name, _cu), computeUnit(_cu), 504 index(_index) { } 505 506 bool snoopRangeSent; 507 508 struct SenderState : public Packet::SenderState 509 { 510 Wavefront *wavefront; 511 Packet::SenderState *saved; 512 513 SenderState(Wavefront *_wavefront, Packet::SenderState 514 *sender_state=nullptr) 515 : wavefront(_wavefront), saved(sender_state) { } 516 }; 517 518 std::deque<std::pair<PacketPtr, Wavefront*>> retries; 519 520 protected: 521 ComputeUnit *computeUnit; 522 int index; 523 524 virtual bool recvTimingResp(PacketPtr pkt); 525 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 526 virtual void recvFunctional(PacketPtr pkt) { } 527 virtual void recvRangeChange() { } 528 virtual void recvReqRetry(); 529 530 virtual void 531 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 532 { 533 resp.clear(); 534 snoop = true; 535 } 536 }; 537 538 /** Data TLB port **/ 539 class DTLBPort : public MasterPort 540 { 541 public: 542 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 543 : MasterPort(_name, _cu), computeUnit(_cu), 544 index(_index), stalled(false) 545 { } 546 547 bool isStalled() { return stalled; } 548 void stallPort() { stalled = true; } 549 void unstallPort() { stalled = false; } 550 551 /** 552 * here we queue all the translation requests that were 553 * not successfully sent. 554 */ 555 std::deque<PacketPtr> retries; 556 557 /** SenderState is information carried along with the packet 558 * throughout the TLB hierarchy 559 */ 560 struct SenderState: public Packet::SenderState 561 { 562 // the memInst that this is associated with 563 GPUDynInstPtr _gpuDynInst; 564 565 // the lane in the memInst this is associated with, so we send 566 // the memory request down the right port 567 int portIndex; 568 569 // constructor used for packets involved in timing accesses 570 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) 571 : _gpuDynInst(gpuDynInst), portIndex(port_index) { } 572 573 }; 574 575 protected: 576 ComputeUnit *computeUnit; 577 int index; 578 bool stalled; 579 580 virtual bool recvTimingResp(PacketPtr pkt); 581 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 582 virtual void recvFunctional(PacketPtr pkt) { } 583 virtual void recvRangeChange() { } 584 virtual void recvReqRetry(); 585 }; 586 587 class ITLBPort : public MasterPort 588 { 589 public: 590 ITLBPort(const std::string &_name, ComputeUnit *_cu) 591 : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } 592 593 594 bool isStalled() { return stalled; } 595 void stallPort() { stalled = true; } 596 void unstallPort() { stalled = false; } 597 598 /** 599 * here we queue all the translation requests that were 600 * not successfully sent. 601 */ 602 std::deque<PacketPtr> retries; 603 604 /** SenderState is information carried along with the packet 605 * throughout the TLB hierarchy 606 */ 607 struct SenderState: public Packet::SenderState 608 { 609 // The wavefront associated with this request 610 Wavefront *wavefront; 611 612 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } 613 }; 614 615 protected: 616 ComputeUnit *computeUnit; 617 bool stalled; 618 619 virtual bool recvTimingResp(PacketPtr pkt); 620 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 621 virtual void recvFunctional(PacketPtr pkt) { } 622 virtual void recvRangeChange() { } 623 virtual void recvReqRetry(); 624 }; 625 626 /** 627 * the port intended to communicate between the CU and its LDS 628 */ 629 class LDSPort : public MasterPort 630 { 631 public: 632 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) 633 : MasterPort(_name, _cu, _id), computeUnit(_cu) 634 { 635 } 636 637 bool isStalled() const { return stalled; } 638 void stallPort() { stalled = true; } 639 void unstallPort() { stalled = false; } 640 641 /** 642 * here we queue all the requests that were 643 * not successfully sent. 644 */ 645 std::queue<PacketPtr> retries; 646 647 /** 648 * SenderState is information carried along with the packet, esp. the 649 * GPUDynInstPtr 650 */ 651 class SenderState: public Packet::SenderState 652 { 653 protected: 654 // The actual read/write/atomic request that goes with this command 655 GPUDynInstPtr _gpuDynInst = nullptr; 656 657 public: 658 SenderState(GPUDynInstPtr gpuDynInst): 659 _gpuDynInst(gpuDynInst) 660 { 661 } 662 663 GPUDynInstPtr 664 getMemInst() const 665 { 666 return _gpuDynInst; 667 } 668 }; 669 670 virtual bool 671 sendTimingReq(PacketPtr pkt); 672 673 protected: 674 675 bool stalled = false; ///< whether or not it is stalled 676 677 ComputeUnit *computeUnit; 678 679 virtual bool 680 recvTimingResp(PacketPtr pkt); 681 682 virtual Tick 683 recvAtomic(PacketPtr pkt) { return 0; } 684 685 virtual void 686 recvFunctional(PacketPtr pkt) 687 { 688 } 689 690 virtual void 691 recvRangeChange() 692 { 693 } 694 695 virtual void 696 recvReqRetry(); 697 }; 698 699 /** The port to access the Local Data Store 700 * Can be connected to a LDS object 701 */ 702 LDSPort *ldsPort = nullptr; 703 704 LDSPort * 705 getLdsPort() const 706 { 707 return ldsPort; 708 } 709 710 /** The memory port for SIMD data accesses. 711 * Can be connected to PhysMem for Ruby for timing simulations 712 */ 713 std::vector<DataPort*> memPort; 714 // port to the TLB hierarchy (i.e., the L1 TLB) 715 std::vector<DTLBPort*> tlbPort; 716 // port to the SQC (i.e. the I-cache) 717 SQCPort *sqcPort; 718 // port to the SQC TLB (there's a separate TLB for each I-cache) 719 ITLBPort *sqcTLBPort; 720 721 virtual BaseMasterPort& 722 getMasterPort(const std::string &if_name, PortID idx) 723 { 724 if (if_name == "memory_port") { 725 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), 726 this, idx); 727 return *memPort[idx]; 728 } else if (if_name == "translation_port") { 729 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), 730 this, idx); 731 return *tlbPort[idx]; 732 } else if (if_name == "sqc_port") { 733 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), 734 this, idx); 735 return *sqcPort; 736 } else if (if_name == "sqc_tlb_port") { 737 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); 738 return *sqcTLBPort; 739 } else if (if_name == "ldsPort") { 740 if (ldsPort) { 741 fatal("an LDS port was already allocated"); 742 } 743 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); 744 return *ldsPort; 745 } else { 746 panic("incorrect port name"); 747 } 748 } 749 750 // xact_cas_load() 751 class waveIdentifier 752 { 753 public: 754 waveIdentifier() { } 755 waveIdentifier(int _simdId, int _wfSlotId) 756 : simdId(_simdId), wfSlotId(_wfSlotId) { } 757 758 int simdId; 759 int wfSlotId; 760 }; 761 762 class waveQueue 763 { 764 public: 765 std::list<waveIdentifier> waveIDQueue; 766 }; 767 std::map<unsigned, waveQueue> xactCasLoadMap; 768 769 uint64_t getAndIncSeqNum() { return globalSeqNum++; } 770 771 private: 772 const int _cacheLineSize; 773 uint64_t globalSeqNum; 774 int wavefrontSize; 775 GPUStaticInst *kernelLaunchInst; 776}; 777 778#endif // __COMPUTE_UNIT_HH__ 779