// compute_unit.hh, revision 11308
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: John Kalamatianos, Anthony Gutierrez 34 */ 35 36#ifndef __COMPUTE_UNIT_HH__ 37#define __COMPUTE_UNIT_HH__ 38 39#include <deque> 40#include <map> 41#include <unordered_map> 42#include <vector> 43 44#include "base/callback.hh" 45#include "base/statistics.hh" 46#include "base/types.hh" 47#include "enums/PrefetchType.hh" 48#include "gpu-compute/exec_stage.hh" 49#include "gpu-compute/fetch_stage.hh" 50#include "gpu-compute/global_memory_pipeline.hh" 51#include "gpu-compute/local_memory_pipeline.hh" 52#include "gpu-compute/qstruct.hh" 53#include "gpu-compute/schedule_stage.hh" 54#include "gpu-compute/scoreboard_check_stage.hh" 55#include "mem/mem_object.hh" 56#include "mem/port.hh" 57 58static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; 59static const int MAX_WIDTH_FOR_MEM_INST = 32; 60 61class NDRange; 62class Shader; 63class VectorRegisterFile; 64 65struct ComputeUnitParams; 66 67enum EXEC_POLICY 68{ 69 OLDEST = 0, 70 RR 71}; 72 73// List of execution units 74enum EXEC_UNIT 75{ 76 SIMD0 = 0, 77 SIMD1, 78 SIMD2, 79 SIMD3, 80 GLBMEM_PIPE, 81 LDSMEM_PIPE, 82 NUM_UNITS 83}; 84 85enum TLB_CACHE 86{ 87 TLB_MISS_CACHE_MISS = 0, 88 TLB_MISS_CACHE_HIT, 89 TLB_HIT_CACHE_MISS, 90 TLB_HIT_CACHE_HIT 91}; 92 93class ComputeUnit : public MemObject 94{ 95 public: 96 FetchStage fetchStage; 97 ScoreboardCheckStage scoreboardCheckStage; 98 ScheduleStage scheduleStage; 99 ExecStage execStage; 100 GlobalMemPipeline globalMemoryPipe; 101 LocalMemPipeline localMemoryPipe; 102 103 // Buffers used to communicate between various pipeline stages 104 105 // List of waves which are ready to be scheduled. 106 // Each execution resource has a ready list. readyList is 107 // used to communicate between scoreboardCheck stage and 108 // schedule stage 109 // TODO: make enum to index readyList 110 std::vector<std::vector<Wavefront*>> readyList; 111 112 // Stores the status of waves. 
A READY implies the 113 // wave is ready to be scheduled this cycle and 114 // is already present in the readyList. waveStatusList is 115 // used to communicate between scoreboardCheck stage and 116 // schedule stage 117 // TODO: convert std::pair to a class to increase readability 118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; 119 120 // List of waves which will be dispatched to 121 // each execution resource. A FILLED implies 122 // dispatch list is non-empty and 123 // execution unit has something to execute 124 // this cycle. Currently, the dispatch list of 125 // an execution resource can hold only one wave because 126 // an execution resource can execute only one wave in a cycle. 127 // dispatchList is used to communicate between schedule 128 // and exec stage 129 // TODO: convert std::pair to a class to increase readability 130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; 131 132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's 133 int rrNextALUWp; 134 typedef ComputeUnitParams Params; 135 std::vector<std::vector<Wavefront*>> wfList; 136 int cu_id; 137 138 // array of vector register files, one per SIMD 139 std::vector<VectorRegisterFile*> vrf; 140 // Number of vector ALU units (SIMDs) in CU 141 int numSIMDs; 142 // number of pipe stages for bypassing data to next dependent single 143 // precision vector instruction inside the vector ALU pipeline 144 int spBypassPipeLength; 145 // number of pipe stages for bypassing data to next dependent double 146 // precision vector instruction inside the vector ALU pipeline 147 int dpBypassPipeLength; 148 // number of cycles per issue period 149 int issuePeriod; 150 151 // Number of global and local memory execution resources in CU 152 int numGlbMemUnits; 153 int numLocMemUnits; 154 // tracks the last cycle a vector instruction was executed on a SIMD 155 std::vector<uint64_t> lastExecCycle; 156 157 // true if we allow a separate TLB per lane 158 
bool perLaneTLB; 159 // if 0, TLB prefetching is off. 160 int prefetchDepth; 161 // if fixed-stride prefetching, this is the stride. 162 int prefetchStride; 163 164 class LastVaddrWave 165 { 166 public: 167 Addr vaddrs[VSZ]; 168 Addr& operator[](int idx) { 169 return vaddrs[idx]; 170 } 171 172 LastVaddrWave() { 173 for (int i = 0; i < VSZ; ++i) 174 vaddrs[i] = 0; 175 } 176 }; 177 178 LastVaddrWave lastVaddrCU; 179 std::vector<LastVaddrWave> lastVaddrPhase; 180 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; 181 Enums::PrefetchType prefetchType; 182 EXEC_POLICY exec_policy; 183 184 bool xact_cas_mode; 185 bool debugSegFault; 186 bool functionalTLB; 187 bool localMemBarrier; 188 189 /* 190 * for Counting page accesses 191 * 192 * cuExitCallback inherits from Callback. When you register a callback 193 * function as an exit callback, it will get added to an exit callback 194 * queue, such that on simulation exit, all callbacks in the callback 195 * queue will have their process() function called. 
196 */ 197 bool countPages; 198 199 Shader *shader; 200 uint32_t barrier_id; 201 // vector of Vector ALU (MACC) pipelines 202 std::vector<WaitClass> aluPipe; 203 // minimum issue period per SIMD unit (in cycles) 204 std::vector<WaitClass> wfWait; 205 206 // Resource control for Vector Register File->Global Memory pipe buses 207 std::vector<WaitClass> vrfToGlobalMemPipeBus; 208 // Resource control for Vector Register File->Local Memory pipe buses 209 std::vector<WaitClass> vrfToLocalMemPipeBus; 210 int nextGlbMemBus; 211 int nextLocMemBus; 212 // Resource control for global memory to VRF data/address bus 213 WaitClass glbMemToVrfBus; 214 // Resource control for local memory to VRF data/address bus 215 WaitClass locMemToVrfBus; 216 217 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes 218 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes 219 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store 220 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load 221 222 Tick req_tick_latency; 223 Tick resp_tick_latency; 224 225 // number of vector registers being reserved for each SIMD unit 226 std::vector<int> vectorRegsReserved; 227 // number of vector registers per SIMD unit 228 uint32_t numVecRegsPerSimd; 229 // Support for scheduling VGPR status update events 230 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; 231 std::vector<uint64_t> timestampVec; 232 std::vector<uint8_t> statusVec; 233 234 void 235 registerEvent(uint32_t simdId, 236 uint32_t regIdx, 237 uint32_t operandSize, 238 uint64_t when, 239 uint8_t newStatus) { 240 regIdxVec.push_back(std::make_pair(simdId, regIdx)); 241 timestampVec.push_back(when); 242 statusVec.push_back(newStatus); 243 if (operandSize > 4) { 244 regIdxVec.push_back(std::make_pair(simdId, 245 ((regIdx + 1) % 246 numVecRegsPerSimd))); 247 timestampVec.push_back(when); 248 statusVec.push_back(newStatus); 249 } 250 } 251 252 void updateEvents(); 253 254 
// this hash map will keep track of page divergence 255 // per memory instruction per wavefront. The hash map 256 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 257 std::map<Addr, int> pagesTouched; 258 259 ComputeUnit(const Params *p); 260 ~ComputeUnit(); 261 int spBypassLength() { return spBypassPipeLength; }; 262 int dpBypassLength() { return dpBypassPipeLength; }; 263 int storeBusLength() { return numCyclesPerStoreTransfer; }; 264 int loadBusLength() { return numCyclesPerLoadTransfer; }; 265 int wfSize() const { return wavefrontSize; }; 266 267 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 268 void exec(); 269 void initiateFetch(Wavefront *wavefront); 270 void fetch(PacketPtr pkt, Wavefront *wavefront); 271 void FillKernelState(Wavefront *w, NDRange *ndr); 272 273 void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], 274 int trueWgSizeTotal); 275 276 void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, 277 int trueWgSize[], int trueWgSizeTotal, 278 LdsChunk *ldsChunk, uint64_t origSpillMemStart); 279 280 void StartWorkgroup(NDRange *ndr); 281 int ReadyWorkgroup(NDRange *ndr); 282 283 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } 284 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } 285 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } 286 int GlbMemUnitId() { return GLBMEM_PIPE; } 287 int ShrMemUnitId() { return LDSMEM_PIPE; } 288 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } 289 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } 290 /* This function cycles through all the wavefronts in all the phases to see 291 * if all of the wavefronts which should be associated with one barrier 292 * (denoted with _barrier_id), are all at the same barrier in the program 293 * (denoted by bcnt). When the number at the barrier matches bslots, then 294 * return true. 
295 */ 296 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); 297 bool cedeSIMD(int simdId, int wfSlotId); 298 299 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); 300 virtual void init(); 301 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 302 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 303 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, 304 bool kernelLaunch=true, 305 RequestPtr req=nullptr); 306 void handleMemPacket(PacketPtr pkt, int memport_index); 307 bool processTimingPacket(PacketPtr pkt); 308 void processFetchReturn(PacketPtr pkt); 309 void updatePageDivergenceDist(Addr addr); 310 311 MasterID masterId() { return _masterId; } 312 313 bool isDone() const; 314 bool isSimdDone(uint32_t) const; 315 316 protected: 317 MasterID _masterId; 318 319 LdsState &lds; 320 321 public: 322 // the following stats compute the avg. TLB accesslatency per 323 // uncoalesced request (only for data) 324 Stats::Scalar tlbRequests; 325 Stats::Scalar tlbCycles; 326 Stats::Formula tlbLatency; 327 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. 328 Stats::Vector hitsPerTLBLevel; 329 330 Stats::Scalar ldsBankAccesses; 331 Stats::Distribution ldsBankConflictDist; 332 333 // over all memory instructions executed over all wavefronts 334 // how many touched 0-4 pages, 4-8, ..., 60-64 pages 335 Stats::Distribution pageDivergenceDist; 336 Stats::Scalar dynamicGMemInstrCnt; 337 Stats::Scalar dynamicLMemInstrCnt; 338 339 Stats::Scalar wgBlockedDueLdsAllocation; 340 // Number of instructions executed, i.e. 
if 64 (or 32 or 7) lanes are active 341 // when the instruction is committed, this number is still incremented by 1 342 Stats::Scalar numInstrExecuted; 343 // Number of cycles among successive instruction executions across all 344 // wavefronts of the same CU 345 Stats::Distribution execRateDist; 346 // number of individual vector operations executed 347 Stats::Scalar numVecOpsExecuted; 348 // Total cycles that something is running on the GPU 349 Stats::Scalar totalCycles; 350 Stats::Formula vpc; // vector ops per cycle 351 Stats::Formula ipc; // vector instructions per cycle 352 Stats::Distribution controlFlowDivergenceDist; 353 Stats::Distribution activeLanesPerGMemInstrDist; 354 Stats::Distribution activeLanesPerLMemInstrDist; 355 // number of vector ALU instructions received 356 Stats::Formula numALUInstsExecuted; 357 // number of times a WG can not start due to lack of free VGPRs in SIMDs 358 Stats::Scalar numTimesWgBlockedDueVgprAlloc; 359 Stats::Scalar numCASOps; 360 Stats::Scalar numFailedCASOps; 361 Stats::Scalar completedWfs; 362 // flag per vector SIMD unit that is set when there is at least one 363 // WV that has a vector ALU instruction as the oldest in its 364 // Instruction Buffer: Defined in the Scoreboard stage, consumed 365 // by the Execute stage. 
366 std::vector<bool> vectorAluInstAvail; 367 // number of available (oldest) LDS instructions that could have 368 // been issued to the LDS at a specific issue slot 369 int shrMemInstAvail; 370 // number of available Global memory instructions that could have 371 // been issued to TCP at a specific issue slot 372 int glbMemInstAvail; 373 374 void 375 regStats(); 376 377 LdsState & 378 getLds() const 379 { 380 return lds; 381 } 382 383 int32_t 384 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; 385 386 bool 387 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); 388 389 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; 390 pageDataStruct pageAccesses; 391 392 class CUExitCallback : public Callback 393 { 394 private: 395 ComputeUnit *computeUnit; 396 397 public: 398 virtual ~CUExitCallback() { } 399 400 CUExitCallback(ComputeUnit *_cu) 401 { 402 computeUnit = _cu; 403 } 404 405 virtual void 406 process(); 407 }; 408 409 CUExitCallback *cuExitCallback; 410 411 /** Data access Port **/ 412 class DataPort : public MasterPort 413 { 414 public: 415 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 416 : MasterPort(_name, _cu), computeUnit(_cu), 417 index(_index) { } 418 419 bool snoopRangeSent; 420 421 struct SenderState : public Packet::SenderState 422 { 423 GPUDynInstPtr _gpuDynInst; 424 int port_index; 425 Packet::SenderState *saved; 426 427 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, 428 Packet::SenderState *sender_state=nullptr) 429 : _gpuDynInst(gpuDynInst), 430 port_index(_port_index), 431 saved(sender_state) { } 432 }; 433 434 class MemReqEvent : public Event 435 { 436 private: 437 DataPort *dataPort; 438 PacketPtr pkt; 439 440 public: 441 MemReqEvent(DataPort *_data_port, PacketPtr _pkt) 442 : Event(), dataPort(_data_port), pkt(_pkt) 443 { 444 setFlags(Event::AutoDelete); 445 } 446 447 void process(); 448 const char *description() const; 449 }; 450 451 class 
MemRespEvent : public Event 452 { 453 private: 454 DataPort *dataPort; 455 PacketPtr pkt; 456 457 public: 458 MemRespEvent(DataPort *_data_port, PacketPtr _pkt) 459 : Event(), dataPort(_data_port), pkt(_pkt) 460 { 461 setFlags(Event::AutoDelete); 462 } 463 464 void process(); 465 const char *description() const; 466 }; 467 468 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; 469 470 protected: 471 ComputeUnit *computeUnit; 472 int index; 473 474 virtual bool recvTimingResp(PacketPtr pkt); 475 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 476 virtual void recvFunctional(PacketPtr pkt) { } 477 virtual void recvRangeChange() { } 478 virtual void recvReqRetry(); 479 480 virtual void 481 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 482 { 483 resp.clear(); 484 snoop = true; 485 } 486 487 }; 488 489 // Instruction cache access port 490 class SQCPort : public MasterPort 491 { 492 public: 493 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 494 : MasterPort(_name, _cu), computeUnit(_cu), 495 index(_index) { } 496 497 bool snoopRangeSent; 498 499 struct SenderState : public Packet::SenderState 500 { 501 Wavefront *wavefront; 502 Packet::SenderState *saved; 503 504 SenderState(Wavefront *_wavefront, Packet::SenderState 505 *sender_state=nullptr) 506 : wavefront(_wavefront), saved(sender_state) { } 507 }; 508 509 std::deque<std::pair<PacketPtr, Wavefront*>> retries; 510 511 protected: 512 ComputeUnit *computeUnit; 513 int index; 514 515 virtual bool recvTimingResp(PacketPtr pkt); 516 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 517 virtual void recvFunctional(PacketPtr pkt) { } 518 virtual void recvRangeChange() { } 519 virtual void recvReqRetry(); 520 521 virtual void 522 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 523 { 524 resp.clear(); 525 snoop = true; 526 } 527 }; 528 529 /** Data TLB port **/ 530 class DTLBPort : public MasterPort 531 { 532 public: 533 DTLBPort(const std::string &_name, ComputeUnit *_cu, 
PortID _index) 534 : MasterPort(_name, _cu), computeUnit(_cu), 535 index(_index), stalled(false) 536 { } 537 538 bool isStalled() { return stalled; } 539 void stallPort() { stalled = true; } 540 void unstallPort() { stalled = false; } 541 542 /** 543 * here we queue all the translation requests that were 544 * not successfully sent. 545 */ 546 std::deque<PacketPtr> retries; 547 548 /** SenderState is information carried along with the packet 549 * throughout the TLB hierarchy 550 */ 551 struct SenderState: public Packet::SenderState 552 { 553 // the memInst that this is associated with 554 GPUDynInstPtr _gpuDynInst; 555 556 // the lane in the memInst this is associated with, so we send 557 // the memory request down the right port 558 int portIndex; 559 560 // constructor used for packets involved in timing accesses 561 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) 562 : _gpuDynInst(gpuDynInst), portIndex(port_index) { } 563 564 }; 565 566 protected: 567 ComputeUnit *computeUnit; 568 int index; 569 bool stalled; 570 571 virtual bool recvTimingResp(PacketPtr pkt); 572 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 573 virtual void recvFunctional(PacketPtr pkt) { } 574 virtual void recvRangeChange() { } 575 virtual void recvReqRetry(); 576 }; 577 578 class ITLBPort : public MasterPort 579 { 580 public: 581 ITLBPort(const std::string &_name, ComputeUnit *_cu) 582 : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } 583 584 585 bool isStalled() { return stalled; } 586 void stallPort() { stalled = true; } 587 void unstallPort() { stalled = false; } 588 589 /** 590 * here we queue all the translation requests that were 591 * not successfully sent. 
592 */ 593 std::deque<PacketPtr> retries; 594 595 /** SenderState is information carried along with the packet 596 * throughout the TLB hierarchy 597 */ 598 struct SenderState: public Packet::SenderState 599 { 600 // The wavefront associated with this request 601 Wavefront *wavefront; 602 603 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } 604 }; 605 606 protected: 607 ComputeUnit *computeUnit; 608 bool stalled; 609 610 virtual bool recvTimingResp(PacketPtr pkt); 611 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 612 virtual void recvFunctional(PacketPtr pkt) { } 613 virtual void recvRangeChange() { } 614 virtual void recvReqRetry(); 615 }; 616 617 /** 618 * the port intended to communicate between the CU and its LDS 619 */ 620 class LDSPort : public MasterPort 621 { 622 public: 623 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) 624 : MasterPort(_name, _cu, _id), computeUnit(_cu) 625 { 626 } 627 628 bool isStalled() const { return stalled; } 629 void stallPort() { stalled = true; } 630 void unstallPort() { stalled = false; } 631 632 /** 633 * here we queue all the requests that were 634 * not successfully sent. 635 */ 636 std::queue<PacketPtr> retries; 637 638 /** 639 * SenderState is information carried along with the packet, esp. 
the 640 * GPUDynInstPtr 641 */ 642 class SenderState: public Packet::SenderState 643 { 644 protected: 645 // The actual read/write/atomic request that goes with this command 646 GPUDynInstPtr _gpuDynInst = nullptr; 647 648 public: 649 SenderState(GPUDynInstPtr gpuDynInst): 650 _gpuDynInst(gpuDynInst) 651 { 652 } 653 654 GPUDynInstPtr 655 getMemInst() const 656 { 657 return _gpuDynInst; 658 } 659 }; 660 661 virtual bool 662 sendTimingReq(PacketPtr pkt); 663 664 protected: 665 666 bool stalled = false; ///< whether or not it is stalled 667 668 ComputeUnit *computeUnit; 669 670 virtual bool 671 recvTimingResp(PacketPtr pkt); 672 673 virtual Tick 674 recvAtomic(PacketPtr pkt) { return 0; } 675 676 virtual void 677 recvFunctional(PacketPtr pkt) 678 { 679 } 680 681 virtual void 682 recvRangeChange() 683 { 684 } 685 686 virtual void 687 recvReqRetry(); 688 }; 689 690 /** The port to access the Local Data Store 691 * Can be connected to a LDS object 692 */ 693 LDSPort *ldsPort = nullptr; 694 695 LDSPort * 696 getLdsPort() const 697 { 698 return ldsPort; 699 } 700 701 /** The memory port for SIMD data accesses. 702 * Can be connected to PhysMem for Ruby for timing simulations 703 */ 704 std::vector<DataPort*> memPort; 705 // port to the TLB hierarchy (i.e., the L1 TLB) 706 std::vector<DTLBPort*> tlbPort; 707 // port to the SQC (i.e. 
the I-cache) 708 SQCPort *sqcPort; 709 // port to the SQC TLB (there's a separate TLB for each I-cache) 710 ITLBPort *sqcTLBPort; 711 712 virtual BaseMasterPort& 713 getMasterPort(const std::string &if_name, PortID idx) 714 { 715 if (if_name == "memory_port") { 716 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), 717 this, idx); 718 return *memPort[idx]; 719 } else if (if_name == "translation_port") { 720 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), 721 this, idx); 722 return *tlbPort[idx]; 723 } else if (if_name == "sqc_port") { 724 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), 725 this, idx); 726 return *sqcPort; 727 } else if (if_name == "sqc_tlb_port") { 728 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); 729 return *sqcTLBPort; 730 } else if (if_name == "ldsPort") { 731 if (ldsPort) { 732 fatal("an LDS port was already allocated"); 733 } 734 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); 735 return *ldsPort; 736 } else { 737 panic("incorrect port name"); 738 } 739 } 740 741 // xact_cas_load() 742 class waveIdentifier 743 { 744 public: 745 waveIdentifier() { } 746 waveIdentifier(int _simdId, int _wfSlotId) 747 : simdId(_simdId), wfSlotId(_wfSlotId) { } 748 749 int simdId; 750 int wfSlotId; 751 }; 752 753 class waveQueue 754 { 755 public: 756 std::list<waveIdentifier> waveIDQueue; 757 }; 758 std::map<unsigned, waveQueue> xactCasLoadMap; 759 760 uint64_t getAndIncSeqNum() { return globalSeqNum++; } 761 762 private: 763 uint64_t globalSeqNum; 764 int wavefrontSize; 765}; 766 767#endif // __COMPUTE_UNIT_HH__ 768