// compute_unit.hh revision 11534
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: John Kalamatianos, Anthony Gutierrez 34 */ 35 36#ifndef __COMPUTE_UNIT_HH__ 37#define __COMPUTE_UNIT_HH__ 38 39#include <deque> 40#include <map> 41#include <unordered_map> 42#include <vector> 43 44#include "base/callback.hh" 45#include "base/statistics.hh" 46#include "base/types.hh" 47#include "enums/PrefetchType.hh" 48#include "gpu-compute/exec_stage.hh" 49#include "gpu-compute/fetch_stage.hh" 50#include "gpu-compute/global_memory_pipeline.hh" 51#include "gpu-compute/local_memory_pipeline.hh" 52#include "gpu-compute/qstruct.hh" 53#include "gpu-compute/schedule_stage.hh" 54#include "gpu-compute/scoreboard_check_stage.hh" 55#include "mem/mem_object.hh" 56#include "mem/port.hh" 57 58static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; 59static const int MAX_WIDTH_FOR_MEM_INST = 32; 60 61class NDRange; 62class Shader; 63class VectorRegisterFile; 64 65struct ComputeUnitParams; 66 67enum EXEC_POLICY 68{ 69 OLDEST = 0, 70 RR 71}; 72 73// List of execution units 74enum EXEC_UNIT 75{ 76 SIMD0 = 0, 77 SIMD1, 78 SIMD2, 79 SIMD3, 80 GLBMEM_PIPE, 81 LDSMEM_PIPE, 82 NUM_UNITS 83}; 84 85enum TLB_CACHE 86{ 87 TLB_MISS_CACHE_MISS = 0, 88 TLB_MISS_CACHE_HIT, 89 TLB_HIT_CACHE_MISS, 90 TLB_HIT_CACHE_HIT 91}; 92 93class ComputeUnit : public MemObject 94{ 95 public: 96 FetchStage fetchStage; 97 ScoreboardCheckStage scoreboardCheckStage; 98 ScheduleStage scheduleStage; 99 ExecStage execStage; 100 GlobalMemPipeline globalMemoryPipe; 101 LocalMemPipeline localMemoryPipe; 102 103 // Buffers used to communicate between various pipeline stages 104 105 // List of waves which are ready to be scheduled. 106 // Each execution resource has a ready list. readyList is 107 // used to communicate between scoreboardCheck stage and 108 // schedule stage 109 // TODO: make enum to index readyList 110 std::vector<std::vector<Wavefront*>> readyList; 111 112 // Stores the status of waves. 
A READY implies the 113 // wave is ready to be scheduled this cycle and 114 // is already present in the readyList. waveStatusList is 115 // used to communicate between scoreboardCheck stage and 116 // schedule stage 117 // TODO: convert std::pair to a class to increase readability 118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; 119 120 // List of waves which will be dispatched to 121 // each execution resource. A FILLED implies 122 // dispatch list is non-empty and 123 // execution unit has something to execute 124 // this cycle. Currently, the dispatch list of 125 // an execution resource can hold only one wave because 126 // an execution resource can execute only one wave in a cycle. 127 // dispatchList is used to communicate between schedule 128 // and exec stage 129 // TODO: convert std::pair to a class to increase readability 130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; 131 132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's 133 int rrNextALUWp; 134 typedef ComputeUnitParams Params; 135 std::vector<std::vector<Wavefront*>> wfList; 136 int cu_id; 137 138 // array of vector register files, one per SIMD 139 std::vector<VectorRegisterFile*> vrf; 140 // Number of vector ALU units (SIMDs) in CU 141 int numSIMDs; 142 // number of pipe stages for bypassing data to next dependent single 143 // precision vector instruction inside the vector ALU pipeline 144 int spBypassPipeLength; 145 // number of pipe stages for bypassing data to next dependent double 146 // precision vector instruction inside the vector ALU pipeline 147 int dpBypassPipeLength; 148 // number of cycles per issue period 149 int issuePeriod; 150 151 // Number of global and local memory execution resources in CU 152 int numGlbMemUnits; 153 int numLocMemUnits; 154 // tracks the last cycle a vector instruction was executed on a SIMD 155 std::vector<uint64_t> lastExecCycle; 156 157 // true if we allow a separate TLB per lane 158 
bool perLaneTLB; 159 // if 0, TLB prefetching is off. 160 int prefetchDepth; 161 // if fixed-stride prefetching, this is the stride. 162 int prefetchStride; 163 164 std::vector<Addr> lastVaddrCU; 165 std::vector<std::vector<Addr>> lastVaddrSimd; 166 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; 167 Enums::PrefetchType prefetchType; 168 EXEC_POLICY exec_policy; 169 170 bool xact_cas_mode; 171 bool debugSegFault; 172 bool functionalTLB; 173 bool localMemBarrier; 174 175 /* 176 * for Counting page accesses 177 * 178 * cuExitCallback inherits from Callback. When you register a callback 179 * function as an exit callback, it will get added to an exit callback 180 * queue, such that on simulation exit, all callbacks in the callback 181 * queue will have their process() function called. 182 */ 183 bool countPages; 184 185 Shader *shader; 186 uint32_t barrier_id; 187 // vector of Vector ALU (MACC) pipelines 188 std::vector<WaitClass> aluPipe; 189 // minimum issue period per SIMD unit (in cycles) 190 std::vector<WaitClass> wfWait; 191 192 // Resource control for Vector Register File->Global Memory pipe buses 193 std::vector<WaitClass> vrfToGlobalMemPipeBus; 194 // Resource control for Vector Register File->Local Memory pipe buses 195 std::vector<WaitClass> vrfToLocalMemPipeBus; 196 int nextGlbMemBus; 197 int nextLocMemBus; 198 // Resource control for global memory to VRF data/address bus 199 WaitClass glbMemToVrfBus; 200 // Resource control for local memory to VRF data/address bus 201 WaitClass locMemToVrfBus; 202 203 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes 204 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes 205 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store 206 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load 207 208 Tick req_tick_latency; 209 Tick resp_tick_latency; 210 211 // number of vector registers being reserved for each SIMD unit 212 
std::vector<int> vectorRegsReserved; 213 // number of vector registers per SIMD unit 214 uint32_t numVecRegsPerSimd; 215 // Support for scheduling VGPR status update events 216 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; 217 std::vector<uint64_t> timestampVec; 218 std::vector<uint8_t> statusVec; 219 220 void 221 registerEvent(uint32_t simdId, 222 uint32_t regIdx, 223 uint32_t operandSize, 224 uint64_t when, 225 uint8_t newStatus) { 226 regIdxVec.push_back(std::make_pair(simdId, regIdx)); 227 timestampVec.push_back(when); 228 statusVec.push_back(newStatus); 229 if (operandSize > 4) { 230 regIdxVec.push_back(std::make_pair(simdId, 231 ((regIdx + 1) % 232 numVecRegsPerSimd))); 233 timestampVec.push_back(when); 234 statusVec.push_back(newStatus); 235 } 236 } 237 238 void updateEvents(); 239 240 // this hash map will keep track of page divergence 241 // per memory instruction per wavefront. The hash map 242 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 243 std::map<Addr, int> pagesTouched; 244 245 ComputeUnit(const Params *p); 246 ~ComputeUnit(); 247 int spBypassLength() { return spBypassPipeLength; }; 248 int dpBypassLength() { return dpBypassPipeLength; }; 249 int storeBusLength() { return numCyclesPerStoreTransfer; }; 250 int loadBusLength() { return numCyclesPerLoadTransfer; }; 251 int wfSize() const { return wavefrontSize; }; 252 253 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 254 void exec(); 255 void initiateFetch(Wavefront *wavefront); 256 void fetch(PacketPtr pkt, Wavefront *wavefront); 257 void FillKernelState(Wavefront *w, NDRange *ndr); 258 259 void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], 260 int trueWgSizeTotal); 261 262 void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, 263 int trueWgSize[], int trueWgSizeTotal, 264 LdsChunk *ldsChunk, uint64_t origSpillMemStart); 265 266 void StartWorkgroup(NDRange *ndr); 267 int ReadyWorkgroup(NDRange *ndr); 268 269 bool isVecAlu(int 
unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } 270 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } 271 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } 272 int GlbMemUnitId() { return GLBMEM_PIPE; } 273 int ShrMemUnitId() { return LDSMEM_PIPE; } 274 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } 275 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } 276 /* This function cycles through all the wavefronts in all the phases to see 277 * if all of the wavefronts which should be associated with one barrier 278 * (denoted with _barrier_id), are all at the same barrier in the program 279 * (denoted by bcnt). When the number at the barrier matches bslots, then 280 * return true. 281 */ 282 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); 283 bool cedeSIMD(int simdId, int wfSlotId); 284 285 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); 286 virtual void init(); 287 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 288 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 289 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, 290 bool kernelLaunch=true, 291 RequestPtr req=nullptr); 292 void handleMemPacket(PacketPtr pkt, int memport_index); 293 bool processTimingPacket(PacketPtr pkt); 294 void processFetchReturn(PacketPtr pkt); 295 void updatePageDivergenceDist(Addr addr); 296 297 MasterID masterId() { return _masterId; } 298 299 bool isDone() const; 300 bool isSimdDone(uint32_t) const; 301 302 protected: 303 MasterID _masterId; 304 305 LdsState &lds; 306 307 public: 308 // the following stats compute the avg. TLB accesslatency per 309 // uncoalesced request (only for data) 310 Stats::Scalar tlbRequests; 311 Stats::Scalar tlbCycles; 312 Stats::Formula tlbLatency; 313 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. 
314 Stats::Vector hitsPerTLBLevel; 315 316 Stats::Scalar ldsBankAccesses; 317 Stats::Distribution ldsBankConflictDist; 318 319 // over all memory instructions executed over all wavefronts 320 // how many touched 0-4 pages, 4-8, ..., 60-64 pages 321 Stats::Distribution pageDivergenceDist; 322 Stats::Scalar dynamicGMemInstrCnt; 323 Stats::Scalar dynamicLMemInstrCnt; 324 325 Stats::Scalar wgBlockedDueLdsAllocation; 326 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active 327 // when the instruction is committed, this number is still incremented by 1 328 Stats::Scalar numInstrExecuted; 329 // Number of cycles among successive instruction executions across all 330 // wavefronts of the same CU 331 Stats::Distribution execRateDist; 332 // number of individual vector operations executed 333 Stats::Scalar numVecOpsExecuted; 334 // Total cycles that something is running on the GPU 335 Stats::Scalar totalCycles; 336 Stats::Formula vpc; // vector ops per cycle 337 Stats::Formula ipc; // vector instructions per cycle 338 Stats::Distribution controlFlowDivergenceDist; 339 Stats::Distribution activeLanesPerGMemInstrDist; 340 Stats::Distribution activeLanesPerLMemInstrDist; 341 // number of vector ALU instructions received 342 Stats::Formula numALUInstsExecuted; 343 // number of times a WG can not start due to lack of free VGPRs in SIMDs 344 Stats::Scalar numTimesWgBlockedDueVgprAlloc; 345 Stats::Scalar numCASOps; 346 Stats::Scalar numFailedCASOps; 347 Stats::Scalar completedWfs; 348 // flag per vector SIMD unit that is set when there is at least one 349 // WV that has a vector ALU instruction as the oldest in its 350 // Instruction Buffer: Defined in the Scoreboard stage, consumed 351 // by the Execute stage. 
352 std::vector<bool> vectorAluInstAvail; 353 // number of available (oldest) LDS instructions that could have 354 // been issued to the LDS at a specific issue slot 355 int shrMemInstAvail; 356 // number of available Global memory instructions that could have 357 // been issued to TCP at a specific issue slot 358 int glbMemInstAvail; 359 360 void 361 regStats(); 362 363 LdsState & 364 getLds() const 365 { 366 return lds; 367 } 368 369 int32_t 370 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; 371 372 bool 373 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); 374 375 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; 376 pageDataStruct pageAccesses; 377 378 class CUExitCallback : public Callback 379 { 380 private: 381 ComputeUnit *computeUnit; 382 383 public: 384 virtual ~CUExitCallback() { } 385 386 CUExitCallback(ComputeUnit *_cu) 387 { 388 computeUnit = _cu; 389 } 390 391 virtual void 392 process(); 393 }; 394 395 CUExitCallback *cuExitCallback; 396 397 /** Data access Port **/ 398 class DataPort : public MasterPort 399 { 400 public: 401 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 402 : MasterPort(_name, _cu), computeUnit(_cu), 403 index(_index) { } 404 405 bool snoopRangeSent; 406 407 struct SenderState : public Packet::SenderState 408 { 409 GPUDynInstPtr _gpuDynInst; 410 int port_index; 411 Packet::SenderState *saved; 412 413 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, 414 Packet::SenderState *sender_state=nullptr) 415 : _gpuDynInst(gpuDynInst), 416 port_index(_port_index), 417 saved(sender_state) { } 418 }; 419 420 class MemReqEvent : public Event 421 { 422 private: 423 DataPort *dataPort; 424 PacketPtr pkt; 425 426 public: 427 MemReqEvent(DataPort *_data_port, PacketPtr _pkt) 428 : Event(), dataPort(_data_port), pkt(_pkt) 429 { 430 setFlags(Event::AutoDelete); 431 } 432 433 void process(); 434 const char *description() const; 435 }; 436 437 class 
MemRespEvent : public Event 438 { 439 private: 440 DataPort *dataPort; 441 PacketPtr pkt; 442 443 public: 444 MemRespEvent(DataPort *_data_port, PacketPtr _pkt) 445 : Event(), dataPort(_data_port), pkt(_pkt) 446 { 447 setFlags(Event::AutoDelete); 448 } 449 450 void process(); 451 const char *description() const; 452 }; 453 454 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; 455 456 protected: 457 ComputeUnit *computeUnit; 458 int index; 459 460 virtual bool recvTimingResp(PacketPtr pkt); 461 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 462 virtual void recvFunctional(PacketPtr pkt) { } 463 virtual void recvRangeChange() { } 464 virtual void recvReqRetry(); 465 466 virtual void 467 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 468 { 469 resp.clear(); 470 snoop = true; 471 } 472 473 }; 474 475 // Instruction cache access port 476 class SQCPort : public MasterPort 477 { 478 public: 479 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 480 : MasterPort(_name, _cu), computeUnit(_cu), 481 index(_index) { } 482 483 bool snoopRangeSent; 484 485 struct SenderState : public Packet::SenderState 486 { 487 Wavefront *wavefront; 488 Packet::SenderState *saved; 489 490 SenderState(Wavefront *_wavefront, Packet::SenderState 491 *sender_state=nullptr) 492 : wavefront(_wavefront), saved(sender_state) { } 493 }; 494 495 std::deque<std::pair<PacketPtr, Wavefront*>> retries; 496 497 protected: 498 ComputeUnit *computeUnit; 499 int index; 500 501 virtual bool recvTimingResp(PacketPtr pkt); 502 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 503 virtual void recvFunctional(PacketPtr pkt) { } 504 virtual void recvRangeChange() { } 505 virtual void recvReqRetry(); 506 507 virtual void 508 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 509 { 510 resp.clear(); 511 snoop = true; 512 } 513 }; 514 515 /** Data TLB port **/ 516 class DTLBPort : public MasterPort 517 { 518 public: 519 DTLBPort(const std::string &_name, ComputeUnit *_cu, 
PortID _index) 520 : MasterPort(_name, _cu), computeUnit(_cu), 521 index(_index), stalled(false) 522 { } 523 524 bool isStalled() { return stalled; } 525 void stallPort() { stalled = true; } 526 void unstallPort() { stalled = false; } 527 528 /** 529 * here we queue all the translation requests that were 530 * not successfully sent. 531 */ 532 std::deque<PacketPtr> retries; 533 534 /** SenderState is information carried along with the packet 535 * throughout the TLB hierarchy 536 */ 537 struct SenderState: public Packet::SenderState 538 { 539 // the memInst that this is associated with 540 GPUDynInstPtr _gpuDynInst; 541 542 // the lane in the memInst this is associated with, so we send 543 // the memory request down the right port 544 int portIndex; 545 546 // constructor used for packets involved in timing accesses 547 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) 548 : _gpuDynInst(gpuDynInst), portIndex(port_index) { } 549 550 }; 551 552 protected: 553 ComputeUnit *computeUnit; 554 int index; 555 bool stalled; 556 557 virtual bool recvTimingResp(PacketPtr pkt); 558 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 559 virtual void recvFunctional(PacketPtr pkt) { } 560 virtual void recvRangeChange() { } 561 virtual void recvReqRetry(); 562 }; 563 564 class ITLBPort : public MasterPort 565 { 566 public: 567 ITLBPort(const std::string &_name, ComputeUnit *_cu) 568 : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } 569 570 571 bool isStalled() { return stalled; } 572 void stallPort() { stalled = true; } 573 void unstallPort() { stalled = false; } 574 575 /** 576 * here we queue all the translation requests that were 577 * not successfully sent. 
578 */ 579 std::deque<PacketPtr> retries; 580 581 /** SenderState is information carried along with the packet 582 * throughout the TLB hierarchy 583 */ 584 struct SenderState: public Packet::SenderState 585 { 586 // The wavefront associated with this request 587 Wavefront *wavefront; 588 589 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } 590 }; 591 592 protected: 593 ComputeUnit *computeUnit; 594 bool stalled; 595 596 virtual bool recvTimingResp(PacketPtr pkt); 597 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 598 virtual void recvFunctional(PacketPtr pkt) { } 599 virtual void recvRangeChange() { } 600 virtual void recvReqRetry(); 601 }; 602 603 /** 604 * the port intended to communicate between the CU and its LDS 605 */ 606 class LDSPort : public MasterPort 607 { 608 public: 609 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) 610 : MasterPort(_name, _cu, _id), computeUnit(_cu) 611 { 612 } 613 614 bool isStalled() const { return stalled; } 615 void stallPort() { stalled = true; } 616 void unstallPort() { stalled = false; } 617 618 /** 619 * here we queue all the requests that were 620 * not successfully sent. 621 */ 622 std::queue<PacketPtr> retries; 623 624 /** 625 * SenderState is information carried along with the packet, esp. 
the 626 * GPUDynInstPtr 627 */ 628 class SenderState: public Packet::SenderState 629 { 630 protected: 631 // The actual read/write/atomic request that goes with this command 632 GPUDynInstPtr _gpuDynInst = nullptr; 633 634 public: 635 SenderState(GPUDynInstPtr gpuDynInst): 636 _gpuDynInst(gpuDynInst) 637 { 638 } 639 640 GPUDynInstPtr 641 getMemInst() const 642 { 643 return _gpuDynInst; 644 } 645 }; 646 647 virtual bool 648 sendTimingReq(PacketPtr pkt); 649 650 protected: 651 652 bool stalled = false; ///< whether or not it is stalled 653 654 ComputeUnit *computeUnit; 655 656 virtual bool 657 recvTimingResp(PacketPtr pkt); 658 659 virtual Tick 660 recvAtomic(PacketPtr pkt) { return 0; } 661 662 virtual void 663 recvFunctional(PacketPtr pkt) 664 { 665 } 666 667 virtual void 668 recvRangeChange() 669 { 670 } 671 672 virtual void 673 recvReqRetry(); 674 }; 675 676 /** The port to access the Local Data Store 677 * Can be connected to a LDS object 678 */ 679 LDSPort *ldsPort = nullptr; 680 681 LDSPort * 682 getLdsPort() const 683 { 684 return ldsPort; 685 } 686 687 /** The memory port for SIMD data accesses. 688 * Can be connected to PhysMem for Ruby for timing simulations 689 */ 690 std::vector<DataPort*> memPort; 691 // port to the TLB hierarchy (i.e., the L1 TLB) 692 std::vector<DTLBPort*> tlbPort; 693 // port to the SQC (i.e. 
the I-cache) 694 SQCPort *sqcPort; 695 // port to the SQC TLB (there's a separate TLB for each I-cache) 696 ITLBPort *sqcTLBPort; 697 698 virtual BaseMasterPort& 699 getMasterPort(const std::string &if_name, PortID idx) 700 { 701 if (if_name == "memory_port") { 702 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), 703 this, idx); 704 return *memPort[idx]; 705 } else if (if_name == "translation_port") { 706 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), 707 this, idx); 708 return *tlbPort[idx]; 709 } else if (if_name == "sqc_port") { 710 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), 711 this, idx); 712 return *sqcPort; 713 } else if (if_name == "sqc_tlb_port") { 714 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); 715 return *sqcTLBPort; 716 } else if (if_name == "ldsPort") { 717 if (ldsPort) { 718 fatal("an LDS port was already allocated"); 719 } 720 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); 721 return *ldsPort; 722 } else { 723 panic("incorrect port name"); 724 } 725 } 726 727 // xact_cas_load() 728 class waveIdentifier 729 { 730 public: 731 waveIdentifier() { } 732 waveIdentifier(int _simdId, int _wfSlotId) 733 : simdId(_simdId), wfSlotId(_wfSlotId) { } 734 735 int simdId; 736 int wfSlotId; 737 }; 738 739 class waveQueue 740 { 741 public: 742 std::list<waveIdentifier> waveIDQueue; 743 }; 744 std::map<unsigned, waveQueue> xactCasLoadMap; 745 746 uint64_t getAndIncSeqNum() { return globalSeqNum++; } 747 748 private: 749 uint64_t globalSeqNum; 750 int wavefrontSize; 751}; 752 753#endif // __COMPUTE_UNIT_HH__ 754