34 */ 35 36#ifndef __COMPUTE_UNIT_HH__ 37#define __COMPUTE_UNIT_HH__ 38 39#include <deque> 40#include <map> 41#include <unordered_map> 42#include <vector> 43 44#include "base/callback.hh" 45#include "base/statistics.hh" 46#include "base/types.hh" 47#include "enums/PrefetchType.hh" 48#include "gpu-compute/exec_stage.hh" 49#include "gpu-compute/fetch_stage.hh" 50#include "gpu-compute/global_memory_pipeline.hh" 51#include "gpu-compute/local_memory_pipeline.hh" 52#include "gpu-compute/qstruct.hh" 53#include "gpu-compute/schedule_stage.hh" 54#include "gpu-compute/scoreboard_check_stage.hh" 55#include "mem/mem_object.hh" 56#include "mem/port.hh" 57 58static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; 59static const int MAX_WIDTH_FOR_MEM_INST = 32; 60 61class NDRange; 62class Shader; 63class VectorRegisterFile; 64 65struct ComputeUnitParams; 66 67enum EXEC_POLICY 68{ 69 OLDEST = 0, 70 RR 71}; 72 73// List of execution units 74enum EXEC_UNIT 75{ 76 SIMD0 = 0, 77 SIMD1, 78 SIMD2, 79 SIMD3, 80 GLBMEM_PIPE, 81 LDSMEM_PIPE, 82 NUM_UNITS 83}; 84 85enum TLB_CACHE 86{ 87 TLB_MISS_CACHE_MISS = 0, 88 TLB_MISS_CACHE_HIT, 89 TLB_HIT_CACHE_MISS, 90 TLB_HIT_CACHE_HIT 91}; 92 93class ComputeUnit : public MemObject 94{ 95 public: 96 FetchStage fetchStage; 97 ScoreboardCheckStage scoreboardCheckStage; 98 ScheduleStage scheduleStage; 99 ExecStage execStage; 100 GlobalMemPipeline globalMemoryPipe; 101 LocalMemPipeline localMemoryPipe; 102 103 // Buffers used to communicate between various pipeline stages 104 105 // List of waves which are ready to be scheduled. 106 // Each execution resource has a ready list. readyList is 107 // used to communicate between scoreboardCheck stage and 108 // schedule stage 109 // TODO: make enum to index readyList 110 std::vector<std::vector<Wavefront*>> readyList; 111 112 // Stores the status of waves. A READY implies the 113 // wave is ready to be scheduled this cycle and 114 // is already present in the readyList. 
waveStatusList is 115 // used to communicate between scoreboardCheck stage and 116 // schedule stage 117 // TODO: convert std::pair to a class to increase readability 118 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; 119 120 // List of waves which will be dispatched to 121 // each execution resource. A FILLED implies 122 // dispatch list is non-empty and 123 // execution unit has something to execute 124 // this cycle. Currently, the dispatch list of 125 // an execution resource can hold only one wave because 126 // an execution resource can execute only one wave in a cycle. 127 // dispatchList is used to communicate between schedule 128 // and exec stage 129 // TODO: convert std::pair to a class to increase readability 130 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; 131 132 int rrNextMemID; // used by RR WF exec policy to cycle through WF's 133 int rrNextALUWp; 134 typedef ComputeUnitParams Params; 135 std::vector<std::vector<Wavefront*>> wfList; 136 int cu_id; 137 138 // array of vector register files, one per SIMD 139 std::vector<VectorRegisterFile*> vrf; 140 // Number of vector ALU units (SIMDs) in CU 141 int numSIMDs; 142 // number of pipe stages for bypassing data to next dependent single 143 // precision vector instruction inside the vector ALU pipeline 144 int spBypassPipeLength; 145 // number of pipe stages for bypassing data to next dependent double 146 // precision vector instruction inside the vector ALU pipeline 147 int dpBypassPipeLength; 148 // number of cycles per issue period 149 int issuePeriod; 150 151 // Number of global and local memory execution resources in CU 152 int numGlbMemUnits; 153 int numLocMemUnits; 154 // tracks the last cycle a vector instruction was executed on a SIMD 155 std::vector<uint64_t> lastExecCycle; 156 157 // true if we allow a separate TLB per lane 158 bool perLaneTLB; 159 // if 0, TLB prefetching is off. 
160 int prefetchDepth; 161 // if fixed-stride prefetching, this is the stride. 162 int prefetchStride; 163 164 std::vector<Addr> lastVaddrCU; 165 std::vector<std::vector<Addr>> lastVaddrSimd; 166 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; 167 Enums::PrefetchType prefetchType; 168 EXEC_POLICY exec_policy; 169 170 bool xact_cas_mode; 171 bool debugSegFault; 172 bool functionalTLB; 173 bool localMemBarrier; 174 175 /* 176 * for Counting page accesses 177 * 178 * cuExitCallback inherits from Callback. When you register a callback 179 * function as an exit callback, it will get added to an exit callback 180 * queue, such that on simulation exit, all callbacks in the callback 181 * queue will have their process() function called. 182 */ 183 bool countPages; 184 185 Shader *shader; 186 uint32_t barrier_id; 187 // vector of Vector ALU (MACC) pipelines 188 std::vector<WaitClass> aluPipe; 189 // minimum issue period per SIMD unit (in cycles) 190 std::vector<WaitClass> wfWait; 191 192 // Resource control for Vector Register File->Global Memory pipe buses 193 std::vector<WaitClass> vrfToGlobalMemPipeBus; 194 // Resource control for Vector Register File->Local Memory pipe buses 195 std::vector<WaitClass> vrfToLocalMemPipeBus; 196 int nextGlbMemBus; 197 int nextLocMemBus; 198 // Resource control for global memory to VRF data/address bus 199 WaitClass glbMemToVrfBus; 200 // Resource control for local memory to VRF data/address bus 201 WaitClass locMemToVrfBus; 202 203 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes 204 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes 205 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store 206 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load 207 208 Tick req_tick_latency; 209 Tick resp_tick_latency; 210 211 // number of vector registers being reserved for each SIMD unit 212 std::vector<int> vectorRegsReserved; 213 // number of vector 
registers per SIMD unit 214 uint32_t numVecRegsPerSimd; 215 // Support for scheduling VGPR status update events 216 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; 217 std::vector<uint64_t> timestampVec; 218 std::vector<uint8_t> statusVec; 219 220 void 221 registerEvent(uint32_t simdId, 222 uint32_t regIdx, 223 uint32_t operandSize, 224 uint64_t when, 225 uint8_t newStatus) { 226 regIdxVec.push_back(std::make_pair(simdId, regIdx)); 227 timestampVec.push_back(when); 228 statusVec.push_back(newStatus); 229 if (operandSize > 4) { 230 regIdxVec.push_back(std::make_pair(simdId, 231 ((regIdx + 1) % 232 numVecRegsPerSimd))); 233 timestampVec.push_back(when); 234 statusVec.push_back(newStatus); 235 } 236 } 237 238 void updateEvents(); 239 240 // this hash map will keep track of page divergence 241 // per memory instruction per wavefront. The hash map 242 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 243 std::map<Addr, int> pagesTouched; 244 245 ComputeUnit(const Params *p); 246 ~ComputeUnit(); 247 int spBypassLength() { return spBypassPipeLength; }; 248 int dpBypassLength() { return dpBypassPipeLength; }; 249 int storeBusLength() { return numCyclesPerStoreTransfer; }; 250 int loadBusLength() { return numCyclesPerLoadTransfer; }; 251 int wfSize() const { return wavefrontSize; }; 252 253 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 254 void exec(); 255 void initiateFetch(Wavefront *wavefront); 256 void fetch(PacketPtr pkt, Wavefront *wavefront); 257 void fillKernelState(Wavefront *w, NDRange *ndr); 258 259 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, 260 NDRange *ndr); 261 262 void StartWorkgroup(NDRange *ndr); 263 int ReadyWorkgroup(NDRange *ndr); 264 265 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } 266 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } 267 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } 268 int GlbMemUnitId() { return GLBMEM_PIPE; } 269 int 
ShrMemUnitId() { return LDSMEM_PIPE; } 270 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } 271 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } 272 /* This function cycles through all the wavefronts in all the phases to see 273 * if all of the wavefronts which should be associated with one barrier 274 * (denoted with _barrier_id), are all at the same barrier in the program 275 * (denoted by bcnt). When the number at the barrier matches bslots, then 276 * return true. 277 */ 278 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); 279 bool cedeSIMD(int simdId, int wfSlotId); 280 281 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); 282 virtual void init(); 283 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 284 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 285 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, 286 bool kernelLaunch=true, 287 RequestPtr req=nullptr); 288 void handleMemPacket(PacketPtr pkt, int memport_index); 289 bool processTimingPacket(PacketPtr pkt); 290 void processFetchReturn(PacketPtr pkt); 291 void updatePageDivergenceDist(Addr addr); 292 293 MasterID masterId() { return _masterId; } 294 295 bool isDone() const; 296 bool isSimdDone(uint32_t) const; 297 298 protected: 299 MasterID _masterId; 300 301 LdsState &lds; 302 303 public: 304 Stats::Scalar vALUInsts; 305 Stats::Formula vALUInstsPerWF; 306 Stats::Scalar sALUInsts; 307 Stats::Formula sALUInstsPerWF; 308 Stats::Scalar instCyclesVALU; 309 Stats::Scalar instCyclesSALU; 310 Stats::Scalar threadCyclesVALU; 311 Stats::Formula vALUUtilization; 312 Stats::Scalar ldsNoFlatInsts; 313 Stats::Formula ldsNoFlatInstsPerWF; 314 Stats::Scalar flatVMemInsts; 315 Stats::Formula flatVMemInstsPerWF; 316 Stats::Scalar flatLDSInsts; 317 Stats::Formula flatLDSInstsPerWF; 318 Stats::Scalar vectorMemWrites; 319 Stats::Formula vectorMemWritesPerWF; 320 Stats::Scalar vectorMemReads; 
321 Stats::Formula vectorMemReadsPerWF; 322 Stats::Scalar scalarMemWrites; 323 Stats::Formula scalarMemWritesPerWF; 324 Stats::Scalar scalarMemReads; 325 Stats::Formula scalarMemReadsPerWF; 326 327 void updateInstStats(GPUDynInstPtr gpuDynInst); 328 329 // the following stats compute the avg. TLB accesslatency per 330 // uncoalesced request (only for data) 331 Stats::Scalar tlbRequests; 332 Stats::Scalar tlbCycles; 333 Stats::Formula tlbLatency; 334 // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table. 335 Stats::Vector hitsPerTLBLevel; 336 337 Stats::Scalar ldsBankAccesses; 338 Stats::Distribution ldsBankConflictDist; 339 340 // over all memory instructions executed over all wavefronts 341 // how many touched 0-4 pages, 4-8, ..., 60-64 pages 342 Stats::Distribution pageDivergenceDist; 343 Stats::Scalar dynamicGMemInstrCnt; 344 Stats::Scalar dynamicLMemInstrCnt; 345 346 Stats::Scalar wgBlockedDueLdsAllocation; 347 // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active 348 // when the instruction is committed, this number is still incremented by 1 349 Stats::Scalar numInstrExecuted; 350 // Number of cycles among successive instruction executions across all 351 // wavefronts of the same CU 352 Stats::Distribution execRateDist; 353 // number of individual vector operations executed 354 Stats::Scalar numVecOpsExecuted; 355 // Total cycles that something is running on the GPU 356 Stats::Scalar totalCycles; 357 Stats::Formula vpc; // vector ops per cycle 358 Stats::Formula ipc; // vector instructions per cycle 359 Stats::Distribution controlFlowDivergenceDist; 360 Stats::Distribution activeLanesPerGMemInstrDist; 361 Stats::Distribution activeLanesPerLMemInstrDist; 362 // number of vector ALU instructions received 363 Stats::Formula numALUInstsExecuted; 364 // number of times a WG can not start due to lack of free VGPRs in SIMDs 365 Stats::Scalar numTimesWgBlockedDueVgprAlloc; 366 Stats::Scalar numCASOps; 367 Stats::Scalar 
numFailedCASOps; 368 Stats::Scalar completedWfs; 369 // flag per vector SIMD unit that is set when there is at least one 370 // WV that has a vector ALU instruction as the oldest in its 371 // Instruction Buffer: Defined in the Scoreboard stage, consumed 372 // by the Execute stage. 373 std::vector<bool> vectorAluInstAvail; 374 // number of available (oldest) LDS instructions that could have 375 // been issued to the LDS at a specific issue slot 376 int shrMemInstAvail; 377 // number of available Global memory instructions that could have 378 // been issued to TCP at a specific issue slot 379 int glbMemInstAvail; 380 381 void 382 regStats(); 383 384 LdsState & 385 getLds() const 386 { 387 return lds; 388 } 389 390 int32_t 391 getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const; 392 393 int cacheLineSize() const { return _cacheLineSize; } 394 395 bool 396 sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result)); 397 398 typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct; 399 pageDataStruct pageAccesses; 400 401 class CUExitCallback : public Callback 402 { 403 private: 404 ComputeUnit *computeUnit; 405 406 public: 407 virtual ~CUExitCallback() { } 408 409 CUExitCallback(ComputeUnit *_cu) 410 { 411 computeUnit = _cu; 412 } 413 414 virtual void 415 process(); 416 }; 417 418 CUExitCallback *cuExitCallback; 419 420 /** Data access Port **/ 421 class DataPort : public MasterPort 422 { 423 public: 424 DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 425 : MasterPort(_name, _cu), computeUnit(_cu), 426 index(_index) { } 427 428 bool snoopRangeSent; 429 430 struct SenderState : public Packet::SenderState 431 { 432 GPUDynInstPtr _gpuDynInst; 433 int port_index; 434 Packet::SenderState *saved; 435 436 SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index, 437 Packet::SenderState *sender_state=nullptr) 438 : _gpuDynInst(gpuDynInst), 439 port_index(_port_index), 440 saved(sender_state) { } 441 }; 442 443 void 
processMemReqEvent(PacketPtr pkt); 444 EventFunctionWrapper *createMemReqEvent(PacketPtr pkt); 445 446 void processMemRespEvent(PacketPtr pkt); 447 EventFunctionWrapper *createMemRespEvent(PacketPtr pkt); 448 449 std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries; 450 451 protected: 452 ComputeUnit *computeUnit; 453 int index; 454 455 virtual bool recvTimingResp(PacketPtr pkt); 456 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 457 virtual void recvFunctional(PacketPtr pkt) { } 458 virtual void recvRangeChange() { } 459 virtual void recvReqRetry(); 460 461 virtual void 462 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 463 { 464 resp.clear(); 465 snoop = true; 466 } 467 468 }; 469 470 // Instruction cache access port 471 class SQCPort : public MasterPort 472 { 473 public: 474 SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 475 : MasterPort(_name, _cu), computeUnit(_cu), 476 index(_index) { } 477 478 bool snoopRangeSent; 479 480 struct SenderState : public Packet::SenderState 481 { 482 Wavefront *wavefront; 483 Packet::SenderState *saved; 484 485 SenderState(Wavefront *_wavefront, Packet::SenderState 486 *sender_state=nullptr) 487 : wavefront(_wavefront), saved(sender_state) { } 488 }; 489 490 std::deque<std::pair<PacketPtr, Wavefront*>> retries; 491 492 protected: 493 ComputeUnit *computeUnit; 494 int index; 495 496 virtual bool recvTimingResp(PacketPtr pkt); 497 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 498 virtual void recvFunctional(PacketPtr pkt) { } 499 virtual void recvRangeChange() { } 500 virtual void recvReqRetry(); 501 502 virtual void 503 getDeviceAddressRanges(AddrRangeList &resp, bool &snoop) 504 { 505 resp.clear(); 506 snoop = true; 507 } 508 }; 509 510 /** Data TLB port **/ 511 class DTLBPort : public MasterPort 512 { 513 public: 514 DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index) 515 : MasterPort(_name, _cu), computeUnit(_cu), 516 index(_index), stalled(false) 517 { } 518 519 
bool isStalled() { return stalled; } 520 void stallPort() { stalled = true; } 521 void unstallPort() { stalled = false; } 522 523 /** 524 * here we queue all the translation requests that were 525 * not successfully sent. 526 */ 527 std::deque<PacketPtr> retries; 528 529 /** SenderState is information carried along with the packet 530 * throughout the TLB hierarchy 531 */ 532 struct SenderState: public Packet::SenderState 533 { 534 // the memInst that this is associated with 535 GPUDynInstPtr _gpuDynInst; 536 537 // the lane in the memInst this is associated with, so we send 538 // the memory request down the right port 539 int portIndex; 540 541 // constructor used for packets involved in timing accesses 542 SenderState(GPUDynInstPtr gpuDynInst, PortID port_index) 543 : _gpuDynInst(gpuDynInst), portIndex(port_index) { } 544 545 }; 546 547 protected: 548 ComputeUnit *computeUnit; 549 int index; 550 bool stalled; 551 552 virtual bool recvTimingResp(PacketPtr pkt); 553 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 554 virtual void recvFunctional(PacketPtr pkt) { } 555 virtual void recvRangeChange() { } 556 virtual void recvReqRetry(); 557 }; 558 559 class ITLBPort : public MasterPort 560 { 561 public: 562 ITLBPort(const std::string &_name, ComputeUnit *_cu) 563 : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { } 564 565 566 bool isStalled() { return stalled; } 567 void stallPort() { stalled = true; } 568 void unstallPort() { stalled = false; } 569 570 /** 571 * here we queue all the translation requests that were 572 * not successfully sent. 
573 */ 574 std::deque<PacketPtr> retries; 575 576 /** SenderState is information carried along with the packet 577 * throughout the TLB hierarchy 578 */ 579 struct SenderState: public Packet::SenderState 580 { 581 // The wavefront associated with this request 582 Wavefront *wavefront; 583 584 SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { } 585 }; 586 587 protected: 588 ComputeUnit *computeUnit; 589 bool stalled; 590 591 virtual bool recvTimingResp(PacketPtr pkt); 592 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 593 virtual void recvFunctional(PacketPtr pkt) { } 594 virtual void recvRangeChange() { } 595 virtual void recvReqRetry(); 596 }; 597 598 /** 599 * the port intended to communicate between the CU and its LDS 600 */ 601 class LDSPort : public MasterPort 602 { 603 public: 604 LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id) 605 : MasterPort(_name, _cu, _id), computeUnit(_cu) 606 { 607 } 608 609 bool isStalled() const { return stalled; } 610 void stallPort() { stalled = true; } 611 void unstallPort() { stalled = false; } 612 613 /** 614 * here we queue all the requests that were 615 * not successfully sent. 616 */ 617 std::queue<PacketPtr> retries; 618 619 /** 620 * SenderState is information carried along with the packet, esp. 
the 621 * GPUDynInstPtr 622 */ 623 class SenderState: public Packet::SenderState 624 { 625 protected: 626 // The actual read/write/atomic request that goes with this command 627 GPUDynInstPtr _gpuDynInst = nullptr; 628 629 public: 630 SenderState(GPUDynInstPtr gpuDynInst): 631 _gpuDynInst(gpuDynInst) 632 { 633 } 634 635 GPUDynInstPtr 636 getMemInst() const 637 { 638 return _gpuDynInst; 639 } 640 }; 641 642 virtual bool 643 sendTimingReq(PacketPtr pkt); 644 645 protected: 646 647 bool stalled = false; ///< whether or not it is stalled 648 649 ComputeUnit *computeUnit; 650 651 virtual bool 652 recvTimingResp(PacketPtr pkt); 653 654 virtual Tick 655 recvAtomic(PacketPtr pkt) { return 0; } 656 657 virtual void 658 recvFunctional(PacketPtr pkt) 659 { 660 } 661 662 virtual void 663 recvRangeChange() 664 { 665 } 666 667 virtual void 668 recvReqRetry(); 669 }; 670 671 /** The port to access the Local Data Store 672 * Can be connected to a LDS object 673 */ 674 LDSPort *ldsPort = nullptr; 675 676 LDSPort * 677 getLdsPort() const 678 { 679 return ldsPort; 680 } 681 682 /** The memory port for SIMD data accesses. 683 * Can be connected to PhysMem for Ruby for timing simulations 684 */ 685 std::vector<DataPort*> memPort; 686 // port to the TLB hierarchy (i.e., the L1 TLB) 687 std::vector<DTLBPort*> tlbPort; 688 // port to the SQC (i.e. 
the I-cache) 689 SQCPort *sqcPort; 690 // port to the SQC TLB (there's a separate TLB for each I-cache) 691 ITLBPort *sqcTLBPort; 692 693 virtual BaseMasterPort& 694 getMasterPort(const std::string &if_name, PortID idx) 695 { 696 if (if_name == "memory_port") { 697 memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx), 698 this, idx); 699 return *memPort[idx]; 700 } else if (if_name == "translation_port") { 701 tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx), 702 this, idx); 703 return *tlbPort[idx]; 704 } else if (if_name == "sqc_port") { 705 sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx), 706 this, idx); 707 return *sqcPort; 708 } else if (if_name == "sqc_tlb_port") { 709 sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this); 710 return *sqcTLBPort; 711 } else if (if_name == "ldsPort") { 712 if (ldsPort) { 713 fatal("an LDS port was already allocated"); 714 } 715 ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx); 716 return *ldsPort; 717 } else { 718 panic("incorrect port name"); 719 } 720 } 721 722 // xact_cas_load() 723 class waveIdentifier 724 { 725 public: 726 waveIdentifier() { } 727 waveIdentifier(int _simdId, int _wfSlotId) 728 : simdId(_simdId), wfSlotId(_wfSlotId) { } 729 730 int simdId; 731 int wfSlotId; 732 }; 733 734 class waveQueue 735 { 736 public: 737 std::list<waveIdentifier> waveIDQueue; 738 }; 739 std::map<unsigned, waveQueue> xactCasLoadMap; 740 741 uint64_t getAndIncSeqNum() { return globalSeqNum++; } 742 743 private: 744 const int _cacheLineSize; 745 uint64_t globalSeqNum; 746 int wavefrontSize; 747 GPUStaticInst *kernelLaunchInst; 748}; 749 750#endif // __COMPUTE_UNIT_HH__
| 35 */ 36 37#ifndef __COMPUTE_UNIT_HH__ 38#define __COMPUTE_UNIT_HH__ 39 40#include <deque> 41#include <map> 42#include <unordered_map> 43#include <vector> 44 45#include "base/callback.hh" 46#include "base/statistics.hh" 47#include "base/types.hh" 48#include "enums/PrefetchType.hh" 49#include "gpu-compute/exec_stage.hh" 50#include "gpu-compute/fetch_stage.hh" 51#include "gpu-compute/global_memory_pipeline.hh" 52#include "gpu-compute/local_memory_pipeline.hh" 53#include "gpu-compute/qstruct.hh" 54#include "gpu-compute/schedule_stage.hh" 55#include "gpu-compute/scoreboard_check_stage.hh" 56#include "mem/mem_object.hh" 57#include "mem/port.hh" 58 59static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1; 60static const int MAX_WIDTH_FOR_MEM_INST = 32; 61 62class NDRange; 63class Shader; 64class VectorRegisterFile; 65 66struct ComputeUnitParams; 67 68enum EXEC_POLICY 69{ 70 OLDEST = 0, 71 RR 72}; 73 74// List of execution units 75enum EXEC_UNIT 76{ 77 SIMD0 = 0, 78 SIMD1, 79 SIMD2, 80 SIMD3, 81 GLBMEM_PIPE, 82 LDSMEM_PIPE, 83 NUM_UNITS 84}; 85 86enum TLB_CACHE 87{ 88 TLB_MISS_CACHE_MISS = 0, 89 TLB_MISS_CACHE_HIT, 90 TLB_HIT_CACHE_MISS, 91 TLB_HIT_CACHE_HIT 92}; 93 94class ComputeUnit : public MemObject 95{ 96 public: 97 FetchStage fetchStage; 98 ScoreboardCheckStage scoreboardCheckStage; 99 ScheduleStage scheduleStage; 100 ExecStage execStage; 101 GlobalMemPipeline globalMemoryPipe; 102 LocalMemPipeline localMemoryPipe; 103 104 // Buffers used to communicate between various pipeline stages 105 106 // List of waves which are ready to be scheduled. 107 // Each execution resource has a ready list. readyList is 108 // used to communicate between scoreboardCheck stage and 109 // schedule stage 110 // TODO: make enum to index readyList 111 std::vector<std::vector<Wavefront*>> readyList; 112 113 // Stores the status of waves. A READY implies the 114 // wave is ready to be scheduled this cycle and 115 // is already present in the readyList. 
waveStatusList is 116 // used to communicate between scoreboardCheck stage and 117 // schedule stage 118 // TODO: convert std::pair to a class to increase readability 119 std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList; 120 121 // List of waves which will be dispatched to 122 // each execution resource. A FILLED implies 123 // dispatch list is non-empty and 124 // execution unit has something to execute 125 // this cycle. Currently, the dispatch list of 126 // an execution resource can hold only one wave because 127 // an execution resource can execute only one wave in a cycle. 128 // dispatchList is used to communicate between schedule 129 // and exec stage 130 // TODO: convert std::pair to a class to increase readability 131 std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList; 132 133 int rrNextMemID; // used by RR WF exec policy to cycle through WF's 134 int rrNextALUWp; 135 typedef ComputeUnitParams Params; 136 std::vector<std::vector<Wavefront*>> wfList; 137 int cu_id; 138 139 // array of vector register files, one per SIMD 140 std::vector<VectorRegisterFile*> vrf; 141 // Number of vector ALU units (SIMDs) in CU 142 int numSIMDs; 143 // number of pipe stages for bypassing data to next dependent single 144 // precision vector instruction inside the vector ALU pipeline 145 int spBypassPipeLength; 146 // number of pipe stages for bypassing data to next dependent double 147 // precision vector instruction inside the vector ALU pipeline 148 int dpBypassPipeLength; 149 // number of cycles per issue period 150 int issuePeriod; 151 152 // Number of global and local memory execution resources in CU 153 int numGlbMemUnits; 154 int numLocMemUnits; 155 // tracks the last cycle a vector instruction was executed on a SIMD 156 std::vector<uint64_t> lastExecCycle; 157 158 // true if we allow a separate TLB per lane 159 bool perLaneTLB; 160 // if 0, TLB prefetching is off. 
161 int prefetchDepth; 162 // if fixed-stride prefetching, this is the stride. 163 int prefetchStride; 164 165 std::vector<Addr> lastVaddrCU; 166 std::vector<std::vector<Addr>> lastVaddrSimd; 167 std::vector<std::vector<std::vector<Addr>>> lastVaddrWF; 168 Enums::PrefetchType prefetchType; 169 EXEC_POLICY exec_policy; 170 171 bool xact_cas_mode; 172 bool debugSegFault; 173 bool functionalTLB; 174 bool localMemBarrier; 175 176 /* 177 * for Counting page accesses 178 * 179 * cuExitCallback inherits from Callback. When you register a callback 180 * function as an exit callback, it will get added to an exit callback 181 * queue, such that on simulation exit, all callbacks in the callback 182 * queue will have their process() function called. 183 */ 184 bool countPages; 185 186 Shader *shader; 187 uint32_t barrier_id; 188 // vector of Vector ALU (MACC) pipelines 189 std::vector<WaitClass> aluPipe; 190 // minimum issue period per SIMD unit (in cycles) 191 std::vector<WaitClass> wfWait; 192 193 // Resource control for Vector Register File->Global Memory pipe buses 194 std::vector<WaitClass> vrfToGlobalMemPipeBus; 195 // Resource control for Vector Register File->Local Memory pipe buses 196 std::vector<WaitClass> vrfToLocalMemPipeBus; 197 int nextGlbMemBus; 198 int nextLocMemBus; 199 // Resource control for global memory to VRF data/address bus 200 WaitClass glbMemToVrfBus; 201 // Resource control for local memory to VRF data/address bus 202 WaitClass locMemToVrfBus; 203 204 uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes 205 uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes 206 uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store 207 uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load 208 209 Tick req_tick_latency; 210 Tick resp_tick_latency; 211 212 // number of vector registers being reserved for each SIMD unit 213 std::vector<int> vectorRegsReserved; 214 // number of vector 
registers per SIMD unit 215 uint32_t numVecRegsPerSimd; 216 // Support for scheduling VGPR status update events 217 std::vector<std::pair<uint32_t, uint32_t> > regIdxVec; 218 std::vector<uint64_t> timestampVec; 219 std::vector<uint8_t> statusVec; 220 221 void 222 registerEvent(uint32_t simdId, 223 uint32_t regIdx, 224 uint32_t operandSize, 225 uint64_t when, 226 uint8_t newStatus) { 227 regIdxVec.push_back(std::make_pair(simdId, regIdx)); 228 timestampVec.push_back(when); 229 statusVec.push_back(newStatus); 230 if (operandSize > 4) { 231 regIdxVec.push_back(std::make_pair(simdId, 232 ((regIdx + 1) % 233 numVecRegsPerSimd))); 234 timestampVec.push_back(when); 235 statusVec.push_back(newStatus); 236 } 237 } 238 239 void updateEvents(); 240 241 // this hash map will keep track of page divergence 242 // per memory instruction per wavefront. The hash map 243 // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc. 244 std::map<Addr, int> pagesTouched; 245 246 ComputeUnit(const Params *p); 247 ~ComputeUnit(); 248 int spBypassLength() { return spBypassPipeLength; }; 249 int dpBypassLength() { return dpBypassPipeLength; }; 250 int storeBusLength() { return numCyclesPerStoreTransfer; }; 251 int loadBusLength() { return numCyclesPerLoadTransfer; }; 252 int wfSize() const { return wavefrontSize; }; 253 254 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 255 void exec(); 256 void initiateFetch(Wavefront *wavefront); 257 void fetch(PacketPtr pkt, Wavefront *wavefront); 258 void fillKernelState(Wavefront *w, NDRange *ndr); 259 260 void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk, 261 NDRange *ndr); 262 263 void StartWorkgroup(NDRange *ndr); 264 int ReadyWorkgroup(NDRange *ndr); 265 266 bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; } 267 bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; } 268 bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; } 269 int GlbMemUnitId() { return GLBMEM_PIPE; } 270 int 
ShrMemUnitId() { return LDSMEM_PIPE; } 271 int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; } 272 int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; } 273 /* This function cycles through all the wavefronts in all the phases to see 274 * if all of the wavefronts which should be associated with one barrier 275 * (denoted with _barrier_id), are all at the same barrier in the program 276 * (denoted by bcnt). When the number at the barrier matches bslots, then 277 * return true. 278 */ 279 int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots); 280 bool cedeSIMD(int simdId, int wfSlotId); 281 282 template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst); 283 virtual void init(); 284 void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 285 void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt); 286 void injectGlobalMemFence(GPUDynInstPtr gpuDynInst, 287 bool kernelLaunch=true, 288 RequestPtr req=nullptr); 289 void handleMemPacket(PacketPtr pkt, int memport_index); 290 bool processTimingPacket(PacketPtr pkt); 291 void processFetchReturn(PacketPtr pkt); 292 void updatePageDivergenceDist(Addr addr); 293 294 MasterID masterId() { return _masterId; } 295 296 bool isDone() const; 297 bool isSimdDone(uint32_t) const; 298 299 protected: 300 MasterID _masterId; 301 302 LdsState &lds; 303 304 public: 305 Stats::Scalar vALUInsts; 306 Stats::Formula vALUInstsPerWF; 307 Stats::Scalar sALUInsts; 308 Stats::Formula sALUInstsPerWF; 309 Stats::Scalar instCyclesVALU; 310 Stats::Scalar instCyclesSALU; 311 Stats::Scalar threadCyclesVALU; 312 Stats::Formula vALUUtilization; 313 Stats::Scalar ldsNoFlatInsts; 314 Stats::Formula ldsNoFlatInstsPerWF; 315 Stats::Scalar flatVMemInsts; 316 Stats::Formula flatVMemInstsPerWF; 317 Stats::Scalar flatLDSInsts; 318 Stats::Formula flatLDSInstsPerWF; 319 Stats::Scalar vectorMemWrites; 320 Stats::Formula vectorMemWritesPerWF; 321 Stats::Scalar vectorMemReads; 
    Stats::Formula vectorMemReadsPerWF;
    Stats::Scalar scalarMemWrites;
    Stats::Formula scalarMemWritesPerWF;
    Stats::Scalar scalarMemReads;
    Stats::Formula scalarMemReadsPerWF;

    // Update the instruction-mix stats above for one committed instruction.
    void updateInstStats(GPUDynInstPtr gpuDynInst);

    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts
    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
    // when the instruction is committed, this number is still incremented by 1
    Stats::Scalar numInstrExecuted;
    // Number of cycles among successive instruction executions across all
    // wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // Total cycles that something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG can not start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;

    // Register all of the stats declared above with the stats framework.
    void
    regStats();

    LdsState &
    getLds() const
    {
        return lds;
    }

    // Reference count of the (dispatchId, wgId) work-group on the LDS.
    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    int cacheLineSize() const { return _cacheLineSize; }

    // Issue an LDS request; callers must consume the success/failure result.
    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

    // Per-page access bookkeeping, keyed by page address.
    // NOTE(review): the meaning of the pair's two ints is not visible in this
    // header — confirm against updatePageDivergenceDist's definition.
    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    /**
     * Callback invoked at simulator exit on behalf of this CU
     * (process() is defined in the .cc file).
     */
    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void
        process();
    };

    CUExitCallback *cuExitCallback;

    /** Data access Port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        // NOTE(review): never initialized by the constructor — confirm it is
        // assigned before first use.
        bool snoopRangeSent;

        /** State attached to each outgoing packet so the response can be
         *  routed back to the originating instruction and lane/port. */
        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            // previous SenderState, restored when this layer is popped
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        // Deferred send/receive handling, scheduled as events.
        void processMemReqEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);

        void processMemRespEvent(PacketPtr pkt);
        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);

        // packets (and their instructions) waiting to be re-sent after a
        // failed sendTimingReq
        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }

    };

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        // NOTE(review): never initialized by the constructor — confirm it is
        // assigned before first use.
        bool snoopRangeSent;

        /** Carries the requesting wavefront along with each fetch packet. */
        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                        *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        // fetch packets waiting to be re-sent after a failed sendTimingReq
        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }

        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /** Instruction TLB port (one per CU; serves the SQC/I-cache TLB). */
    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }


        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
            : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         * SenderState is information carried along with the packet, esp. the
         * GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
                _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
                return _gpuDynInst;
            }
        };

        // Overridden so sends through this port can honor the stall state
        // (definition in the .cc file).
        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /** The port to access the Local Data Store
     *  Can be connected to a LDS object
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *
    getLdsPort() const
    {
        return ldsPort;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem for Ruby for timing simulations
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    // Lazily allocates the named port on first request and returns it.
    // NOTE(review): memPort/tlbPort are indexed without bounds checks —
    // assumes both vectors were pre-sized (presumably in the constructor);
    // confirm against the .cc file.
    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }

    // xact_cas_load()
    // Identifies a wavefront by its SIMD unit and WF slot.
    // NOTE(review): the default constructor leaves simdId/wfSlotId
    // uninitialized — confirm default-constructed instances are always
    // assigned before being read.
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
            : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    // Queue of wavefronts participating in a transactional CAS load.
    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    // map from CAS address (presumably) to the waiting wavefronts — confirm
    // the key's meaning against the xact_cas users in the .cc file.
    std::map<unsigned, waveQueue> xactCasLoadMap;

    // Monotonically increasing sequence number for outgoing requests.
    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    const int _cacheLineSize;
    uint64_t globalSeqNum;
    int wavefrontSize;
    // instruction used to launch the kernel (e.g., for the launch fence)
    GPUStaticInst *kernelLaunchInst;
};

#endif // __COMPUTE_UNIT_HH__
|