1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: John Kalamatianos,
34 *          Anthony Gutierrez
35 */
36
37#ifndef __COMPUTE_UNIT_HH__
38#define __COMPUTE_UNIT_HH__
39
#include <deque>
#include <list>
#include <map>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
44
45#include "base/callback.hh"
46#include "base/statistics.hh"
47#include "base/types.hh"
48#include "enums/PrefetchType.hh"
49#include "gpu-compute/exec_stage.hh"
50#include "gpu-compute/fetch_stage.hh"
51#include "gpu-compute/global_memory_pipeline.hh"
52#include "gpu-compute/local_memory_pipeline.hh"
53#include "gpu-compute/qstruct.hh"
54#include "gpu-compute/schedule_stage.hh"
55#include "gpu-compute/scoreboard_check_stage.hh"
56#include "mem/port.hh"
57#include "sim/clocked_object.hh"
58
// Limits for memory instructions, shared by every file that includes this
// header. They are constexpr so they are guaranteed to be usable in constant
// expressions (array bounds, static_assert, template arguments).
// NOTE(review): the units are not visible in this header; the names suggest
// a register count and an access width -- confirm at the use sites.
static constexpr int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static constexpr int MAX_WIDTH_FOR_MEM_INST = 32;
61
62class NDRange;
63class Shader;
64class VectorRegisterFile;
65
66struct ComputeUnitParams;
67
// Wavefront execution policies selectable for a compute unit.
// RR is the policy that cycles through the wavefronts; OLDEST presumably
// favours the oldest wavefront -- confirm in the schedule/exec stages.
enum EXEC_POLICY
{
    OLDEST = 0,
    RR     = 1
};
73
// Identifiers of the execution units: four SIMD units followed by the
// global-memory and LDS-memory pipes. NUM_UNITS is the trailing sentinel
// and therefore equals the number of units listed before it.
enum EXEC_UNIT
{
    SIMD0       = 0,
    SIMD1       = 1,
    SIMD2       = 2,
    SIMD3       = 3,
    GLBMEM_PIPE = 4,
    LDSMEM_PIPE = 5,
    NUM_UNITS   = 6
};
85
// The four combinations of a TLB outcome with a cache outcome.
// NOTE(review): which TLB/cache level these refer to is not visible here.
enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT  = 1,
    TLB_HIT_CACHE_MISS  = 2,
    TLB_HIT_CACHE_HIT   = 3
};
93
94class ComputeUnit : public ClockedObject
95{
96  public:
97    FetchStage fetchStage;
98    ScoreboardCheckStage scoreboardCheckStage;
99    ScheduleStage scheduleStage;
100    ExecStage execStage;
101    GlobalMemPipeline globalMemoryPipe;
102    LocalMemPipeline localMemoryPipe;
103
104    // Buffers used to communicate between various pipeline stages
105
106    // List of waves which are ready to be scheduled.
107    // Each execution resource has a ready list. readyList is
108    // used to communicate between scoreboardCheck stage and
109    // schedule stage
110    // TODO: make enum to index readyList
111    std::vector<std::vector<Wavefront*>> readyList;
112
113    // Stores the status of waves. A READY implies the
114    // wave is ready to be scheduled this cycle and
115    // is already present in the readyList. waveStatusList is
116    // used to communicate between scoreboardCheck stage and
117    // schedule stage
118    // TODO: convert std::pair to a class to increase readability
119    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
120
121    // List of waves which will be dispatched to
122    // each execution resource. A FILLED implies
123    // dispatch list is non-empty and
124    // execution unit has something to execute
125    // this cycle. Currently, the dispatch list of
126    // an execution resource can hold only one wave because
127    // an execution resource can execute only one wave in a cycle.
128    // dispatchList is used to communicate between schedule
129    // and exec stage
130    // TODO: convert std::pair to a class to increase readability
131    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
132
133    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
134    int rrNextALUWp;
135    typedef ComputeUnitParams Params;
136    std::vector<std::vector<Wavefront*>> wfList;
137    int cu_id;
138
139    // array of vector register files, one per SIMD
140    std::vector<VectorRegisterFile*> vrf;
141    // Number of vector ALU units (SIMDs) in CU
142    int numSIMDs;
143    // number of pipe stages for bypassing data to next dependent single
144    // precision vector instruction inside the vector ALU pipeline
145    int spBypassPipeLength;
146    // number of pipe stages for bypassing data to next dependent double
147    // precision vector instruction inside the vector ALU pipeline
148    int dpBypassPipeLength;
149    // number of cycles per issue period
150    int issuePeriod;
151
152    // Number of global and local memory execution resources in CU
153    int numGlbMemUnits;
154    int numLocMemUnits;
155    // tracks the last cycle a vector instruction was executed on a SIMD
156    std::vector<uint64_t> lastExecCycle;
157
158    // true if we allow a separate TLB per lane
159    bool perLaneTLB;
160    // if 0, TLB prefetching is off.
161    int prefetchDepth;
162    // if fixed-stride prefetching, this is the stride.
163    int prefetchStride;
164
165    std::vector<Addr> lastVaddrCU;
166    std::vector<std::vector<Addr>> lastVaddrSimd;
167    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
168    Enums::PrefetchType prefetchType;
169    EXEC_POLICY exec_policy;
170
171    bool xact_cas_mode;
172    bool debugSegFault;
173    bool functionalTLB;
174    bool localMemBarrier;
175
    /*
     * For counting page accesses.
     *
     * CUExitCallback inherits from Callback. When a callback is registered
     * as an exit callback, it is added to the exit callback queue, so that
     * on simulation exit every callback in the queue has its process()
     * function called.
     */
184    bool countPages;
185
186    Shader *shader;
187    uint32_t barrier_id;
188    // vector of Vector ALU (MACC) pipelines
189    std::vector<WaitClass> aluPipe;
190    // minimum issue period per SIMD unit (in cycles)
191    std::vector<WaitClass> wfWait;
192
193    // Resource control for Vector Register File->Global Memory pipe buses
194    std::vector<WaitClass> vrfToGlobalMemPipeBus;
195    // Resource control for Vector Register File->Local Memory pipe buses
196    std::vector<WaitClass> vrfToLocalMemPipeBus;
197    int nextGlbMemBus;
198    int nextLocMemBus;
199    // Resource control for global memory to VRF data/address bus
200    WaitClass glbMemToVrfBus;
201    // Resource control for local memory to VRF data/address bus
202    WaitClass locMemToVrfBus;
203
204    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
205    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
206    uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store
207    uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load
208
209    Tick req_tick_latency;
210    Tick resp_tick_latency;
211
212    // number of vector registers being reserved for each SIMD unit
213    std::vector<int> vectorRegsReserved;
214    // number of vector registers per SIMD unit
215    uint32_t numVecRegsPerSimd;
216    // Support for scheduling VGPR status update events
217    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
218    std::vector<uint64_t> timestampVec;
219    std::vector<uint8_t>  statusVec;
220
221    void
222    registerEvent(uint32_t simdId,
223                  uint32_t regIdx,
224                  uint32_t operandSize,
225                  uint64_t when,
226                  uint8_t newStatus) {
227        regIdxVec.push_back(std::make_pair(simdId, regIdx));
228        timestampVec.push_back(when);
229        statusVec.push_back(newStatus);
230        if (operandSize > 4) {
231            regIdxVec.push_back(std::make_pair(simdId,
232                                               ((regIdx + 1) %
233                                                numVecRegsPerSimd)));
234            timestampVec.push_back(when);
235            statusVec.push_back(newStatus);
236        }
237    }
238
239    void updateEvents();
240
    // this map keeps track of page divergence
    // per memory instruction per wavefront. The map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
244    std::map<Addr, int> pagesTouched;
245
246    ComputeUnit(const Params *p);
247    ~ComputeUnit();
248    int spBypassLength() { return spBypassPipeLength; };
249    int dpBypassLength() { return dpBypassPipeLength; };
250    int storeBusLength() { return numCyclesPerStoreTransfer; };
251    int loadBusLength() { return numCyclesPerLoadTransfer; };
252    int wfSize() const { return wavefrontSize; };
253
254    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
255    void exec();
256    void initiateFetch(Wavefront *wavefront);
257    void fetch(PacketPtr pkt, Wavefront *wavefront);
258    void fillKernelState(Wavefront *w, NDRange *ndr);
259
260    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
261                        NDRange *ndr);
262
263    void StartWorkgroup(NDRange *ndr);
264    int ReadyWorkgroup(NDRange *ndr);
265
266    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
267    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
268    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
269    int GlbMemUnitId() { return GLBMEM_PIPE; }
270    int ShrMemUnitId() { return LDSMEM_PIPE; }
271    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
272    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
273    /* This function cycles through all the wavefronts in all the phases to see
274     * if all of the wavefronts which should be associated with one barrier
275     * (denoted with _barrier_id), are all at the same barrier in the program
276     * (denoted by bcnt). When the number at the barrier matches bslots, then
277     * return true.
278     */
279    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
280    bool cedeSIMD(int simdId, int wfSlotId);
281
282    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
283    virtual void init() override;
284    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
285    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
286    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
287                              bool kernelLaunch=true,
288                              RequestPtr req=nullptr);
289    void handleMemPacket(PacketPtr pkt, int memport_index);
290    bool processTimingPacket(PacketPtr pkt);
291    void processFetchReturn(PacketPtr pkt);
292    void updatePageDivergenceDist(Addr addr);
293
294    MasterID masterId() { return _masterId; }
295
296    bool isDone() const;
297    bool isSimdDone(uint32_t) const;
298
299  protected:
300    MasterID _masterId;
301
302    LdsState &lds;
303
304  public:
305    Stats::Scalar vALUInsts;
306    Stats::Formula vALUInstsPerWF;
307    Stats::Scalar sALUInsts;
308    Stats::Formula sALUInstsPerWF;
309    Stats::Scalar instCyclesVALU;
310    Stats::Scalar instCyclesSALU;
311    Stats::Scalar threadCyclesVALU;
312    Stats::Formula vALUUtilization;
313    Stats::Scalar ldsNoFlatInsts;
314    Stats::Formula ldsNoFlatInstsPerWF;
315    Stats::Scalar flatVMemInsts;
316    Stats::Formula flatVMemInstsPerWF;
317    Stats::Scalar flatLDSInsts;
318    Stats::Formula flatLDSInstsPerWF;
319    Stats::Scalar vectorMemWrites;
320    Stats::Formula vectorMemWritesPerWF;
321    Stats::Scalar vectorMemReads;
322    Stats::Formula vectorMemReadsPerWF;
323    Stats::Scalar scalarMemWrites;
324    Stats::Formula scalarMemWritesPerWF;
325    Stats::Scalar scalarMemReads;
326    Stats::Formula scalarMemReadsPerWF;
327
328    void updateInstStats(GPUDynInstPtr gpuDynInst);
329
    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
332    Stats::Scalar tlbRequests;
333    Stats::Scalar tlbCycles;
334    Stats::Formula tlbLatency;
335    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
336    Stats::Vector hitsPerTLBLevel;
337
338    Stats::Scalar ldsBankAccesses;
339    Stats::Distribution ldsBankConflictDist;
340
341    // over all memory instructions executed over all wavefronts
342    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
343    Stats::Distribution pageDivergenceDist;
344    Stats::Scalar dynamicGMemInstrCnt;
345    Stats::Scalar dynamicLMemInstrCnt;
346
347    Stats::Scalar wgBlockedDueLdsAllocation;
348    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
349    // when the instruction is committed, this number is still incremented by 1
350    Stats::Scalar numInstrExecuted;
351    // Number of cycles among successive instruction executions across all
352    // wavefronts of the same CU
353    Stats::Distribution execRateDist;
354    // number of individual vector operations executed
355    Stats::Scalar numVecOpsExecuted;
356    // Total cycles that something is running on the GPU
357    Stats::Scalar totalCycles;
358    Stats::Formula vpc; // vector ops per cycle
359    Stats::Formula ipc; // vector instructions per cycle
360    Stats::Distribution controlFlowDivergenceDist;
361    Stats::Distribution activeLanesPerGMemInstrDist;
362    Stats::Distribution activeLanesPerLMemInstrDist;
363    // number of vector ALU instructions received
364    Stats::Formula numALUInstsExecuted;
365    // number of times a WG can not start due to lack of free VGPRs in SIMDs
366    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
367    Stats::Scalar numCASOps;
368    Stats::Scalar numFailedCASOps;
369    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WF that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
374    std::vector<bool> vectorAluInstAvail;
375    // number of available (oldest) LDS instructions that could have
376    // been issued to the LDS at a specific issue slot
377    int shrMemInstAvail;
378    // number of available Global memory instructions that could have
379    // been issued to TCP at a specific issue slot
380    int glbMemInstAvail;
381
382    void
383    regStats() override;
384
385    LdsState &
386    getLds() const
387    {
388        return lds;
389    }
390
391    int32_t
392    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
393
394    int cacheLineSize() const { return _cacheLineSize; }
395
396    bool
397    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
398
399    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
400    pageDataStruct pageAccesses;
401
402    class CUExitCallback : public Callback
403    {
404      private:
405        ComputeUnit *computeUnit;
406
407      public:
408        virtual ~CUExitCallback() { }
409
410        CUExitCallback(ComputeUnit *_cu)
411        {
412            computeUnit = _cu;
413        }
414
415        virtual void
416        process();
417    };
418
419    CUExitCallback *cuExitCallback;
420
421    /** Data access Port **/
422    class DataPort : public MasterPort
423    {
424      public:
425        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
426            : MasterPort(_name, _cu), computeUnit(_cu),
427              index(_index) { }
428
429        bool snoopRangeSent;
430
431        struct SenderState : public Packet::SenderState
432        {
433            GPUDynInstPtr _gpuDynInst;
434            int port_index;
435            Packet::SenderState *saved;
436
437            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
438                        Packet::SenderState *sender_state=nullptr)
439                : _gpuDynInst(gpuDynInst),
440                  port_index(_port_index),
441                  saved(sender_state) { }
442        };
443
444        void processMemReqEvent(PacketPtr pkt);
445        EventFunctionWrapper *createMemReqEvent(PacketPtr pkt);
446
447        void processMemRespEvent(PacketPtr pkt);
448        EventFunctionWrapper *createMemRespEvent(PacketPtr pkt);
449
450        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
451
452      protected:
453        ComputeUnit *computeUnit;
454        int index;
455
456        virtual bool recvTimingResp(PacketPtr pkt);
457        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
458        virtual void recvFunctional(PacketPtr pkt) { }
459        virtual void recvRangeChange() { }
460        virtual void recvReqRetry();
461
462        virtual void
463        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
464        {
465            resp.clear();
466            snoop = true;
467        }
468
469    };
470
471    // Instruction cache access port
472    class SQCPort : public MasterPort
473    {
474      public:
475        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
476            : MasterPort(_name, _cu), computeUnit(_cu),
477              index(_index) { }
478
479        bool snoopRangeSent;
480
481        struct SenderState : public Packet::SenderState
482        {
483            Wavefront *wavefront;
484            Packet::SenderState *saved;
485
486            SenderState(Wavefront *_wavefront, Packet::SenderState
487                    *sender_state=nullptr)
488                : wavefront(_wavefront), saved(sender_state) { }
489        };
490
491        std::deque<std::pair<PacketPtr, Wavefront*>> retries;
492
493      protected:
494        ComputeUnit *computeUnit;
495        int index;
496
497        virtual bool recvTimingResp(PacketPtr pkt);
498        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
499        virtual void recvFunctional(PacketPtr pkt) { }
500        virtual void recvRangeChange() { }
501        virtual void recvReqRetry();
502
503        virtual void
504        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
505        {
506            resp.clear();
507            snoop = true;
508        }
509     };
510
511    /** Data TLB port **/
512    class DTLBPort : public MasterPort
513    {
514      public:
515        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
516            : MasterPort(_name, _cu), computeUnit(_cu),
517              index(_index), stalled(false)
518        { }
519
520        bool isStalled() { return stalled; }
521        void stallPort() { stalled = true; }
522        void unstallPort() { stalled = false; }
523
524        /**
525         * here we queue all the translation requests that were
526         * not successfully sent.
527         */
528        std::deque<PacketPtr> retries;
529
530        /** SenderState is information carried along with the packet
531         * throughout the TLB hierarchy
532         */
533        struct SenderState: public Packet::SenderState
534        {
535            // the memInst that this is associated with
536            GPUDynInstPtr _gpuDynInst;
537
538            // the lane in the memInst this is associated with, so we send
539            // the memory request down the right port
540            int portIndex;
541
542            // constructor used for packets involved in timing accesses
543            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
544                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
545
546        };
547
548      protected:
549        ComputeUnit *computeUnit;
550        int index;
551        bool stalled;
552
553        virtual bool recvTimingResp(PacketPtr pkt);
554        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
555        virtual void recvFunctional(PacketPtr pkt) { }
556        virtual void recvRangeChange() { }
557        virtual void recvReqRetry();
558    };
559
560    class ITLBPort : public MasterPort
561    {
562      public:
563        ITLBPort(const std::string &_name, ComputeUnit *_cu)
564            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
565
566
567        bool isStalled() { return stalled; }
568        void stallPort() { stalled = true; }
569        void unstallPort() { stalled = false; }
570
571        /**
572         * here we queue all the translation requests that were
573         * not successfully sent.
574         */
575        std::deque<PacketPtr> retries;
576
577        /** SenderState is information carried along with the packet
578         * throughout the TLB hierarchy
579         */
580        struct SenderState: public Packet::SenderState
581        {
582            // The wavefront associated with this request
583            Wavefront *wavefront;
584
585            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
586        };
587
588      protected:
589        ComputeUnit *computeUnit;
590        bool stalled;
591
592        virtual bool recvTimingResp(PacketPtr pkt);
593        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
594        virtual void recvFunctional(PacketPtr pkt) { }
595        virtual void recvRangeChange() { }
596        virtual void recvReqRetry();
597    };
598
599    /**
600     * the port intended to communicate between the CU and its LDS
601     */
602    class LDSPort : public MasterPort
603    {
604      public:
605        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
606        : MasterPort(_name, _cu, _id), computeUnit(_cu)
607        {
608        }
609
610        bool isStalled() const { return stalled; }
611        void stallPort() { stalled = true; }
612        void unstallPort() { stalled = false; }
613
614        /**
615         * here we queue all the requests that were
616         * not successfully sent.
617         */
618        std::queue<PacketPtr> retries;
619
620        /**
621         *  SenderState is information carried along with the packet, esp. the
622         *  GPUDynInstPtr
623         */
624        class SenderState: public Packet::SenderState
625        {
626          protected:
627            // The actual read/write/atomic request that goes with this command
628            GPUDynInstPtr _gpuDynInst = nullptr;
629
630          public:
631            SenderState(GPUDynInstPtr gpuDynInst):
632              _gpuDynInst(gpuDynInst)
633            {
634            }
635
636            GPUDynInstPtr
637            getMemInst() const
638            {
639              return _gpuDynInst;
640            }
641        };
642
643        virtual bool
644        sendTimingReq(PacketPtr pkt);
645
646      protected:
647
648        bool stalled = false; ///< whether or not it is stalled
649
650        ComputeUnit *computeUnit;
651
652        virtual bool
653        recvTimingResp(PacketPtr pkt);
654
655        virtual Tick
656        recvAtomic(PacketPtr pkt) { return 0; }
657
658        virtual void
659        recvFunctional(PacketPtr pkt)
660        {
661        }
662
663        virtual void
664        recvRangeChange()
665        {
666        }
667
668        virtual void
669        recvReqRetry();
670    };
671
672    /** The port to access the Local Data Store
673     *  Can be connected to a LDS object
674     */
675    LDSPort *ldsPort = nullptr;
676
677    LDSPort *
678    getLdsPort() const
679    {
680        return ldsPort;
681    }
682
683    /** The memory port for SIMD data accesses.
684     *  Can be connected to PhysMem for Ruby for timing simulations
685     */
686    std::vector<DataPort*> memPort;
687    // port to the TLB hierarchy (i.e., the L1 TLB)
688    std::vector<DTLBPort*> tlbPort;
689    // port to the SQC (i.e. the I-cache)
690    SQCPort *sqcPort;
691    // port to the SQC TLB (there's a separate TLB for each I-cache)
692    ITLBPort *sqcTLBPort;
693
694    Port &
695    getPort(const std::string &if_name, PortID idx) override
696    {
697        if (if_name == "memory_port") {
698            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
699                                        this, idx);
700            return *memPort[idx];
701        } else if (if_name == "translation_port") {
702            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
703                                        this, idx);
704            return *tlbPort[idx];
705        } else if (if_name == "sqc_port") {
706            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
707                                  this, idx);
708            return *sqcPort;
709        } else if (if_name == "sqc_tlb_port") {
710            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
711            return *sqcTLBPort;
712        } else if (if_name == "ldsPort") {
713            if (ldsPort) {
714                fatal("an LDS port was already allocated");
715            }
716            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
717            return *ldsPort;
718        } else {
719            panic("incorrect port name");
720        }
721    }
722
723    // xact_cas_load()
724    class waveIdentifier
725    {
726      public:
727        waveIdentifier() { }
728        waveIdentifier(int _simdId, int _wfSlotId)
729          : simdId(_simdId), wfSlotId(_wfSlotId) { }
730
731        int simdId;
732        int wfSlotId;
733    };
734
735    class waveQueue
736    {
737      public:
738        std::list<waveIdentifier> waveIDQueue;
739    };
740    std::map<unsigned, waveQueue> xactCasLoadMap;
741
742    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
743
744  private:
745    const int _cacheLineSize;
746    uint64_t globalSeqNum;
747    int wavefrontSize;
748    GPUStaticInst *kernelLaunchInst;
749};
750
751#endif // __COMPUTE_UNIT_HH__
752