compute_unit.hh revision 11308:7d8836fd043d
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos, Anthony Gutierrez
 */

#ifndef __COMPUTE_UNIT_HH__
#define __COMPUTE_UNIT_HH__

#include <deque>
#include <list>
#include <map>
#include <queue>
#include <unordered_map>
#include <vector>

#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"

static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static const int MAX_WIDTH_FOR_MEM_INST = 32;

class NDRange;
class Shader;
class VectorRegisterFile;

struct ComputeUnitParams;

enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};

// List of execution units
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};

enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};

class ComputeUnit : public MemObject
{
  public:
    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;

    // Buffers used to communicate between various pipeline stages

    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list. readyList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: make enum to index readyList
    std::vector<std::vector<Wavefront*>> readyList;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList. waveStatusList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies
    // dispatch list is non-empty and
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    // dispatchList is used to communicate between schedule
    // and exec stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
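
    // Rough usage sketch of the three buffers above (the indexing scheme is
    // an assumption inferred from the comments, not a definitive
    // description): the scoreboardCheck stage appends schedulable waves to
    // readyList[unitId] and marks them READY in waveStatusList; the schedule
    // stage then picks at most one wave per execution resource and places it
    // in dispatchList[unitId] with status FILLED for the exec stage to
    // consume.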

    int rrNextMemID; // used by RR WF exec policy to cycle through WFs
    int rrNextALUWp;
    typedef ComputeUnitParams Params;
    std::vector<std::vector<Wavefront*>> wfList;
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // Number of vector ALU units (SIMDs) in CU
    int numSIMDs;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of cycles per issue period
    int issuePeriod;

    // Number of global and local memory execution resources in CU
    int numGlbMemUnits;
    int numLocMemUnits;
    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;
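
    // Illustrative example (hypothetical values): with prefetchDepth = 4 and
    // prefetchStride = 64, a translation for virtual address V could also
    // warm the TLB for V + 64, V + 128, V + 192 and V + 256, depending on
    // the configured Enums::PrefetchType.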

    class LastVaddrWave
    {
      public:
        Addr vaddrs[VSZ];
        Addr& operator[](int idx) {
            return vaddrs[idx];
        }

        LastVaddrWave() {
            for (int i = 0; i < VSZ; ++i)
                vaddrs[i] = 0;
        }
    };

    LastVaddrWave lastVaddrCU;
    std::vector<LastVaddrWave> lastVaddrPhase;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    Enums::PrefetchType prefetchType;
    EXEC_POLICY exec_policy;

    bool xact_cas_mode;
    bool debugSegFault;
    bool functionalTLB;
    bool localMemBarrier;

    /*
     * for Counting page accesses
     *
     * cuExitCallback inherits from Callback. When you register a callback
     * function as an exit callback, it will get added to an exit callback
     * queue, such that on simulation exit, all callbacks in the callback
     * queue will have their process() function called.
     */
    bool countPages;
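
    // If countPages is set, the per-page access counts accumulated in
    // pageAccesses (declared below) can be reported by CUExitCallback's
    // process() at simulation exit; the callback is presumably registered
    // with registerExitCallback() when the ComputeUnit is constructed.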

    Shader *shader;
    uint32_t barrier_id;
    // vector of Vector ALU (MACC) pipelines
    std::vector<WaitClass> aluPipe;
    // minimum issue period per SIMD unit (in cycles)
    std::vector<WaitClass> wfWait;

    // Resource control for Vector Register File->Global Memory pipe buses
    std::vector<WaitClass> vrfToGlobalMemPipeBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    std::vector<WaitClass> vrfToLocalMemPipeBus;
    int nextGlbMemBus;
    int nextLocMemBus;
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;

    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store
    uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load

    Tick req_tick_latency;
    Tick resp_tick_latency;

    // number of vector registers being reserved for each SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of vector registers per SIMD unit
    uint32_t numVecRegsPerSimd;
    // Support for scheduling VGPR status update events
    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t>  statusVec;

    void
    registerEvent(uint32_t simdId,
                  uint32_t regIdx,
                  uint32_t operandSize,
                  uint64_t when,
                  uint8_t newStatus) {
        regIdxVec.push_back(std::make_pair(simdId, regIdx));
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);
        if (operandSize > 4) {
            regIdxVec.push_back(std::make_pair(simdId,
                                               ((regIdx + 1) %
                                                numVecRegsPerSimd)));
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }
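
    // Usage sketch (hypothetical arguments): recording a status change for a
    // 64-bit (8-byte) operand starting at register 5 of SIMD 2 enqueues two
    // entries, one for register 5 and one for register 6 (modulo
    // numVecRegsPerSimd), both applied once updateEvents() runs at or after
    // tick 'when':
    //
    //   registerEvent(2, 5, 8, when, newStatus);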

    void updateEvents();

    // this map keeps track of page divergence
    // per memory instruction per wavefront. The map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    ComputeUnit(const Params *p);
    ~ComputeUnit();
    int spBypassLength() { return spBypassPipeLength; };
    int dpBypassLength() { return dpBypassPipeLength; };
    int storeBusLength() { return numCyclesPerStoreTransfer; };
    int loadBusLength() { return numCyclesPerLoadTransfer; };
    int wfSize() const { return wavefrontSize; };

    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void FillKernelState(Wavefront *w, NDRange *ndr);

    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
                 int trueWgSizeTotal);

    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
                             int trueWgSize[], int trueWgSizeTotal,
                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);

    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);

    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
    int GlbMemUnitId() { return GLBMEM_PIPE; }
    int ShrMemUnitId() { return LDSMEM_PIPE; }
    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
    /* This function cycles through all the wavefronts in all the phases to
     * see whether all of the wavefronts which should be associated with one
     * barrier (denoted with _barrier_id) are at the same barrier in the
     * program (denoted by bcnt). When the number at the barrier matches
     * bslots, it returns true.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
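    // Example (hypothetical values): for a workgroup with four wavefronts
    // sharing barrier _barrier_id (so bslots == 4), AllAtBarrier(_barrier_id,
    // bcnt, 4) should only report the barrier as satisfied once all four
    // wavefronts have reached barrier number bcnt.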
    bool cedeSIMD(int simdId, int wfSlotId);

    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
    virtual void init();
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelLaunch=true,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    MasterID masterId() { return _masterId; }

    bool isDone() const;
    bool isSimdDone(uint32_t) const;

  protected:
    MasterID _masterId;

    LdsState &lds;

  public:
    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts
    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are
    // active when the instruction is committed, this number is still
    // incremented by 1
    Stats::Scalar numInstrExecuted;
    // Number of cycles between successive instruction executions across all
    // wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // Total cycles that something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG cannot start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;

    void
    regStats();
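
    // The derived Stats::Formula members above are presumably defined in
    // regStats() along the lines their comments suggest, e.g.
    //   tlbLatency = tlbCycles / tlbRequests
    //   vpc = numVecOpsExecuted / totalCycles
    //   ipc = numInstrExecuted / totalCycles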

    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void
        process();
    };

    CUExitCallback *cuExitCallback;

    /** Data access Port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class MemReqEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
              setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class MemRespEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
              setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }

    };

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                    *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
    };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }

        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
        : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         *  SenderState is information carried along with the packet, esp. the
         *  GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
              _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
              return _gpuDynInst;
            }
        };

        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /** The port to access the Local Data Store
     *  Can be connected to an LDS object
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *
    getLdsPort() const
    {
        return ldsPort;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem or to Ruby for timing simulations
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }
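
    // Usage note: the port objects above are created lazily when the
    // simulator resolves named ports from the configuration; e.g. a
    // hypothetical lookup getMasterPort("memory_port", 2) allocates and
    // returns memPort[2], while "ldsPort" may only be bound once.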

    // xact_cas_load()
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
          : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    std::map<unsigned, waveQueue> xactCasLoadMap;

    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    uint64_t globalSeqNum;
    int wavefrontSize;
};

#endif // __COMPUTE_UNIT_HH__