compute_unit.hh revision 11698
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: John Kalamatianos, Anthony Gutierrez
34 */
35
36#ifndef __COMPUTE_UNIT_HH__
37#define __COMPUTE_UNIT_HH__
38
#include <deque>
#include <list>
#include <map>
#include <queue>
#include <unordered_map>
#include <vector>

#include "base/callback.hh"
#include "base/statistics.hh"
#include "base/types.hh"
#include "enums/PrefetchType.hh"
#include "gpu-compute/exec_stage.hh"
#include "gpu-compute/fetch_stage.hh"
#include "gpu-compute/global_memory_pipeline.hh"
#include "gpu-compute/local_memory_pipeline.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/schedule_stage.hh"
#include "gpu-compute/scoreboard_check_stage.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
57
// Maximum number of registers accessed by a single non-vector-memory
// instruction (used to size per-instruction register bookkeeping).
// constexpr (the file already uses C++11) guarantees a compile-time
// constant usable in constant expressions, unlike header-scope
// `static const`.
static constexpr int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
// Maximum width, in lanes/registers, of a vector memory instruction.
static constexpr int MAX_WIDTH_FOR_MEM_INST = 32;
60
// Forward declarations: these types are referenced only via pointer or
// reference in this header, so their full definitions are not needed here.
class NDRange;
class Shader;
class VectorRegisterFile;

// Generated parameter struct for ComputeUnit (from the SimObject Python side).
struct ComputeUnitParams;
66
// Wavefront selection (scheduling) policies supported by the compute unit.
enum EXEC_POLICY
{
    OLDEST = 0, // always pick the oldest ready wavefront
    RR = 1      // round-robin among ready wavefronts
};
72
73// List of execution units
// Identifiers for the CU's execution resources: four vector SIMD units
// followed by the global-memory and LDS (local-memory) pipelines.
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1 = 1,
    SIMD2 = 2,
    SIMD3 = 3,
    GLBMEM_PIPE = 4, // global memory pipeline
    LDSMEM_PIPE = 5, // local data store (shared memory) pipeline
    NUM_UNITS = 6    // total count of execution resources
};
84
// Joint TLB/cache outcome classification for a memory access, used when
// recording translation/cache statistics.
enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0, // missed both the TLB and the cache
    TLB_MISS_CACHE_HIT = 1,  // TLB miss, but the line was in the cache
    TLB_HIT_CACHE_MISS = 2,  // translation hit, data missed in cache
    TLB_HIT_CACHE_HIT = 3    // hit in both
};
92
/**
 * A GPU compute unit (CU). Owns the per-CU pipeline stages (fetch,
 * scoreboard check, schedule, execute, and the global/local memory
 * pipelines), the vector register files, and the ports that connect the
 * CU to the memory system (data cache, TLBs, SQC/I-cache, and LDS).
 */
class ComputeUnit : public MemObject
{
  public:
    FetchStage fetchStage;
    ScoreboardCheckStage scoreboardCheckStage;
    ScheduleStage scheduleStage;
    ExecStage execStage;
    GlobalMemPipeline globalMemoryPipe;
    LocalMemPipeline localMemoryPipe;

    // Buffers used to communicate between various pipeline stages

    // List of waves which are ready to be scheduled.
    // Each execution resource has a ready list. readyList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: make enum to index readyList
    std::vector<std::vector<Wavefront*>> readyList;

    // Stores the status of waves. A READY implies the
    // wave is ready to be scheduled this cycle and
    // is already present in the readyList. waveStatusList is
    // used to communicate between scoreboardCheck stage and
    // schedule stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;

    // List of waves which will be dispatched to
    // each execution resource. A FILLED implies
    // dispatch list is non-empty and
    // execution unit has something to execute
    // this cycle. Currently, the dispatch list of
    // an execution resource can hold only one wave because
    // an execution resource can execute only one wave in a cycle.
    // dispatchList is used to communicate between schedule
    // and exec stage
    // TODO: convert std::pair to a class to increase readability
    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;

    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
    // round-robin pointer for ALU issue under the RR policy
    // (name-based reading — confirm against exec stage usage)
    int rrNextALUWp;
    typedef ComputeUnitParams Params;
    // wavefront slots, indexed [simdId][wfSlotId]
    std::vector<std::vector<Wavefront*>> wfList;
    // identifier of this CU within the shader
    int cu_id;

    // array of vector register files, one per SIMD
    std::vector<VectorRegisterFile*> vrf;
    // Number of vector ALU units (SIMDs) in CU
    int numSIMDs;
    // number of pipe stages for bypassing data to next dependent single
    // precision vector instruction inside the vector ALU pipeline
    int spBypassPipeLength;
    // number of pipe stages for bypassing data to next dependent double
    // precision vector instruction inside the vector ALU pipeline
    int dpBypassPipeLength;
    // number of cycles per issue period
    int issuePeriod;

    // Number of global and local memory execution resources in CU
    int numGlbMemUnits;
    int numLocMemUnits;
    // tracks the last cycle a vector instruction was executed on a SIMD
    std::vector<uint64_t> lastExecCycle;

    // true if we allow a separate TLB per lane
    bool perLaneTLB;
    // if 0, TLB prefetching is off.
    int prefetchDepth;
    // if fixed-stride prefetching, this is the stride.
    int prefetchStride;

    // last virtual addresses seen, tracked at CU, per-SIMD, and per-WF
    // granularity (used by the TLB prefetcher)
    std::vector<Addr> lastVaddrCU;
    std::vector<std::vector<Addr>> lastVaddrSimd;
    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
    Enums::PrefetchType prefetchType;
    // wavefront scheduling policy (OLDEST or RR)
    EXEC_POLICY exec_policy;

    bool xact_cas_mode;
    bool debugSegFault;
    // if true, TLB translations are done functionally (instantaneously)
    bool functionalTLB;
    bool localMemBarrier;

    /*
     * for Counting page accesses
     *
     * cuExitCallback inherits from Callback. When you register a callback
     * function as an exit callback, it will get added to an exit callback
     * queue, such that on simulation exit, all callbacks in the callback
     * queue will have their process() function called.
     */
    bool countPages;

    // parent shader this CU belongs to
    Shader *shader;
    uint32_t barrier_id;
    // vector of Vector ALU (MACC) pipelines
    std::vector<WaitClass> aluPipe;
    // minimum issue period per SIMD unit (in cycles)
    std::vector<WaitClass> wfWait;

    // Resource control for Vector Register File->Global Memory pipe buses
    std::vector<WaitClass> vrfToGlobalMemPipeBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    std::vector<WaitClass> vrfToLocalMemPipeBus;
    int nextGlbMemBus;
    int nextLocMemBus;
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;

    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store
    uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load

    // latencies (in ticks) added to outgoing requests / incoming responses
    Tick req_tick_latency;
    Tick resp_tick_latency;

    // number of vector registers being reserved for each SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of vector registers per SIMD unit
    uint32_t numVecRegsPerSimd;
    // Support for scheduling VGPR status update events
    // (parallel vectors: entry i of each describes one pending event)
    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t>  statusVec;

    /**
     * Queue a VGPR status-change event: register (simdId, regIdx) will
     * take status newStatus at time `when`. For operands wider than 4
     * bytes, the next register (wrapping modulo numVecRegsPerSimd) is
     * queued as well, since the operand occupies two registers.
     */
    void
    registerEvent(uint32_t simdId,
                  uint32_t regIdx,
                  uint32_t operandSize,
                  uint64_t when,
                  uint8_t newStatus) {
        regIdxVec.push_back(std::make_pair(simdId, regIdx));
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);
        if (operandSize > 4) {
            regIdxVec.push_back(std::make_pair(simdId,
                                               ((regIdx + 1) %
                                                numVecRegsPerSimd)));
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }

    // drain queued register events whose timestamp has been reached
    void updateEvents();

    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    ComputeUnit(const Params *p);
    ~ComputeUnit();
    // accessors for pipeline/bus timing parameters
    int spBypassLength() { return spBypassPipeLength; };
    int dpBypassLength() { return dpBypassPipeLength; };
    int storeBusLength() { return numCyclesPerStoreTransfer; };
    int loadBusLength() { return numCyclesPerLoadTransfer; };
    int wfSize() const { return wavefrontSize; };

    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    // advance the CU by one cycle
    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, NDRange *ndr);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        NDRange *ndr);

    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);

    // predicates/ids for classifying an execution-resource index
    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
    int GlbMemUnitId() { return GLBMEM_PIPE; }
    int ShrMemUnitId() { return LDSMEM_PIPE; }
    // round-robin selection of the next global/local memory bus
    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
    /* This function cycles through all the wavefronts in all the phases to see
     * if all of the wavefronts which should be associated with one barrier
     * (denoted with _barrier_id), are all at the same barrier in the program
     * (denoted by bcnt). When the number at the barrier matches bslots, then
     * return true.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
    bool cedeSIMD(int simdId, int wfSlotId);

    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
    virtual void init();
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelLaunch=true,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    MasterID masterId() { return _masterId; }

    bool isDone() const;
    bool isSimdDone(uint32_t) const;

  protected:
    MasterID _masterId;

    // the Local Data Store shared by the wavefronts on this CU
    LdsState &lds;

  public:
    // --- per-CU statistics ---
    Stats::Scalar vALUInsts;
    Stats::Formula vALUInstsPerWF;
    Stats::Scalar sALUInsts;
    Stats::Formula sALUInstsPerWF;
    Stats::Scalar instCyclesVALU;
    Stats::Scalar instCyclesSALU;
    Stats::Scalar threadCyclesVALU;
    Stats::Formula vALUUtilization;
    Stats::Scalar ldsNoFlatInsts;
    Stats::Formula ldsNoFlatInstsPerWF;
    Stats::Scalar flatVMemInsts;
    Stats::Formula flatVMemInstsPerWF;
    Stats::Scalar flatLDSInsts;
    Stats::Formula flatLDSInstsPerWF;
    Stats::Scalar vectorMemWrites;
    Stats::Formula vectorMemWritesPerWF;
    Stats::Scalar vectorMemReads;
    Stats::Formula vectorMemReadsPerWF;
    Stats::Scalar scalarMemWrites;
    Stats::Formula scalarMemWritesPerWF;
    Stats::Scalar scalarMemReads;
    Stats::Formula scalarMemReadsPerWF;

    void updateInstStats(GPUDynInstPtr gpuDynInst);

    // the following stats compute the avg. TLB accesslatency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts
    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
    // when the instruction is committed, this number is still incremented by 1
    Stats::Scalar numInstrExecuted;
    // Number of cycles among successive instruction executions across all
    // wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // Total cycles that something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG can not start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;

    void
    regStats();

    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    int cacheLineSize() const { return _cacheLineSize; }

    // dispatch an LDS request; the attribute forces callers to check
    // whether the send succeeded
    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    // Exit callback used to dump per-page access counts when countPages
    // is enabled (see the countPages comment above).
    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void
        process();
    };

    CUExitCallback *cuExitCallback;

    /** Data access Port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        // per-packet state carried through the memory system and restored
        // when the response comes back
        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        // event used to send a request through this port at a later tick;
        // AutoDelete: the event queue frees the event after it fires
        class MemReqEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
              setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        // event used to process a response at a later tick (models
        // resp_tick_latency); auto-deleted after firing
        class MemRespEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
              setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        // packets that could not be sent and await a retry
        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }

    };

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        // carries the requesting wavefront along with each fetch packet
        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                    *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        // fetch packets that could not be sent and await a retry
        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
     };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }

        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    // Instruction TLB port (translations for fetches through the SQC)
    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }


        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
        : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         *  SenderState is information carried along with the packet, esp. the
         *  GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
              _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
              return _gpuDynInst;
            }
        };

        // overridden so sends through a stalled port are queued in
        // `retries` rather than forwarded (see the .cc implementation)
        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /** The port to access the Local Data Store
     *  Can be connected to a LDS object
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *
    getLdsPort() const
    {
        return ldsPort;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem for Ruby for timing simulations
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    /**
     * Lazily allocate and return the port named if_name. Indexed ports
     * (memory_port, translation_port) use idx to select the slot; the
     * LDS port may only be allocated once (fatal otherwise); an unknown
     * name panics.
     */
    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }

    // xact_cas_load()
    // identifies a wavefront by its SIMD unit and wavefront slot
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
          : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    // FIFO of wavefronts waiting on a transactional CAS address
    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    // maps a CAS address/id to the queue of waves waiting on it
    std::map<unsigned, waveQueue> xactCasLoadMap;

    // monotonically increasing sequence number for outgoing requests
    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    const int _cacheLineSize;
    uint64_t globalSeqNum;
    // number of work-items per wavefront
    int wavefrontSize;
    // pseudo-instruction injected at kernel launch (acts as a mem fence)
    GPUStaticInst *kernelLaunchInst;
};
777
778#endif // __COMPUTE_UNIT_HH__
779