src/gpu-compute/compute_unit.hh

2SN/A/*
1762SN/A * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
2SN/A * All rights reserved.
2SN/A *
2SN/A * For use for simulation and test purposes only
2SN/A *
2SN/A * Redistribution and use in source and binary forms, with or without
2SN/A * modification, are permitted provided that the following conditions are met:
2SN/A *
2SN/A * 1. Redistributions of source code must retain the above copyright notice,
2SN/A * this list of conditions and the following disclaimer.
2SN/A *
2SN/A * 2. Redistributions in binary form must reproduce the above copyright notice,
2SN/A * this list of conditions and the following disclaimer in the documentation
2SN/A * and/or other materials provided with the distribution.
2SN/A *
2SN/A * 3. Neither the name of the copyright holder nor the names of its contributors
2SN/A * may be used to endorse or promote products derived from this software
2SN/A * without specific prior written permission.
2SN/A *
2SN/A * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
2SN/A * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2SN/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2SN/A * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
2SN/A * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
2SN/A * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
2665Ssaidi@eecs.umich.edu * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
2665Ssaidi@eecs.umich.edu * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
2665Ssaidi@eecs.umich.edu * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
2665Ssaidi@eecs.umich.edu * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
2665Ssaidi@eecs.umich.edu * POSSIBILITY OF SUCH DAMAGE.
2SN/A *
2SN/A * Author: John Kalamatianos, Anthony Gutierrez
2SN/A */
2SN/A
7349SAli.Saidi@ARM.com#ifndef __COMPUTE_UNIT_HH__
7680Sgblack@eecs.umich.edu#define __COMPUTE_UNIT_HH__
56SN/A
8229Snate@binkert.org#include <deque>
1717SN/A#include <map>
2518SN/A#include <unordered_map>
56SN/A#include <vector>
4776Sgblack@eecs.umich.edu
8232Snate@binkert.org#include "base/callback.hh"
4762Snate@binkert.org#include "base/statistics.hh"
3065Sgblack@eecs.umich.edu#include "base/types.hh"
2SN/A#include "enums/PrefetchType.hh"
2973Sgblack@eecs.umich.edu#include "gpu-compute/exec_stage.hh"
2SN/A#include "gpu-compute/fetch_stage.hh"
3506Ssaidi@eecs.umich.edu#include "gpu-compute/global_memory_pipeline.hh"
4054Sbinkertn@umich.edu#include "gpu-compute/local_memory_pipeline.hh"
4054Sbinkertn@umich.edu#include "gpu-compute/qstruct.hh"
5866Sksewell@umich.edu#include "gpu-compute/schedule_stage.hh"
5866Sksewell@umich.edu#include "gpu-compute/scoreboard_check_stage.hh"
5866Sksewell@umich.edu#include "mem/mem_object.hh"
5866Sksewell@umich.edu#include "mem/port.hh"
5866Sksewell@umich.edu
5866Sksewell@umich.edustatic const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
10417Sandreas.hansson@arm.comstatic const int MAX_WIDTH_FOR_MEM_INST = 32;
4054Sbinkertn@umich.edu
4776Sgblack@eecs.umich.educlass NDRange;
4054Sbinkertn@umich.educlass Shader;
8300Schander.sudanthi@arm.comclass VectorRegisterFile;
8300Schander.sudanthi@arm.com
8300Schander.sudanthi@arm.comstruct ComputeUnitParams;
8300Schander.sudanthi@arm.com
8300Schander.sudanthi@arm.comenum EXEC_POLICY
8300Schander.sudanthi@arm.com{
8232Snate@binkert.org    OLDEST = 0,
5866Sksewell@umich.edu    RR
4054Sbinkertn@umich.edu};
4776Sgblack@eecs.umich.edu
4054Sbinkertn@umich.edu// List of execution units
8300Schander.sudanthi@arm.comenum EXEC_UNIT
8300Schander.sudanthi@arm.com{
8300Schander.sudanthi@arm.com    SIMD0 = 0,
8232Snate@binkert.org    SIMD1,
5715Shsul@eecs.umich.edu    SIMD2,
4776Sgblack@eecs.umich.edu    SIMD3,
4776Sgblack@eecs.umich.edu    GLBMEM_PIPE,
4776Sgblack@eecs.umich.edu    LDSMEM_PIPE,
7720Sgblack@eecs.umich.edu    NUM_UNITS
9809Sumesh.b2006@gmail.com};
9809Sumesh.b2006@gmail.com
9809Sumesh.b2006@gmail.comenum TLB_CACHE
7349SAli.Saidi@ARM.com{
7349SAli.Saidi@ARM.com    TLB_MISS_CACHE_MISS = 0,
5784Sgblack@eecs.umich.edu    TLB_MISS_CACHE_HIT,
7720Sgblack@eecs.umich.edu    TLB_HIT_CACHE_MISS,
7349SAli.Saidi@ARM.com    TLB_HIT_CACHE_HIT
4776Sgblack@eecs.umich.edu};
4776Sgblack@eecs.umich.edu
5784Sgblack@eecs.umich.educlass ComputeUnit : public MemObject
7720Sgblack@eecs.umich.edu{
5784Sgblack@eecs.umich.edu  public:
5784Sgblack@eecs.umich.edu    FetchStage fetchStage;
5784Sgblack@eecs.umich.edu    ScoreboardCheckStage scoreboardCheckStage;
5784Sgblack@eecs.umich.edu    ScheduleStage scheduleStage;
5784Sgblack@eecs.umich.edu    ExecStage execStage;
5784Sgblack@eecs.umich.edu    GlobalMemPipeline globalMemoryPipe;
4776Sgblack@eecs.umich.edu    LocalMemPipeline localMemoryPipe;
4776Sgblack@eecs.umich.edu
4776Sgblack@eecs.umich.edu    // Buffers used to communicate between various pipeline stages
4776Sgblack@eecs.umich.edu
4776Sgblack@eecs.umich.edu    // List of waves which are ready to be scheduled.
7349SAli.Saidi@ARM.com    // Each execution resource has a ready list. readyList is
4776Sgblack@eecs.umich.edu    // used to communicate between scoreboardCheck stage and
5784Sgblack@eecs.umich.edu    // schedule stage
5784Sgblack@eecs.umich.edu    // TODO: make enum to index readyList
5784Sgblack@eecs.umich.edu    std::vector<std::vector<Wavefront*>> readyList;
8232Snate@binkert.org
5784Sgblack@eecs.umich.edu    // Stores the status of waves. A READY implies the
5784Sgblack@eecs.umich.edu    // wave is ready to be scheduled this cycle and
5784Sgblack@eecs.umich.edu    // is already present in the readyList. waveStatusList is
10231Ssteve.reinhardt@amd.com    // used to communicate between scoreboardCheck stage and
7600Sminkyu.jeong@arm.com    // schedule stage
7600Sminkyu.jeong@arm.com    // TODO: convert std::pair to a class to increase readability
7600Sminkyu.jeong@arm.com    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
8232Snate@binkert.org
5784Sgblack@eecs.umich.edu    // List of waves which will be dispatched to
5784Sgblack@eecs.umich.edu    // each execution resource. A FILLED implies
5784Sgblack@eecs.umich.edu    // dispatch list is non-empty and
10665SAli.Saidi@ARM.com    // execution unit has something to execute
5784Sgblack@eecs.umich.edu    // this cycle. Currently, the dispatch list of
5784Sgblack@eecs.umich.edu    // an execution resource can hold only one wave because
8232Snate@binkert.org    // an execution resource can execute only one wave in a cycle.
5784Sgblack@eecs.umich.edu    // dispatchList is used to communicate between schedule
5784Sgblack@eecs.umich.edu    // and exec stage
8232Snate@binkert.org    // TODO: convert std::pair to a class to increase readability
5784Sgblack@eecs.umich.edu    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
10383Smitch.hayenga@arm.com
10383Smitch.hayenga@arm.com    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
10383Smitch.hayenga@arm.com    int rrNextALUWp;
10383Smitch.hayenga@arm.com    typedef ComputeUnitParams Params;
10383Smitch.hayenga@arm.com    std::vector<std::vector<Wavefront*>> wfList;
10383Smitch.hayenga@arm.com    int cu_id;
4776Sgblack@eecs.umich.edu
4776Sgblack@eecs.umich.edu    // array of vector register files, one per SIMD
4776Sgblack@eecs.umich.edu    std::vector<VectorRegisterFile*> vrf;
4776Sgblack@eecs.umich.edu    // Number of vector ALU units (SIMDs) in CU
4776Sgblack@eecs.umich.edu    int numSIMDs;
4776Sgblack@eecs.umich.edu    // number of pipe stages for bypassing data to next dependent single
3506Ssaidi@eecs.umich.edu    // precision vector instruction inside the vector ALU pipeline
3506Ssaidi@eecs.umich.edu    int spBypassPipeLength;
5784Sgblack@eecs.umich.edu    // number of pipe stages for bypassing data to next dependent double
5784Sgblack@eecs.umich.edu    // precision vector instruction inside the vector ALU pipeline
5784Sgblack@eecs.umich.edu    int dpBypassPipeLength;
5784Sgblack@eecs.umich.edu    // number of cycles per issue period
5784Sgblack@eecs.umich.edu    int issuePeriod;
5784Sgblack@eecs.umich.edu
5784Sgblack@eecs.umich.edu    // Number of global and local memory execution resources in CU
5784Sgblack@eecs.umich.edu    int numGlbMemUnits;
5784Sgblack@eecs.umich.edu    int numLocMemUnits;
5784Sgblack@eecs.umich.edu    // tracks the last cycle a vector instruction was executed on a SIMD
5784Sgblack@eecs.umich.edu    std::vector<uint64_t> lastExecCycle;
8232Snate@binkert.org
8232Snate@binkert.org    // true if we allow a separate TLB per lane
8232Snate@binkert.org    bool perLaneTLB;
8232Snate@binkert.org    // if 0, TLB prefetching is off.
5791Srstrong@cs.ucsd.edu    int prefetchDepth;
5784Sgblack@eecs.umich.edu    // if fixed-stride prefetching, this is the stride.
5784Sgblack@eecs.umich.edu    int prefetchStride;
8232Snate@binkert.org
5784Sgblack@eecs.umich.edu    std::vector<Addr> lastVaddrCU;
5784Sgblack@eecs.umich.edu    std::vector<std::vector<Addr>> lastVaddrSimd;
5784Sgblack@eecs.umich.edu    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
5784Sgblack@eecs.umich.edu    Enums::PrefetchType prefetchType;
7811Ssteve.reinhardt@amd.com    EXEC_POLICY exec_policy;
4776Sgblack@eecs.umich.edu
2SN/A    bool xact_cas_mode;
2SN/A    bool debugSegFault;
4776Sgblack@eecs.umich.edu    bool functionalTLB;
2SN/A    bool localMemBarrier;
4776Sgblack@eecs.umich.edu
4776Sgblack@eecs.umich.edu    /*
3748Sgblack@eecs.umich.edu     * for Counting page accesses
5034Smilesck@eecs.umich.edu     *
8902Sandreas.hansson@arm.com     * cuExitCallback inherits from Callback. When you register a callback
     * function as an exit callback, it will get added to an exit callback
     * queue, such that on simulation exit, all callbacks in the callback
     * queue will have their process() function called.
     */
    bool countPages;

    Shader *shader;
    uint32_t barrier_id;
    // vector of Vector ALU (MACC) pipelines
    std::vector<WaitClass> aluPipe;
    // minimum issue period per SIMD unit (in cycles)
    std::vector<WaitClass> wfWait;

    // Resource control for Vector Register File->Global Memory pipe buses
    std::vector<WaitClass> vrfToGlobalMemPipeBus;
    // Resource control for Vector Register File->Local Memory pipe buses
    std::vector<WaitClass> vrfToLocalMemPipeBus;
    int nextGlbMemBus;
    int nextLocMemBus;
    // Resource control for global memory to VRF data/address bus
    WaitClass glbMemToVrfBus;
    // Resource control for local memory to VRF data/address bus
    WaitClass locMemToVrfBus;

    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
    uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store
    uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load

    Tick req_tick_latency;
    Tick resp_tick_latency;

    // number of vector registers being reserved for each SIMD unit
    std::vector<int> vectorRegsReserved;
    // number of vector registers per SIMD unit
    uint32_t numVecRegsPerSimd;
    // Support for scheduling VGPR status update events
    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
    std::vector<uint64_t> timestampVec;
    std::vector<uint8_t>  statusVec;

    void
    registerEvent(uint32_t simdId,
                  uint32_t regIdx,
                  uint32_t operandSize,
                  uint64_t when,
                  uint8_t newStatus) {
        regIdxVec.push_back(std::make_pair(simdId, regIdx));
        timestampVec.push_back(when);
        statusVec.push_back(newStatus);
        if (operandSize > 4) {
            regIdxVec.push_back(std::make_pair(simdId,
                                               ((regIdx + 1) %
                                                numVecRegsPerSimd)));
            timestampVec.push_back(when);
            statusVec.push_back(newStatus);
        }
    }

    void updateEvents();

    // this hash map will keep track of page divergence
    // per memory instruction per wavefront. The hash map
    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
    std::map<Addr, int> pagesTouched;

    ComputeUnit(const Params *p);
    ~ComputeUnit();
    int spBypassLength() { return spBypassPipeLength; };
    int dpBypassLength() { return dpBypassPipeLength; };
    int storeBusLength() { return numCyclesPerStoreTransfer; };
    int loadBusLength() { return numCyclesPerLoadTransfer; };
    int wfSize() const { return wavefrontSize; };

    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    void exec();
    void initiateFetch(Wavefront *wavefront);
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void fillKernelState(Wavefront *w, NDRange *ndr);

    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
                        NDRange *ndr);

    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);

    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
    int GlbMemUnitId() { return GLBMEM_PIPE; }
    int ShrMemUnitId() { return LDSMEM_PIPE; }
    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
    /* This function cycles through all the wavefronts in all the phases to see
     * if all of the wavefronts which should be associated with one barrier
     * (denoted with _barrier_id), are all at the same barrier in the program
     * (denoted by bcnt). When the number at the barrier matches bslots, then
     * return true.
     */
    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
    bool cedeSIMD(int simdId, int wfSlotId);

    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
    virtual void init();
    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
                              bool kernelLaunch=true,
                              RequestPtr req=nullptr);
    void handleMemPacket(PacketPtr pkt, int memport_index);
    bool processTimingPacket(PacketPtr pkt);
    void processFetchReturn(PacketPtr pkt);
    void updatePageDivergenceDist(Addr addr);

    MasterID masterId() { return _masterId; }

    bool isDone() const;
    bool isSimdDone(uint32_t) const;

  protected:
    MasterID _masterId;

    LdsState &lds;

  public:
    Stats::Scalar vALUInsts;
    Stats::Formula vALUInstsPerWF;
    Stats::Scalar sALUInsts;
    Stats::Formula sALUInstsPerWF;
    Stats::Scalar instCyclesVALU;
    Stats::Scalar instCyclesSALU;
    Stats::Scalar threadCyclesVALU;
    Stats::Formula vALUUtilization;
    Stats::Scalar ldsNoFlatInsts;
    Stats::Formula ldsNoFlatInstsPerWF;
    Stats::Scalar flatVMemInsts;
    Stats::Formula flatVMemInstsPerWF;
    Stats::Scalar flatLDSInsts;
    Stats::Formula flatLDSInstsPerWF;
    Stats::Scalar vectorMemWrites;
    Stats::Formula vectorMemWritesPerWF;
    Stats::Scalar vectorMemReads;
    Stats::Formula vectorMemReadsPerWF;
    Stats::Scalar scalarMemWrites;
    Stats::Formula scalarMemWritesPerWF;
    Stats::Scalar scalarMemReads;
    Stats::Formula scalarMemReadsPerWF;

    void updateInstStats(GPUDynInstPtr gpuDynInst);

    // the following stats compute the avg. TLB accesslatency per
    // uncoalesced request (only for data)
    Stats::Scalar tlbRequests;
    Stats::Scalar tlbCycles;
    Stats::Formula tlbLatency;
    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
    Stats::Vector hitsPerTLBLevel;

    Stats::Scalar ldsBankAccesses;
    Stats::Distribution ldsBankConflictDist;

    // over all memory instructions executed over all wavefronts
    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
    Stats::Distribution pageDivergenceDist;
    Stats::Scalar dynamicGMemInstrCnt;
    Stats::Scalar dynamicLMemInstrCnt;

    Stats::Scalar wgBlockedDueLdsAllocation;
    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
    // when the instruction is committed, this number is still incremented by 1
    Stats::Scalar numInstrExecuted;
    // Number of cycles among successive instruction executions across all
    // wavefronts of the same CU
    Stats::Distribution execRateDist;
    // number of individual vector operations executed
    Stats::Scalar numVecOpsExecuted;
    // Total cycles that something is running on the GPU
    Stats::Scalar totalCycles;
    Stats::Formula vpc; // vector ops per cycle
    Stats::Formula ipc; // vector instructions per cycle
    Stats::Distribution controlFlowDivergenceDist;
    Stats::Distribution activeLanesPerGMemInstrDist;
    Stats::Distribution activeLanesPerLMemInstrDist;
    // number of vector ALU instructions received
    Stats::Formula numALUInstsExecuted;
    // number of times a WG can not start due to lack of free VGPRs in SIMDs
    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
    Stats::Scalar numCASOps;
    Stats::Scalar numFailedCASOps;
    Stats::Scalar completedWfs;
    // flag per vector SIMD unit that is set when there is at least one
    // WV that has a vector ALU instruction as the oldest in its
    // Instruction Buffer: Defined in the Scoreboard stage, consumed
    // by the Execute stage.
    std::vector<bool> vectorAluInstAvail;
    // number of available (oldest) LDS instructions that could have
    // been issued to the LDS at a specific issue slot
    int shrMemInstAvail;
    // number of available Global memory instructions that could have
    // been issued to TCP at a specific issue slot
    int glbMemInstAvail;

    void
    regStats();

    LdsState &
    getLds() const
    {
        return lds;
    }

    int32_t
    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;

    bool
    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));

    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
    pageDataStruct pageAccesses;

    class CUExitCallback : public Callback
    {
      private:
        ComputeUnit *computeUnit;

      public:
        virtual ~CUExitCallback() { }

        CUExitCallback(ComputeUnit *_cu)
        {
            computeUnit = _cu;
        }

        virtual void
        process();
    };

    CUExitCallback *cuExitCallback;

    /** Data access Port **/
    class DataPort : public MasterPort
    {
      public:
        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            GPUDynInstPtr _gpuDynInst;
            int port_index;
            Packet::SenderState *saved;

            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
                        Packet::SenderState *sender_state=nullptr)
                : _gpuDynInst(gpuDynInst),
                  port_index(_port_index),
                  saved(sender_state) { }
        };

        class MemReqEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
              setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        class MemRespEvent : public Event
        {
          private:
            DataPort *dataPort;
            PacketPtr pkt;

          public:
            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
                : Event(), dataPort(_data_port), pkt(_pkt)
            {
              setFlags(Event::AutoDelete);
            }

            void process();
            const char *description() const;
        };

        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }

    };

    // Instruction cache access port
    class SQCPort : public MasterPort
    {
      public:
        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index) { }

        bool snoopRangeSent;

        struct SenderState : public Packet::SenderState
        {
            Wavefront *wavefront;
            Packet::SenderState *saved;

            SenderState(Wavefront *_wavefront, Packet::SenderState
                    *sender_state=nullptr)
                : wavefront(_wavefront), saved(sender_state) { }
        };

        std::deque<std::pair<PacketPtr, Wavefront*>> retries;

      protected:
        ComputeUnit *computeUnit;
        int index;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();

        virtual void
        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
        {
            resp.clear();
            snoop = true;
        }
     };

    /** Data TLB port **/
    class DTLBPort : public MasterPort
    {
      public:
        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
            : MasterPort(_name, _cu), computeUnit(_cu),
              index(_index), stalled(false)
        { }

        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // the memInst that this is associated with
            GPUDynInstPtr _gpuDynInst;

            // the lane in the memInst this is associated with, so we send
            // the memory request down the right port
            int portIndex;

            // constructor used for packets involved in timing accesses
            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }

        };

      protected:
        ComputeUnit *computeUnit;
        int index;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    class ITLBPort : public MasterPort
    {
      public:
        ITLBPort(const std::string &_name, ComputeUnit *_cu)
            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }


        bool isStalled() { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the translation requests that were
         * not successfully sent.
         */
        std::deque<PacketPtr> retries;

        /** SenderState is information carried along with the packet
         * throughout the TLB hierarchy
         */
        struct SenderState: public Packet::SenderState
        {
            // The wavefront associated with this request
            Wavefront *wavefront;

            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
        };

      protected:
        ComputeUnit *computeUnit;
        bool stalled;

        virtual bool recvTimingResp(PacketPtr pkt);
        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
        virtual void recvFunctional(PacketPtr pkt) { }
        virtual void recvRangeChange() { }
        virtual void recvReqRetry();
    };

    /**
     * the port intended to communicate between the CU and its LDS
     */
    class LDSPort : public MasterPort
    {
      public:
        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
        : MasterPort(_name, _cu, _id), computeUnit(_cu)
        {
        }

        bool isStalled() const { return stalled; }
        void stallPort() { stalled = true; }
        void unstallPort() { stalled = false; }

        /**
         * here we queue all the requests that were
         * not successfully sent.
         */
        std::queue<PacketPtr> retries;

        /**
         *  SenderState is information carried along with the packet, esp. the
         *  GPUDynInstPtr
         */
        class SenderState: public Packet::SenderState
        {
          protected:
            // The actual read/write/atomic request that goes with this command
            GPUDynInstPtr _gpuDynInst = nullptr;

          public:
            SenderState(GPUDynInstPtr gpuDynInst):
              _gpuDynInst(gpuDynInst)
            {
            }

            GPUDynInstPtr
            getMemInst() const
            {
              return _gpuDynInst;
            }
        };

        virtual bool
        sendTimingReq(PacketPtr pkt);

      protected:

        bool stalled = false; ///< whether or not it is stalled

        ComputeUnit *computeUnit;

        virtual bool
        recvTimingResp(PacketPtr pkt);

        virtual Tick
        recvAtomic(PacketPtr pkt) { return 0; }

        virtual void
        recvFunctional(PacketPtr pkt)
        {
        }

        virtual void
        recvRangeChange()
        {
        }

        virtual void
        recvReqRetry();
    };

    /** The port to access the Local Data Store
     *  Can be connected to a LDS object
     */
    LDSPort *ldsPort = nullptr;

    LDSPort *
    getLdsPort() const
    {
        return ldsPort;
    }

    /** The memory port for SIMD data accesses.
     *  Can be connected to PhysMem for Ruby for timing simulations
     */
    std::vector<DataPort*> memPort;
    // port to the TLB hierarchy (i.e., the L1 TLB)
    std::vector<DTLBPort*> tlbPort;
    // port to the SQC (i.e. the I-cache)
    SQCPort *sqcPort;
    // port to the SQC TLB (there's a separate TLB for each I-cache)
    ITLBPort *sqcTLBPort;

    virtual BaseMasterPort&
    getMasterPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "memory_port") {
            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *memPort[idx];
        } else if (if_name == "translation_port") {
            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
                                        this, idx);
            return *tlbPort[idx];
        } else if (if_name == "sqc_port") {
            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
                                  this, idx);
            return *sqcPort;
        } else if (if_name == "sqc_tlb_port") {
            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
            return *sqcTLBPort;
        } else if (if_name == "ldsPort") {
            if (ldsPort) {
                fatal("an LDS port was already allocated");
            }
            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
            return *ldsPort;
        } else {
            panic("incorrect port name");
        }
    }

    // xact_cas_load()
    class waveIdentifier
    {
      public:
        waveIdentifier() { }
        waveIdentifier(int _simdId, int _wfSlotId)
          : simdId(_simdId), wfSlotId(_wfSlotId) { }

        int simdId;
        int wfSlotId;
    };

    class waveQueue
    {
      public:
        std::list<waveIdentifier> waveIDQueue;
    };
    std::map<unsigned, waveQueue> xactCasLoadMap;

    uint64_t getAndIncSeqNum() { return globalSeqNum++; }

  private:
    uint64_t globalSeqNum;
    int wavefrontSize;
    GPUStaticInst *kernelLaunchInst;
};

#endif // __COMPUTE_UNIT_HH__