compute_unit.hh revision 11695:0a65922d564d
12SN/A/*
21762SN/A * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
32SN/A * All rights reserved.
42SN/A *
52SN/A * For use for simulation and test purposes only
62SN/A *
72SN/A * Redistribution and use in source and binary forms, with or without
82SN/A * modification, are permitted provided that the following conditions are met:
92SN/A *
102SN/A * 1. Redistributions of source code must retain the above copyright notice,
112SN/A * this list of conditions and the following disclaimer.
122SN/A *
132SN/A * 2. Redistributions in binary form must reproduce the above copyright notice,
142SN/A * this list of conditions and the following disclaimer in the documentation
152SN/A * and/or other materials provided with the distribution.
162SN/A *
172SN/A * 3. Neither the name of the copyright holder nor the names of its contributors
182SN/A * may be used to endorse or promote products derived from this software
192SN/A * without specific prior written permission.
202SN/A *
212SN/A * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
222SN/A * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
232SN/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
242SN/A * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
252SN/A * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
262SN/A * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
272665Ssaidi@eecs.umich.edu * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
282665Ssaidi@eecs.umich.edu * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
292665Ssaidi@eecs.umich.edu * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
302665Ssaidi@eecs.umich.edu * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
312665Ssaidi@eecs.umich.edu * POSSIBILITY OF SUCH DAMAGE.
322SN/A *
332SN/A * Author: John Kalamatianos, Anthony Gutierrez
342SN/A */
352SN/A
367349SAli.Saidi@ARM.com#ifndef __COMPUTE_UNIT_HH__
377680Sgblack@eecs.umich.edu#define __COMPUTE_UNIT_HH__
3856SN/A
#include <deque>
#include <list>
#include <map>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
434776Sgblack@eecs.umich.edu
448232Snate@binkert.org#include "base/callback.hh"
454762Snate@binkert.org#include "base/statistics.hh"
463065Sgblack@eecs.umich.edu#include "base/types.hh"
472SN/A#include "enums/PrefetchType.hh"
482973Sgblack@eecs.umich.edu#include "gpu-compute/exec_stage.hh"
492SN/A#include "gpu-compute/fetch_stage.hh"
503506Ssaidi@eecs.umich.edu#include "gpu-compute/global_memory_pipeline.hh"
514054Sbinkertn@umich.edu#include "gpu-compute/local_memory_pipeline.hh"
524054Sbinkertn@umich.edu#include "gpu-compute/qstruct.hh"
535866Sksewell@umich.edu#include "gpu-compute/schedule_stage.hh"
545866Sksewell@umich.edu#include "gpu-compute/scoreboard_check_stage.hh"
555866Sksewell@umich.edu#include "mem/mem_object.hh"
565866Sksewell@umich.edu#include "mem/port.hh"
575866Sksewell@umich.edu
// Compile-time limits for memory instructions. The exact meaning of each
// limit is inferred from its name only -- TODO confirm against the users
// of these constants.
static constexpr int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
static constexpr int MAX_WIDTH_FOR_MEM_INST = 32;
604054Sbinkertn@umich.edu
614776Sgblack@eecs.umich.educlass NDRange;
624054Sbinkertn@umich.educlass Shader;
638300Schander.sudanthi@arm.comclass VectorRegisterFile;
648300Schander.sudanthi@arm.com
658300Schander.sudanthi@arm.comstruct ComputeUnitParams;
668300Schander.sudanthi@arm.com
// Wavefront execution policy selector (type of ComputeUnit::exec_policy).
// NOTE(review): OLDEST presumably means oldest-first and RR round-robin;
// confirm against the scheduler implementation.
enum EXEC_POLICY
{
    OLDEST = 0,
    RR
};
724776Sgblack@eecs.umich.edu
// List of execution units.
// SIMD0..SIMD3 form the contiguous range tested by ComputeUnit::isVecAlu();
// GLBMEM_PIPE and LDSMEM_PIPE are the ids returned by GlbMemUnitId() and
// ShrMemUnitId(). NUM_UNITS is the number of enumerators before it.
enum EXEC_UNIT
{
    SIMD0 = 0,
    SIMD1,
    SIMD2,
    SIMD3,
    GLBMEM_PIPE,
    LDSMEM_PIPE,
    NUM_UNITS
};
849809Sumesh.b2006@gmail.com
// The four combinations of TLB hit/miss and cache hit/miss.
// NOTE(review): where these values are consumed is not visible in this
// header.
enum TLB_CACHE
{
    TLB_MISS_CACHE_MISS = 0,
    TLB_MISS_CACHE_HIT,
    TLB_HIT_CACHE_MISS,
    TLB_HIT_CACHE_HIT
};
924776Sgblack@eecs.umich.edu
935784Sgblack@eecs.umich.educlass ComputeUnit : public MemObject
947720Sgblack@eecs.umich.edu{
955784Sgblack@eecs.umich.edu  public:
965784Sgblack@eecs.umich.edu    FetchStage fetchStage;
975784Sgblack@eecs.umich.edu    ScoreboardCheckStage scoreboardCheckStage;
985784Sgblack@eecs.umich.edu    ScheduleStage scheduleStage;
995784Sgblack@eecs.umich.edu    ExecStage execStage;
1005784Sgblack@eecs.umich.edu    GlobalMemPipeline globalMemoryPipe;
1014776Sgblack@eecs.umich.edu    LocalMemPipeline localMemoryPipe;
1024776Sgblack@eecs.umich.edu
1034776Sgblack@eecs.umich.edu    // Buffers used to communicate between various pipeline stages
1044776Sgblack@eecs.umich.edu
1054776Sgblack@eecs.umich.edu    // List of waves which are ready to be scheduled.
1067349SAli.Saidi@ARM.com    // Each execution resource has a ready list. readyList is
1074776Sgblack@eecs.umich.edu    // used to communicate between scoreboardCheck stage and
1085784Sgblack@eecs.umich.edu    // schedule stage
1095784Sgblack@eecs.umich.edu    // TODO: make enum to index readyList
1105784Sgblack@eecs.umich.edu    std::vector<std::vector<Wavefront*>> readyList;
1118232Snate@binkert.org
1125784Sgblack@eecs.umich.edu    // Stores the status of waves. A READY implies the
1135784Sgblack@eecs.umich.edu    // wave is ready to be scheduled this cycle and
1145784Sgblack@eecs.umich.edu    // is already present in the readyList. waveStatusList is
11510231Ssteve.reinhardt@amd.com    // used to communicate between scoreboardCheck stage and
1167600Sminkyu.jeong@arm.com    // schedule stage
1177600Sminkyu.jeong@arm.com    // TODO: convert std::pair to a class to increase readability
1187600Sminkyu.jeong@arm.com    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
1198232Snate@binkert.org
1205784Sgblack@eecs.umich.edu    // List of waves which will be dispatched to
1215784Sgblack@eecs.umich.edu    // each execution resource. A FILLED implies
1225784Sgblack@eecs.umich.edu    // dispatch list is non-empty and
12310665SAli.Saidi@ARM.com    // execution unit has something to execute
1245784Sgblack@eecs.umich.edu    // this cycle. Currently, the dispatch list of
1255784Sgblack@eecs.umich.edu    // an execution resource can hold only one wave because
1268232Snate@binkert.org    // an execution resource can execute only one wave in a cycle.
1275784Sgblack@eecs.umich.edu    // dispatchList is used to communicate between schedule
1285784Sgblack@eecs.umich.edu    // and exec stage
1298232Snate@binkert.org    // TODO: convert std::pair to a class to increase readability
1305784Sgblack@eecs.umich.edu    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
13110383Smitch.hayenga@arm.com
13210383Smitch.hayenga@arm.com    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
13310383Smitch.hayenga@arm.com    int rrNextALUWp;
13410383Smitch.hayenga@arm.com    typedef ComputeUnitParams Params;
13510383Smitch.hayenga@arm.com    std::vector<std::vector<Wavefront*>> wfList;
13610383Smitch.hayenga@arm.com    int cu_id;
1374776Sgblack@eecs.umich.edu
1384776Sgblack@eecs.umich.edu    // array of vector register files, one per SIMD
1394776Sgblack@eecs.umich.edu    std::vector<VectorRegisterFile*> vrf;
1404776Sgblack@eecs.umich.edu    // Number of vector ALU units (SIMDs) in CU
1414776Sgblack@eecs.umich.edu    int numSIMDs;
1424776Sgblack@eecs.umich.edu    // number of pipe stages for bypassing data to next dependent single
1433506Ssaidi@eecs.umich.edu    // precision vector instruction inside the vector ALU pipeline
1443506Ssaidi@eecs.umich.edu    int spBypassPipeLength;
1455784Sgblack@eecs.umich.edu    // number of pipe stages for bypassing data to next dependent double
1465784Sgblack@eecs.umich.edu    // precision vector instruction inside the vector ALU pipeline
1475784Sgblack@eecs.umich.edu    int dpBypassPipeLength;
1485784Sgblack@eecs.umich.edu    // number of cycles per issue period
1495784Sgblack@eecs.umich.edu    int issuePeriod;
1505784Sgblack@eecs.umich.edu
1515784Sgblack@eecs.umich.edu    // Number of global and local memory execution resources in CU
1525784Sgblack@eecs.umich.edu    int numGlbMemUnits;
1535784Sgblack@eecs.umich.edu    int numLocMemUnits;
1545784Sgblack@eecs.umich.edu    // tracks the last cycle a vector instruction was executed on a SIMD
1555784Sgblack@eecs.umich.edu    std::vector<uint64_t> lastExecCycle;
1568232Snate@binkert.org
1578232Snate@binkert.org    // true if we allow a separate TLB per lane
1588232Snate@binkert.org    bool perLaneTLB;
1598232Snate@binkert.org    // if 0, TLB prefetching is off.
1605791Srstrong@cs.ucsd.edu    int prefetchDepth;
1615784Sgblack@eecs.umich.edu    // if fixed-stride prefetching, this is the stride.
1625784Sgblack@eecs.umich.edu    int prefetchStride;
1638232Snate@binkert.org
1645784Sgblack@eecs.umich.edu    std::vector<Addr> lastVaddrCU;
1655784Sgblack@eecs.umich.edu    std::vector<std::vector<Addr>> lastVaddrSimd;
1665784Sgblack@eecs.umich.edu    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
1675784Sgblack@eecs.umich.edu    Enums::PrefetchType prefetchType;
1687811Ssteve.reinhardt@amd.com    EXEC_POLICY exec_policy;
1694776Sgblack@eecs.umich.edu
1702SN/A    bool xact_cas_mode;
1712SN/A    bool debugSegFault;
1724776Sgblack@eecs.umich.edu    bool functionalTLB;
1732SN/A    bool localMemBarrier;
1744776Sgblack@eecs.umich.edu
1754776Sgblack@eecs.umich.edu    /*
1763748Sgblack@eecs.umich.edu     * for Counting page accesses
1775034Smilesck@eecs.umich.edu     *
1788902Sandreas.hansson@arm.com     * cuExitCallback inherits from Callback. When you register a callback
179     * function as an exit callback, it will get added to an exit callback
180     * queue, such that on simulation exit, all callbacks in the callback
181     * queue will have their process() function called.
182     */
183    bool countPages;
184
185    Shader *shader;
186    uint32_t barrier_id;
187    // vector of Vector ALU (MACC) pipelines
188    std::vector<WaitClass> aluPipe;
189    // minimum issue period per SIMD unit (in cycles)
190    std::vector<WaitClass> wfWait;
191
192    // Resource control for Vector Register File->Global Memory pipe buses
193    std::vector<WaitClass> vrfToGlobalMemPipeBus;
194    // Resource control for Vector Register File->Local Memory pipe buses
195    std::vector<WaitClass> vrfToLocalMemPipeBus;
196    int nextGlbMemBus;
197    int nextLocMemBus;
198    // Resource control for global memory to VRF data/address bus
199    WaitClass glbMemToVrfBus;
200    // Resource control for local memory to VRF data/address bus
201    WaitClass locMemToVrfBus;
202
203    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
204    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
205    uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store
206    uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load
207
208    Tick req_tick_latency;
209    Tick resp_tick_latency;
210
211    // number of vector registers being reserved for each SIMD unit
212    std::vector<int> vectorRegsReserved;
213    // number of vector registers per SIMD unit
214    uint32_t numVecRegsPerSimd;
215    // Support for scheduling VGPR status update events
216    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
217    std::vector<uint64_t> timestampVec;
218    std::vector<uint8_t>  statusVec;
219
220    void
221    registerEvent(uint32_t simdId,
222                  uint32_t regIdx,
223                  uint32_t operandSize,
224                  uint64_t when,
225                  uint8_t newStatus) {
226        regIdxVec.push_back(std::make_pair(simdId, regIdx));
227        timestampVec.push_back(when);
228        statusVec.push_back(newStatus);
229        if (operandSize > 4) {
230            regIdxVec.push_back(std::make_pair(simdId,
231                                               ((regIdx + 1) %
232                                                numVecRegsPerSimd)));
233            timestampVec.push_back(when);
234            statusVec.push_back(newStatus);
235        }
236    }
237
238    void updateEvents();
239
240    // this hash map will keep track of page divergence
241    // per memory instruction per wavefront. The hash map
242    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
243    std::map<Addr, int> pagesTouched;
244
245    ComputeUnit(const Params *p);
246    ~ComputeUnit();
247    int spBypassLength() { return spBypassPipeLength; };
248    int dpBypassLength() { return dpBypassPipeLength; };
249    int storeBusLength() { return numCyclesPerStoreTransfer; };
250    int loadBusLength() { return numCyclesPerLoadTransfer; };
251    int wfSize() const { return wavefrontSize; };
252
253    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
254    void exec();
255    void initiateFetch(Wavefront *wavefront);
256    void fetch(PacketPtr pkt, Wavefront *wavefront);
257    void fillKernelState(Wavefront *w, NDRange *ndr);
258
259    void startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
260                        NDRange *ndr);
261
262    void StartWorkgroup(NDRange *ndr);
263    int ReadyWorkgroup(NDRange *ndr);
264
265    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
266    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
267    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
268    int GlbMemUnitId() { return GLBMEM_PIPE; }
269    int ShrMemUnitId() { return LDSMEM_PIPE; }
270    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
271    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
272    /* This function cycles through all the wavefronts in all the phases to see
273     * if all of the wavefronts which should be associated with one barrier
274     * (denoted with _barrier_id), are all at the same barrier in the program
275     * (denoted by bcnt). When the number at the barrier matches bslots, then
276     * return true.
277     */
278    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
279    bool cedeSIMD(int simdId, int wfSlotId);
280
281    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
282    virtual void init();
283    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
284    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
285    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
286                              bool kernelLaunch=true,
287                              RequestPtr req=nullptr);
288    void handleMemPacket(PacketPtr pkt, int memport_index);
289    bool processTimingPacket(PacketPtr pkt);
290    void processFetchReturn(PacketPtr pkt);
291    void updatePageDivergenceDist(Addr addr);
292
293    MasterID masterId() { return _masterId; }
294
295    bool isDone() const;
296    bool isSimdDone(uint32_t) const;
297
298  protected:
299    MasterID _masterId;
300
301    LdsState &lds;
302
303  public:
304    Stats::Scalar vALUInsts;
305    Stats::Formula vALUInstsPerWF;
306    Stats::Scalar sALUInsts;
307    Stats::Formula sALUInstsPerWF;
308    Stats::Scalar instCyclesVALU;
309    Stats::Scalar instCyclesSALU;
310    Stats::Scalar threadCyclesVALU;
311    Stats::Formula vALUUtilization;
312    Stats::Scalar ldsNoFlatInsts;
313    Stats::Formula ldsNoFlatInstsPerWF;
314    Stats::Scalar flatVMemInsts;
315    Stats::Formula flatVMemInstsPerWF;
316    Stats::Scalar flatLDSInsts;
317    Stats::Formula flatLDSInstsPerWF;
318    Stats::Scalar vectorMemWrites;
319    Stats::Formula vectorMemWritesPerWF;
320    Stats::Scalar vectorMemReads;
321    Stats::Formula vectorMemReadsPerWF;
322    Stats::Scalar scalarMemWrites;
323    Stats::Formula scalarMemWritesPerWF;
324    Stats::Scalar scalarMemReads;
325    Stats::Formula scalarMemReadsPerWF;
326
327    void updateInstStats(GPUDynInstPtr gpuDynInst);
328
    // the following stats compute the avg. TLB access latency per
    // uncoalesced request (only for data)
331    Stats::Scalar tlbRequests;
332    Stats::Scalar tlbCycles;
333    Stats::Formula tlbLatency;
334    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
335    Stats::Vector hitsPerTLBLevel;
336
337    Stats::Scalar ldsBankAccesses;
338    Stats::Distribution ldsBankConflictDist;
339
340    // over all memory instructions executed over all wavefronts
341    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
342    Stats::Distribution pageDivergenceDist;
343    Stats::Scalar dynamicGMemInstrCnt;
344    Stats::Scalar dynamicLMemInstrCnt;
345
346    Stats::Scalar wgBlockedDueLdsAllocation;
347    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
348    // when the instruction is committed, this number is still incremented by 1
349    Stats::Scalar numInstrExecuted;
350    // Number of cycles among successive instruction executions across all
351    // wavefronts of the same CU
352    Stats::Distribution execRateDist;
353    // number of individual vector operations executed
354    Stats::Scalar numVecOpsExecuted;
355    // Total cycles that something is running on the GPU
356    Stats::Scalar totalCycles;
357    Stats::Formula vpc; // vector ops per cycle
358    Stats::Formula ipc; // vector instructions per cycle
359    Stats::Distribution controlFlowDivergenceDist;
360    Stats::Distribution activeLanesPerGMemInstrDist;
361    Stats::Distribution activeLanesPerLMemInstrDist;
362    // number of vector ALU instructions received
363    Stats::Formula numALUInstsExecuted;
364    // number of times a WG can not start due to lack of free VGPRs in SIMDs
365    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
366    Stats::Scalar numCASOps;
367    Stats::Scalar numFailedCASOps;
368    Stats::Scalar completedWfs;
369    // flag per vector SIMD unit that is set when there is at least one
370    // WV that has a vector ALU instruction as the oldest in its
371    // Instruction Buffer: Defined in the Scoreboard stage, consumed
372    // by the Execute stage.
373    std::vector<bool> vectorAluInstAvail;
374    // number of available (oldest) LDS instructions that could have
375    // been issued to the LDS at a specific issue slot
376    int shrMemInstAvail;
377    // number of available Global memory instructions that could have
378    // been issued to TCP at a specific issue slot
379    int glbMemInstAvail;
380
381    void
382    regStats();
383
384    LdsState &
385    getLds() const
386    {
387        return lds;
388    }
389
390    int32_t
391    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
392
393    bool
394    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
395
396    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
397    pageDataStruct pageAccesses;
398
399    class CUExitCallback : public Callback
400    {
401      private:
402        ComputeUnit *computeUnit;
403
404      public:
405        virtual ~CUExitCallback() { }
406
407        CUExitCallback(ComputeUnit *_cu)
408        {
409            computeUnit = _cu;
410        }
411
412        virtual void
413        process();
414    };
415
416    CUExitCallback *cuExitCallback;
417
418    /** Data access Port **/
419    class DataPort : public MasterPort
420    {
421      public:
422        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
423            : MasterPort(_name, _cu), computeUnit(_cu),
424              index(_index) { }
425
426        bool snoopRangeSent;
427
428        struct SenderState : public Packet::SenderState
429        {
430            GPUDynInstPtr _gpuDynInst;
431            int port_index;
432            Packet::SenderState *saved;
433
434            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
435                        Packet::SenderState *sender_state=nullptr)
436                : _gpuDynInst(gpuDynInst),
437                  port_index(_port_index),
438                  saved(sender_state) { }
439        };
440
441        class MemReqEvent : public Event
442        {
443          private:
444            DataPort *dataPort;
445            PacketPtr pkt;
446
447          public:
448            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
449                : Event(), dataPort(_data_port), pkt(_pkt)
450            {
451              setFlags(Event::AutoDelete);
452            }
453
454            void process();
455            const char *description() const;
456        };
457
458        class MemRespEvent : public Event
459        {
460          private:
461            DataPort *dataPort;
462            PacketPtr pkt;
463
464          public:
465            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
466                : Event(), dataPort(_data_port), pkt(_pkt)
467            {
468              setFlags(Event::AutoDelete);
469            }
470
471            void process();
472            const char *description() const;
473        };
474
475        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
476
477      protected:
478        ComputeUnit *computeUnit;
479        int index;
480
481        virtual bool recvTimingResp(PacketPtr pkt);
482        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
483        virtual void recvFunctional(PacketPtr pkt) { }
484        virtual void recvRangeChange() { }
485        virtual void recvReqRetry();
486
487        virtual void
488        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
489        {
490            resp.clear();
491            snoop = true;
492        }
493
494    };
495
496    // Instruction cache access port
497    class SQCPort : public MasterPort
498    {
499      public:
500        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
501            : MasterPort(_name, _cu), computeUnit(_cu),
502              index(_index) { }
503
504        bool snoopRangeSent;
505
506        struct SenderState : public Packet::SenderState
507        {
508            Wavefront *wavefront;
509            Packet::SenderState *saved;
510
511            SenderState(Wavefront *_wavefront, Packet::SenderState
512                    *sender_state=nullptr)
513                : wavefront(_wavefront), saved(sender_state) { }
514        };
515
516        std::deque<std::pair<PacketPtr, Wavefront*>> retries;
517
518      protected:
519        ComputeUnit *computeUnit;
520        int index;
521
522        virtual bool recvTimingResp(PacketPtr pkt);
523        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
524        virtual void recvFunctional(PacketPtr pkt) { }
525        virtual void recvRangeChange() { }
526        virtual void recvReqRetry();
527
528        virtual void
529        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
530        {
531            resp.clear();
532            snoop = true;
533        }
534     };
535
536    /** Data TLB port **/
537    class DTLBPort : public MasterPort
538    {
539      public:
540        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
541            : MasterPort(_name, _cu), computeUnit(_cu),
542              index(_index), stalled(false)
543        { }
544
545        bool isStalled() { return stalled; }
546        void stallPort() { stalled = true; }
547        void unstallPort() { stalled = false; }
548
549        /**
550         * here we queue all the translation requests that were
551         * not successfully sent.
552         */
553        std::deque<PacketPtr> retries;
554
555        /** SenderState is information carried along with the packet
556         * throughout the TLB hierarchy
557         */
558        struct SenderState: public Packet::SenderState
559        {
560            // the memInst that this is associated with
561            GPUDynInstPtr _gpuDynInst;
562
563            // the lane in the memInst this is associated with, so we send
564            // the memory request down the right port
565            int portIndex;
566
567            // constructor used for packets involved in timing accesses
568            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
569                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
570
571        };
572
573      protected:
574        ComputeUnit *computeUnit;
575        int index;
576        bool stalled;
577
578        virtual bool recvTimingResp(PacketPtr pkt);
579        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
580        virtual void recvFunctional(PacketPtr pkt) { }
581        virtual void recvRangeChange() { }
582        virtual void recvReqRetry();
583    };
584
585    class ITLBPort : public MasterPort
586    {
587      public:
588        ITLBPort(const std::string &_name, ComputeUnit *_cu)
589            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
590
591
592        bool isStalled() { return stalled; }
593        void stallPort() { stalled = true; }
594        void unstallPort() { stalled = false; }
595
596        /**
597         * here we queue all the translation requests that were
598         * not successfully sent.
599         */
600        std::deque<PacketPtr> retries;
601
602        /** SenderState is information carried along with the packet
603         * throughout the TLB hierarchy
604         */
605        struct SenderState: public Packet::SenderState
606        {
607            // The wavefront associated with this request
608            Wavefront *wavefront;
609
610            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
611        };
612
613      protected:
614        ComputeUnit *computeUnit;
615        bool stalled;
616
617        virtual bool recvTimingResp(PacketPtr pkt);
618        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
619        virtual void recvFunctional(PacketPtr pkt) { }
620        virtual void recvRangeChange() { }
621        virtual void recvReqRetry();
622    };
623
624    /**
625     * the port intended to communicate between the CU and its LDS
626     */
627    class LDSPort : public MasterPort
628    {
629      public:
630        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
631        : MasterPort(_name, _cu, _id), computeUnit(_cu)
632        {
633        }
634
635        bool isStalled() const { return stalled; }
636        void stallPort() { stalled = true; }
637        void unstallPort() { stalled = false; }
638
639        /**
640         * here we queue all the requests that were
641         * not successfully sent.
642         */
643        std::queue<PacketPtr> retries;
644
645        /**
646         *  SenderState is information carried along with the packet, esp. the
647         *  GPUDynInstPtr
648         */
649        class SenderState: public Packet::SenderState
650        {
651          protected:
652            // The actual read/write/atomic request that goes with this command
653            GPUDynInstPtr _gpuDynInst = nullptr;
654
655          public:
656            SenderState(GPUDynInstPtr gpuDynInst):
657              _gpuDynInst(gpuDynInst)
658            {
659            }
660
661            GPUDynInstPtr
662            getMemInst() const
663            {
664              return _gpuDynInst;
665            }
666        };
667
668        virtual bool
669        sendTimingReq(PacketPtr pkt);
670
671      protected:
672
673        bool stalled = false; ///< whether or not it is stalled
674
675        ComputeUnit *computeUnit;
676
677        virtual bool
678        recvTimingResp(PacketPtr pkt);
679
680        virtual Tick
681        recvAtomic(PacketPtr pkt) { return 0; }
682
683        virtual void
684        recvFunctional(PacketPtr pkt)
685        {
686        }
687
688        virtual void
689        recvRangeChange()
690        {
691        }
692
693        virtual void
694        recvReqRetry();
695    };
696
697    /** The port to access the Local Data Store
698     *  Can be connected to a LDS object
699     */
700    LDSPort *ldsPort = nullptr;
701
702    LDSPort *
703    getLdsPort() const
704    {
705        return ldsPort;
706    }
707
708    /** The memory port for SIMD data accesses.
709     *  Can be connected to PhysMem for Ruby for timing simulations
710     */
711    std::vector<DataPort*> memPort;
712    // port to the TLB hierarchy (i.e., the L1 TLB)
713    std::vector<DTLBPort*> tlbPort;
714    // port to the SQC (i.e. the I-cache)
715    SQCPort *sqcPort;
716    // port to the SQC TLB (there's a separate TLB for each I-cache)
717    ITLBPort *sqcTLBPort;
718
719    virtual BaseMasterPort&
720    getMasterPort(const std::string &if_name, PortID idx)
721    {
722        if (if_name == "memory_port") {
723            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
724                                        this, idx);
725            return *memPort[idx];
726        } else if (if_name == "translation_port") {
727            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
728                                        this, idx);
729            return *tlbPort[idx];
730        } else if (if_name == "sqc_port") {
731            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
732                                  this, idx);
733            return *sqcPort;
734        } else if (if_name == "sqc_tlb_port") {
735            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
736            return *sqcTLBPort;
737        } else if (if_name == "ldsPort") {
738            if (ldsPort) {
739                fatal("an LDS port was already allocated");
740            }
741            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
742            return *ldsPort;
743        } else {
744            panic("incorrect port name");
745        }
746    }
747
748    // xact_cas_load()
749    class waveIdentifier
750    {
751      public:
752        waveIdentifier() { }
753        waveIdentifier(int _simdId, int _wfSlotId)
754          : simdId(_simdId), wfSlotId(_wfSlotId) { }
755
756        int simdId;
757        int wfSlotId;
758    };
759
760    class waveQueue
761    {
762      public:
763        std::list<waveIdentifier> waveIDQueue;
764    };
765    std::map<unsigned, waveQueue> xactCasLoadMap;
766
767    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
768
769  private:
770    uint64_t globalSeqNum;
771    int wavefrontSize;
772    GPUStaticInst *kernelLaunchInst;
773};
774
775#endif // __COMPUTE_UNIT_HH__
776