// GPUCoalescer.hh revision 12697
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Sooraj Puthoor
 */
359243SN/A
#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
389243SN/A
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

#include "base/statistics.hh"
#include "mem/protocol/HSAScope.hh"
#include "mem/protocol/HSASegment.hh"
#include "mem/protocol/PrefetchBit.hh"
#include "mem/protocol/RubyAccessMode.hh"
#include "mem/protocol/RubyRequestType.hh"
#include "mem/protocol/SequencerRequestType.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/system/Sequencer.hh"
539488SN/A
// Forward declarations; the full definitions are not provided by this header.
class DataBlock;
class CacheMsg;
class MachineID;
class CacheMemory;

class RubyGPUCoalescerParams;
609243SN/A
6110146Sandreas.hansson@arm.comHSAScope reqScopeToHSAScope(Request* req);
629243SN/AHSASegment reqSegmentToHSASegment(Request* req);
639243SN/A
649243SN/Astruct GPUCoalescerRequest
6510287Sandreas.hansson@arm.com{
6610287Sandreas.hansson@arm.com    PacketPtr pkt;
6710287Sandreas.hansson@arm.com    RubyRequestType m_type;
6810287Sandreas.hansson@arm.com    Cycles issue_time;
6910287Sandreas.hansson@arm.com
709243SN/A    GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
7110287Sandreas.hansson@arm.com                        Cycles _issue_time)
7210287Sandreas.hansson@arm.com        : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
7310287Sandreas.hansson@arm.com    {}
7410287Sandreas.hansson@arm.com};
7510287Sandreas.hansson@arm.com
7610287Sandreas.hansson@arm.comclass RequestDesc
7710287Sandreas.hansson@arm.com{
7810287Sandreas.hansson@arm.com  public:
7910287Sandreas.hansson@arm.com    RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type)
8010287Sandreas.hansson@arm.com        : pkt(pkt), primaryType(p_type), secondaryType(s_type)
8110287Sandreas.hansson@arm.com    {
8210287Sandreas.hansson@arm.com    }
8310287Sandreas.hansson@arm.com
849243SN/A    RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL),
8510146Sandreas.hansson@arm.com        secondaryType(RubyRequestType_NULL)
869243SN/A    {
879243SN/A    }
889243SN/A
899243SN/A    PacketPtr pkt;
909243SN/A    RubyRequestType primaryType;
919243SN/A    RubyRequestType secondaryType;
929243SN/A};
939243SN/A
949243SN/Astd::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
959243SN/A
9610146Sandreas.hansson@arm.comclass GPUCoalescer : public RubyPort
979243SN/A{
989243SN/A  public:
999243SN/A    typedef RubyGPUCoalescerParams Params;
10010146Sandreas.hansson@arm.com    GPUCoalescer(const Params *);
1019243SN/A    ~GPUCoalescer();
1029243SN/A
1039243SN/A    // Public Methods
1049243SN/A    void wakeup(); // Used only for deadlock detection
1059243SN/A
1069243SN/A    void printProgress(std::ostream& out) const;
1079243SN/A    void resetStats();
1089243SN/A    void collateStats();
1099243SN/A    void regStats();
1109243SN/A
1119243SN/A    void writeCallback(Addr address, DataBlock& data);
1129243SN/A
1139243SN/A    void writeCallback(Addr address,
1149243SN/A                       MachineType mach,
1159243SN/A                       DataBlock& data);
1169243SN/A
1179243SN/A    void writeCallback(Addr address,
1189243SN/A                       MachineType mach,
1199243SN/A                       DataBlock& data,
1209243SN/A                       Cycles initialRequestTime,
1219243SN/A                       Cycles forwardRequestTime,
1229243SN/A                       Cycles firstResponseTime,
1239243SN/A                       bool isRegion);
1249243SN/A
1259243SN/A    void writeCallback(Addr address,
1269243SN/A                       MachineType mach,
12710206Sandreas.hansson@arm.com                       DataBlock& data,
12810206Sandreas.hansson@arm.com                       Cycles initialRequestTime,
1299243SN/A                       Cycles forwardRequestTime,
13010206Sandreas.hansson@arm.com                       Cycles firstResponseTime);
13110206Sandreas.hansson@arm.com
13210206Sandreas.hansson@arm.com    void readCallback(Addr address, DataBlock& data);
13310206Sandreas.hansson@arm.com
13410206Sandreas.hansson@arm.com    void readCallback(Addr address,
13510206Sandreas.hansson@arm.com                      MachineType mach,
13610206Sandreas.hansson@arm.com                      DataBlock& data);
13710206Sandreas.hansson@arm.com
1389243SN/A    void readCallback(Addr address,
1399488SN/A                      MachineType mach,
1409969SN/A                      DataBlock& data,
1419488SN/A                      Cycles initialRequestTime,
1429243SN/A                      Cycles forwardRequestTime,
14310210Sandreas.hansson@arm.com                      Cycles firstResponseTime);
14410210Sandreas.hansson@arm.com
14510211Sandreas.hansson@arm.com    void readCallback(Addr address,
14610211Sandreas.hansson@arm.com                      MachineType mach,
14710210Sandreas.hansson@arm.com                      DataBlock& data,
14810210Sandreas.hansson@arm.com                      Cycles initialRequestTime,
14910210Sandreas.hansson@arm.com                      Cycles forwardRequestTime,
1509243SN/A                      Cycles firstResponseTime,
1519243SN/A                      bool isRegion);
1529243SN/A    /* atomics need their own callback because the data
1539243SN/A       might be const coming from SLICC */
1549243SN/A    void atomicCallback(Addr address,
1559243SN/A                        MachineType mach,
15610207Sandreas.hansson@arm.com                        const DataBlock& data);
1579243SN/A
1589243SN/A    void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
15910246Sandreas.hansson@arm.com    void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
16010246Sandreas.hansson@arm.com
16110394Swendy.elsasser@arm.com    // Alternate implementations in VIPER Coalescer
1629243SN/A    virtual RequestStatus makeRequest(PacketPtr pkt);
16310211Sandreas.hansson@arm.com
16410210Sandreas.hansson@arm.com    int outstandingCount() const { return m_outstanding_count; }
1659969SN/A
1669243SN/A    bool
16710141SN/A    isDeadlockEventScheduled() const
1689727SN/A    {
1699727SN/A        return deadlockCheckEvent.scheduled();
1709727SN/A    }
17110394Swendy.elsasser@arm.com
17210246Sandreas.hansson@arm.com    void
17310141SN/A    descheduleDeadlockEvent()
1749243SN/A    {
1759243SN/A        deschedule(deadlockCheckEvent);
1769243SN/A    }
1779243SN/A
1789831SN/A    bool empty() const;
1799831SN/A
1809831SN/A    void print(std::ostream& out) const;
1819831SN/A    void checkCoherence(Addr address);
1829831SN/A
1839831SN/A    void markRemoved();
1849831SN/A    void removeRequest(GPUCoalescerRequest* request);
1859831SN/A    void evictionCallback(Addr address);
1869831SN/A    void completeIssue();
1879831SN/A
1889831SN/A    void insertKernel(int wavefront_id, PacketPtr pkt);
1899831SN/A
1909831SN/A    void recordRequestType(SequencerRequestType requestType);
1919831SN/A    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
1929831SN/A
1939831SN/A    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
1949831SN/A    Stats::Histogram& getTypeLatencyHist(uint32_t t)
1959831SN/A    { return *m_typeLatencyHist[t]; }
1969831SN/A
1979831SN/A    Stats::Histogram& getMissLatencyHist()
1989831SN/A    { return m_missLatencyHist; }
1999243SN/A    Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
2009243SN/A    { return *m_missTypeLatencyHist[t]; }
2019243SN/A
2029243SN/A    Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
2039243SN/A    { return *m_missMachLatencyHist[t]; }
2049243SN/A
2059243SN/A    Stats::Histogram&
2069243SN/A    getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
2079243SN/A    { return *m_missTypeMachLatencyHist[r][t]; }
2089243SN/A
2099243SN/A    Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
2109243SN/A    { return *m_IssueToInitialDelayHist[t]; }
2119243SN/A
2129243SN/A    Stats::Histogram&
2139243SN/A    getInitialToForwardDelayHist(const MachineType t) const
2149243SN/A    { return *m_InitialToForwardDelayHist[t]; }
2159966SN/A
2169966SN/A    Stats::Histogram&
2179243SN/A    getForwardRequestToFirstResponseHist(const MachineType t) const
2189243SN/A    { return *m_ForwardToFirstResponseDelayHist[t]; }
2199967SN/A
22010245Sandreas.hansson@arm.com    Stats::Histogram&
2219831SN/A    getFirstResponseToCompletionDelayHist(const MachineType t) const
2229831SN/A    { return *m_FirstResponseToCompletionDelayHist[t]; }
2239967SN/A
2249967SN/A  // Changed to protected to enable inheritance by VIPER Coalescer
2259967SN/A  protected:
2269967SN/A    bool tryCacheAccess(Addr addr, RubyRequestType type,
2279967SN/A                        Addr pc, RubyAccessMode access_mode,
2289967SN/A                        int size, DataBlock*& data_ptr);
2299967SN/A    // Alternate implementations in VIPER Coalescer
2309831SN/A    virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
2319831SN/A
2329831SN/A    void kernelCallback(int wavfront_id);
2339831SN/A
2349831SN/A    void hitCallback(GPUCoalescerRequest* request,
2359832SN/A                     MachineType mach,
2369831SN/A                     DataBlock& data,
2379831SN/A                     bool success,
2389831SN/A                     Cycles initialRequestTime,
2399831SN/A                     Cycles forwardRequestTime,
2409831SN/A                     Cycles firstResponseTime,
2419832SN/A                     bool isRegion);
2429831SN/A    void recordMissLatency(GPUCoalescerRequest* request,
2439831SN/A                           MachineType mach,
2449831SN/A                           Cycles initialRequestTime,
2459831SN/A                           Cycles forwardRequestTime,
2469831SN/A                           Cycles firstResponseTime,
2479831SN/A                           bool success, bool isRegion);
2489967SN/A    void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
2499243SN/A    PacketPtr mapAddrToPkt(Addr address);
2509967SN/A
25110245Sandreas.hansson@arm.com
2529967SN/A    RequestStatus getRequestStatus(PacketPtr pkt,
2539243SN/A                                   RubyRequestType request_type);
2549967SN/A    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
2559967SN/A
2569967SN/A    bool handleLlsc(Addr address, GPUCoalescerRequest* request);
2579243SN/A
2589243SN/A    EventFunctionWrapper issueEvent;
2599243SN/A
2609243SN/A
2619243SN/A  // Changed to protected to enable inheritance by VIPER Coalescer
2629243SN/A  protected:
26310206Sandreas.hansson@arm.com    int m_max_outstanding_requests;
26410206Sandreas.hansson@arm.com    int m_deadlock_threshold;
2659243SN/A
2669243SN/A    CacheMemory* m_dataCache_ptr;
26710208Sandreas.hansson@arm.com    CacheMemory* m_instCache_ptr;
26810208Sandreas.hansson@arm.com
26910208Sandreas.hansson@arm.com    // The cache access latency for this GPU data cache. This is assessed at the
2709243SN/A    // beginning of each access. This should be very similar to the
27110146Sandreas.hansson@arm.com    // implementation in Sequencer() as this is very much like a Sequencer
2729243SN/A    Cycles m_data_cache_hit_latency;
27310208Sandreas.hansson@arm.com
27410208Sandreas.hansson@arm.com    // We need to track both the primary and secondary request types.
27510208Sandreas.hansson@arm.com    // The secondary request type comprises a subset of RubyRequestTypes that
27610208Sandreas.hansson@arm.com    // are understood by the L1 Controller. A primary request type can be any
27710208Sandreas.hansson@arm.com    // RubyRequestType.
27810208Sandreas.hansson@arm.com    typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable;
2799243SN/A    CoalescingTable reqCoalescer;
28010146Sandreas.hansson@arm.com    std::vector<Addr> newRequests;
2819243SN/A
28210208Sandreas.hansson@arm.com    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
28310208Sandreas.hansson@arm.com    RequestTable m_writeRequestTable;
2849243SN/A    RequestTable m_readRequestTable;
2859243SN/A    // Global outstanding request count, across all request tables
2869243SN/A    int m_outstanding_count;
2879243SN/A    bool m_deadlock_check_scheduled;
2889831SN/A    std::unordered_map<int, PacketPtr> kernelEndList;
2899243SN/A    std::vector<int> newKernelEnds;
2909243SN/A
2919831SN/A    int m_store_waiting_on_load_cycles;
2929243SN/A    int m_store_waiting_on_store_cycles;
2939243SN/A    int m_load_waiting_on_store_cycles;
2949243SN/A    int m_load_waiting_on_load_cycles;
2959243SN/A
2969831SN/A    bool m_runningGarnetStandalone;
2979243SN/A
2989243SN/A    EventFunctionWrapper deadlockCheckEvent;
2999831SN/A    bool assumingRfOCoherence;
3009243SN/A
3019243SN/A    // m5 style stats for TCP hit/miss counts
3029243SN/A    Stats::Scalar GPU_TCPLdHits;
3039243SN/A    Stats::Scalar GPU_TCPLdTransfers;
3049831SN/A    Stats::Scalar GPU_TCCLdHits;
3059831SN/A    Stats::Scalar GPU_LdMiss;
3069831SN/A
3079243SN/A    Stats::Scalar GPU_TCPStHits;
3089243SN/A    Stats::Scalar GPU_TCPStTransfers;
3099243SN/A    Stats::Scalar GPU_TCCStHits;
3109243SN/A    Stats::Scalar GPU_StMiss;
3119831SN/A
3129831SN/A    Stats::Scalar CP_TCPLdHits;
3139831SN/A    Stats::Scalar CP_TCPLdTransfers;
3149243SN/A    Stats::Scalar CP_TCCLdHits;
3159831SN/A    Stats::Scalar CP_LdMiss;
3169243SN/A
3179243SN/A    Stats::Scalar CP_TCPStHits;
3189243SN/A    Stats::Scalar CP_TCPStTransfers;
3199243SN/A    Stats::Scalar CP_TCCStHits;
3209243SN/A    Stats::Scalar CP_StMiss;
3219243SN/A
3229243SN/A    //! Histogram for number of outstanding requests per cycle.
3239243SN/A    Stats::Histogram m_outstandReqHist;
3249831SN/A
3259831SN/A    //! Histogram for holding latency profile of all requests.
3269831SN/A    Stats::Histogram m_latencyHist;
3279243SN/A    std::vector<Stats::Histogram *> m_typeLatencyHist;
3289831SN/A
3299243SN/A    //! Histogram for holding latency profile of all requests that
3309243SN/A    //! miss in the controller connected to this sequencer.
3319243SN/A    Stats::Histogram m_missLatencyHist;
3329243SN/A    std::vector<Stats::Histogram *> m_missTypeLatencyHist;
3339243SN/A
3349243SN/A    //! Histograms for profiling the latencies for requests that
3359243SN/A    //! required external messages.
3369243SN/A    std::vector<Stats::Histogram *> m_missMachLatencyHist;
3379243SN/A    std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
3389243SN/A
3399243SN/A    //! Histograms for recording the breakdown of miss latency
3409243SN/A    std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
3419243SN/A    std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
3429243SN/A    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
3439243SN/A    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
3449243SN/A
3459243SN/Aprivate:
3469243SN/A    // Private copy constructor and assignment operator
3479243SN/A    GPUCoalescer(const GPUCoalescer& obj);
3489243SN/A    GPUCoalescer& operator=(const GPUCoalescer& obj);
3499726SN/A};
3509243SN/A
3519726SN/Ainline std::ostream&
3529243SN/Aoperator<<(std::ostream& out, const GPUCoalescer& obj)
3539243SN/A{
3549243SN/A    obj.print(out);
3559831SN/A    out << std::flush;
3569831SN/A    return out;
3579831SN/A}
3589243SN/A
#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
3609831SN/A