GPUCoalescer.hh revision 11660:cfa97c37117a
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Sooraj Puthoor
34 */
35
36#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
37#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
38
39#include <iostream>
40#include <unordered_map>
41
42#include "base/statistics.hh"
43#include "mem/protocol/HSAScope.hh"
44#include "mem/protocol/HSASegment.hh"
45#include "mem/protocol/PrefetchBit.hh"
46#include "mem/protocol/RubyAccessMode.hh"
47#include "mem/protocol/RubyRequestType.hh"
48#include "mem/protocol/SequencerRequestType.hh"
49#include "mem/request.hh"
50#include "mem/ruby/common/Address.hh"
51#include "mem/ruby/common/Consumer.hh"
52#include "mem/ruby/system/RubyPort.hh"
53
54class DataBlock;
55class CacheMsg;
56class MachineID;
57class CacheMemory;
58
59class RubyGPUCoalescerParams;
60
61HSAScope reqScopeToHSAScope(Request* req);
62HSASegment reqSegmentToHSASegment(Request* req);
63
64struct GPUCoalescerRequest
65{
66    PacketPtr pkt;
67    RubyRequestType m_type;
68    Cycles issue_time;
69
70    GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
71                        Cycles _issue_time)
72        : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
73    {}
74};
75
76std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
77
78class GPUCoalescer : public RubyPort
79{
80  public:
81    typedef RubyGPUCoalescerParams Params;
82    GPUCoalescer(const Params *);
83    ~GPUCoalescer();
84
85    // Public Methods
86    void wakeup(); // Used only for deadlock detection
87
88    void printProgress(std::ostream& out) const;
89    void resetStats();
90    void collateStats();
91    void regStats();
92
93    void writeCallback(Addr address, DataBlock& data);
94
95    void writeCallback(Addr address,
96                       MachineType mach,
97                       DataBlock& data);
98
99    void writeCallback(Addr address,
100                       MachineType mach,
101                       DataBlock& data,
102                       Cycles initialRequestTime,
103                       Cycles forwardRequestTime,
104                       Cycles firstResponseTime,
105                       bool isRegion);
106
107    void writeCallback(Addr address,
108                       MachineType mach,
109                       DataBlock& data,
110                       Cycles initialRequestTime,
111                       Cycles forwardRequestTime,
112                       Cycles firstResponseTime);
113
114    void readCallback(Addr address, DataBlock& data);
115
116    void readCallback(Addr address,
117                      MachineType mach,
118                      DataBlock& data);
119
120    void readCallback(Addr address,
121                      MachineType mach,
122                      DataBlock& data,
123                      Cycles initialRequestTime,
124                      Cycles forwardRequestTime,
125                      Cycles firstResponseTime);
126
127    void readCallback(Addr address,
128                      MachineType mach,
129                      DataBlock& data,
130                      Cycles initialRequestTime,
131                      Cycles forwardRequestTime,
132                      Cycles firstResponseTime,
133                      bool isRegion);
134    /* atomics need their own callback because the data
135       might be const coming from SLICC */
136    void atomicCallback(Addr address,
137                        MachineType mach,
138                        const DataBlock& data);
139
140    void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
141    void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
142
143    // Alternate implementations in VIPER Coalescer
144    virtual RequestStatus makeRequest(PacketPtr pkt);
145
146    int outstandingCount() const { return m_outstanding_count; }
147
148    bool
149    isDeadlockEventScheduled() const
150    {
151        return deadlockCheckEvent.scheduled();
152    }
153
154    void
155    descheduleDeadlockEvent()
156    {
157        deschedule(deadlockCheckEvent);
158    }
159
160    bool empty() const;
161
162    void print(std::ostream& out) const;
163    void checkCoherence(Addr address);
164
165    void markRemoved();
166    void removeRequest(GPUCoalescerRequest* request);
167    void evictionCallback(Addr address);
168    void completeIssue();
169
170    void insertKernel(int wavefront_id, PacketPtr pkt);
171
172    void recordRequestType(SequencerRequestType requestType);
173    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
174
175    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
176    Stats::Histogram& getTypeLatencyHist(uint32_t t)
177    { return *m_typeLatencyHist[t]; }
178
179    Stats::Histogram& getMissLatencyHist()
180    { return m_missLatencyHist; }
181    Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
182    { return *m_missTypeLatencyHist[t]; }
183
184    Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
185    { return *m_missMachLatencyHist[t]; }
186
187    Stats::Histogram&
188    getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
189    { return *m_missTypeMachLatencyHist[r][t]; }
190
191    Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
192    { return *m_IssueToInitialDelayHist[t]; }
193
194    Stats::Histogram&
195    getInitialToForwardDelayHist(const MachineType t) const
196    { return *m_InitialToForwardDelayHist[t]; }
197
198    Stats::Histogram&
199    getForwardRequestToFirstResponseHist(const MachineType t) const
200    { return *m_ForwardToFirstResponseDelayHist[t]; }
201
202    Stats::Histogram&
203    getFirstResponseToCompletionDelayHist(const MachineType t) const
204    { return *m_FirstResponseToCompletionDelayHist[t]; }
205
206  // Changed to protected to enable inheritance by VIPER Coalescer
207  protected:
208    bool tryCacheAccess(Addr addr, RubyRequestType type,
209                        Addr pc, RubyAccessMode access_mode,
210                        int size, DataBlock*& data_ptr);
211    // Alternate implementations in VIPER Coalescer
212    virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
213
214    void kernelCallback(int wavfront_id);
215
216    void hitCallback(GPUCoalescerRequest* request,
217                     MachineType mach,
218                     DataBlock& data,
219                     bool success,
220                     Cycles initialRequestTime,
221                     Cycles forwardRequestTime,
222                     Cycles firstResponseTime,
223                     bool isRegion);
224    void recordMissLatency(GPUCoalescerRequest* request,
225                           MachineType mach,
226                           Cycles initialRequestTime,
227                           Cycles forwardRequestTime,
228                           Cycles firstResponseTime,
229                           bool success, bool isRegion);
230    void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
231    PacketPtr mapAddrToPkt(Addr address);
232
233
234    RequestStatus getRequestStatus(PacketPtr pkt,
235                                   RubyRequestType request_type);
236    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
237
238    bool handleLlsc(Addr address, GPUCoalescerRequest* request);
239
240    // Private copy constructor and assignment operator
241    GPUCoalescer(const GPUCoalescer& obj);
242    GPUCoalescer& operator=(const GPUCoalescer& obj);
243
244    class IssueEvent : public Event
245    {
246      private:
247        GPUCoalescer *seq;
248      public:
249        IssueEvent(GPUCoalescer *_seq);
250        void process();
251        const char *description() const;
252    };
253
254    IssueEvent issueEvent;
255
256
257  // Changed to protected to enable inheritance by VIPER Coalescer
258  protected:
259    int m_max_outstanding_requests;
260    int m_deadlock_threshold;
261
262    CacheMemory* m_dataCache_ptr;
263    CacheMemory* m_instCache_ptr;
264
265    // The cache access latency for this GPU data cache. This is assessed at the
266    // beginning of each access. This should be very similar to the
267    // implementation in Sequencer() as this is very much like a Sequencer
268    Cycles m_data_cache_hit_latency;
269
270    // We need to track both the primary and secondary request types.
271    // The secondary request type comprises a subset of RubyRequestTypes that
272    // are understood by the L1 Controller. A primary request type can be any
273    // RubyRequestType.
274    enum {PrimaryType, SecondaryType};
275    typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc;
276    typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable;
277    CoalescingTable reqCoalescer;
278    std::vector<Addr> newRequests;
279
280    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
281    RequestTable m_writeRequestTable;
282    RequestTable m_readRequestTable;
283    // Global outstanding request count, across all request tables
284    int m_outstanding_count;
285    bool m_deadlock_check_scheduled;
286    std::unordered_map<int, PacketPtr> kernelEndList;
287    std::vector<int> newKernelEnds;
288
289    int m_store_waiting_on_load_cycles;
290    int m_store_waiting_on_store_cycles;
291    int m_load_waiting_on_store_cycles;
292    int m_load_waiting_on_load_cycles;
293
294    bool m_runningGarnetStandalone;
295
296    class GPUCoalescerWakeupEvent : public Event
297    {
298      private:
299        GPUCoalescer *m_GPUCoalescer_ptr;
300
301      public:
302        GPUCoalescerWakeupEvent(GPUCoalescer *_seq) :
303            m_GPUCoalescer_ptr(_seq) {}
304        void process() { m_GPUCoalescer_ptr->wakeup(); }
305        const char *description() const
306        {
307            return "GPUCoalescer deadlock check";
308        }
309    };
310
311    GPUCoalescerWakeupEvent deadlockCheckEvent;
312    bool assumingRfOCoherence;
313
314    // m5 style stats for TCP hit/miss counts
315    Stats::Scalar GPU_TCPLdHits;
316    Stats::Scalar GPU_TCPLdTransfers;
317    Stats::Scalar GPU_TCCLdHits;
318    Stats::Scalar GPU_LdMiss;
319
320    Stats::Scalar GPU_TCPStHits;
321    Stats::Scalar GPU_TCPStTransfers;
322    Stats::Scalar GPU_TCCStHits;
323    Stats::Scalar GPU_StMiss;
324
325    Stats::Scalar CP_TCPLdHits;
326    Stats::Scalar CP_TCPLdTransfers;
327    Stats::Scalar CP_TCCLdHits;
328    Stats::Scalar CP_LdMiss;
329
330    Stats::Scalar CP_TCPStHits;
331    Stats::Scalar CP_TCPStTransfers;
332    Stats::Scalar CP_TCCStHits;
333    Stats::Scalar CP_StMiss;
334
335    //! Histogram for number of outstanding requests per cycle.
336    Stats::Histogram m_outstandReqHist;
337
338    //! Histogram for holding latency profile of all requests.
339    Stats::Histogram m_latencyHist;
340    std::vector<Stats::Histogram *> m_typeLatencyHist;
341
342    //! Histogram for holding latency profile of all requests that
343    //! miss in the controller connected to this sequencer.
344    Stats::Histogram m_missLatencyHist;
345    std::vector<Stats::Histogram *> m_missTypeLatencyHist;
346
347    //! Histograms for profiling the latencies for requests that
348    //! required external messages.
349    std::vector<Stats::Histogram *> m_missMachLatencyHist;
350    std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
351
352    //! Histograms for recording the breakdown of miss latency
353    std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
354    std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
355    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
356    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
357};
358
359inline std::ostream&
360operator<<(std::ostream& out, const GPUCoalescer& obj)
361{
362    obj.print(out);
363    out << std::flush;
364    return out;
365}
366
367#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
368
369