/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Sooraj Puthoor
 */
35
36#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
37#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
38
39#include <iostream>
40#include <unordered_map>
41
42#include "base/statistics.hh"
43#include "mem/request.hh"
44#include "mem/ruby/common/Address.hh"
45#include "mem/ruby/common/Consumer.hh"
46#include "mem/ruby/protocol/HSAScope.hh"
47#include "mem/ruby/protocol/HSASegment.hh"
48#include "mem/ruby/protocol/PrefetchBit.hh"
49#include "mem/ruby/protocol/RubyAccessMode.hh"
50#include "mem/ruby/protocol/RubyRequestType.hh"
51#include "mem/ruby/protocol/SequencerRequestType.hh"
52#include "mem/ruby/system/Sequencer.hh"
53
// Forward declarations.
class DataBlock;
class CacheMsg;
class MachineID;
class CacheMemory;

// Parameter class; GPUCoalescer exposes it under the name Params.
class RubyGPUCoalescerParams;
60
61HSAScope reqScopeToHSAScope(const RequestPtr &req);
62HSASegment reqSegmentToHSASegment(const RequestPtr &req);
63
64struct GPUCoalescerRequest
65{
66    PacketPtr pkt;
67    RubyRequestType m_type;
68    Cycles issue_time;
69
70    GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
71                        Cycles _issue_time)
72        : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
73    {}
74};
75
76class RequestDesc
77{
78  public:
79    RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type)
80        : pkt(pkt), primaryType(p_type), secondaryType(s_type)
81    {
82    }
83
84    RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL),
85        secondaryType(RubyRequestType_NULL)
86    {
87    }
88
89    PacketPtr pkt;
90    RubyRequestType primaryType;
91    RubyRequestType secondaryType;
92};
93
94std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
95
96class GPUCoalescer : public RubyPort
97{
98  public:
99    typedef RubyGPUCoalescerParams Params;
100    GPUCoalescer(const Params *);
101    ~GPUCoalescer();
102
103    // Public Methods
104    void wakeup(); // Used only for deadlock detection
105
106    void printProgress(std::ostream& out) const;
107    void resetStats();
108    void collateStats();
109    void regStats();
110
111    void writeCallback(Addr address, DataBlock& data);
112
113    void writeCallback(Addr address,
114                       MachineType mach,
115                       DataBlock& data);
116
117    void writeCallback(Addr address,
118                       MachineType mach,
119                       DataBlock& data,
120                       Cycles initialRequestTime,
121                       Cycles forwardRequestTime,
122                       Cycles firstResponseTime,
123                       bool isRegion);
124
125    void writeCallback(Addr address,
126                       MachineType mach,
127                       DataBlock& data,
128                       Cycles initialRequestTime,
129                       Cycles forwardRequestTime,
130                       Cycles firstResponseTime);
131
132    void readCallback(Addr address, DataBlock& data);
133
134    void readCallback(Addr address,
135                      MachineType mach,
136                      DataBlock& data);
137
138    void readCallback(Addr address,
139                      MachineType mach,
140                      DataBlock& data,
141                      Cycles initialRequestTime,
142                      Cycles forwardRequestTime,
143                      Cycles firstResponseTime);
144
145    void readCallback(Addr address,
146                      MachineType mach,
147                      DataBlock& data,
148                      Cycles initialRequestTime,
149                      Cycles forwardRequestTime,
150                      Cycles firstResponseTime,
151                      bool isRegion);
152    /* atomics need their own callback because the data
153       might be const coming from SLICC */
154    void atomicCallback(Addr address,
155                        MachineType mach,
156                        const DataBlock& data);
157
158    void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
159    void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
160
161    // Alternate implementations in VIPER Coalescer
162    virtual RequestStatus makeRequest(PacketPtr pkt);
163
164    int outstandingCount() const { return m_outstanding_count; }
165
166    bool
167    isDeadlockEventScheduled() const
168    {
169        return deadlockCheckEvent.scheduled();
170    }
171
172    void
173    descheduleDeadlockEvent()
174    {
175        deschedule(deadlockCheckEvent);
176    }
177
178    bool empty() const;
179
180    void print(std::ostream& out) const;
181    void checkCoherence(Addr address);
182
183    void markRemoved();
184    void removeRequest(GPUCoalescerRequest* request);
185    void evictionCallback(Addr address);
186    void completeIssue();
187
188    void insertKernel(int wavefront_id, PacketPtr pkt);
189
190    void recordRequestType(SequencerRequestType requestType);
191    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
192
193    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
194    Stats::Histogram& getTypeLatencyHist(uint32_t t)
195    { return *m_typeLatencyHist[t]; }
196
197    Stats::Histogram& getMissLatencyHist()
198    { return m_missLatencyHist; }
199    Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
200    { return *m_missTypeLatencyHist[t]; }
201
202    Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
203    { return *m_missMachLatencyHist[t]; }
204
205    Stats::Histogram&
206    getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
207    { return *m_missTypeMachLatencyHist[r][t]; }
208
209    Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
210    { return *m_IssueToInitialDelayHist[t]; }
211
212    Stats::Histogram&
213    getInitialToForwardDelayHist(const MachineType t) const
214    { return *m_InitialToForwardDelayHist[t]; }
215
216    Stats::Histogram&
217    getForwardRequestToFirstResponseHist(const MachineType t) const
218    { return *m_ForwardToFirstResponseDelayHist[t]; }
219
220    Stats::Histogram&
221    getFirstResponseToCompletionDelayHist(const MachineType t) const
222    { return *m_FirstResponseToCompletionDelayHist[t]; }
223
224  // Changed to protected to enable inheritance by VIPER Coalescer
225  protected:
226    bool tryCacheAccess(Addr addr, RubyRequestType type,
227                        Addr pc, RubyAccessMode access_mode,
228                        int size, DataBlock*& data_ptr);
229    // Alternate implementations in VIPER Coalescer
230    virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
231
232    void kernelCallback(int wavfront_id);
233
234    void hitCallback(GPUCoalescerRequest* request,
235                     MachineType mach,
236                     DataBlock& data,
237                     bool success,
238                     Cycles initialRequestTime,
239                     Cycles forwardRequestTime,
240                     Cycles firstResponseTime,
241                     bool isRegion);
242    void recordMissLatency(GPUCoalescerRequest* request,
243                           MachineType mach,
244                           Cycles initialRequestTime,
245                           Cycles forwardRequestTime,
246                           Cycles firstResponseTime,
247                           bool success, bool isRegion);
248    void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
249    PacketPtr mapAddrToPkt(Addr address);
250
251
252    RequestStatus getRequestStatus(PacketPtr pkt,
253                                   RubyRequestType request_type);
254    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
255
256    bool handleLlsc(Addr address, GPUCoalescerRequest* request);
257
258    EventFunctionWrapper issueEvent;
259
260
261  // Changed to protected to enable inheritance by VIPER Coalescer
262  protected:
263    int m_max_outstanding_requests;
264    int m_deadlock_threshold;
265
266    CacheMemory* m_dataCache_ptr;
267    CacheMemory* m_instCache_ptr;
268
269    // We need to track both the primary and secondary request types.
270    // The secondary request type comprises a subset of RubyRequestTypes that
271    // are understood by the L1 Controller. A primary request type can be any
272    // RubyRequestType.
273    typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable;
274    CoalescingTable reqCoalescer;
275    std::vector<Addr> newRequests;
276
277    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
278    RequestTable m_writeRequestTable;
279    RequestTable m_readRequestTable;
280    // Global outstanding request count, across all request tables
281    int m_outstanding_count;
282    bool m_deadlock_check_scheduled;
283    std::unordered_map<int, PacketPtr> kernelEndList;
284    std::vector<int> newKernelEnds;
285
286    int m_store_waiting_on_load_cycles;
287    int m_store_waiting_on_store_cycles;
288    int m_load_waiting_on_store_cycles;
289    int m_load_waiting_on_load_cycles;
290
291    bool m_runningGarnetStandalone;
292
293    EventFunctionWrapper deadlockCheckEvent;
294    bool assumingRfOCoherence;
295
296    // m5 style stats for TCP hit/miss counts
297    Stats::Scalar GPU_TCPLdHits;
298    Stats::Scalar GPU_TCPLdTransfers;
299    Stats::Scalar GPU_TCCLdHits;
300    Stats::Scalar GPU_LdMiss;
301
302    Stats::Scalar GPU_TCPStHits;
303    Stats::Scalar GPU_TCPStTransfers;
304    Stats::Scalar GPU_TCCStHits;
305    Stats::Scalar GPU_StMiss;
306
307    Stats::Scalar CP_TCPLdHits;
308    Stats::Scalar CP_TCPLdTransfers;
309    Stats::Scalar CP_TCCLdHits;
310    Stats::Scalar CP_LdMiss;
311
312    Stats::Scalar CP_TCPStHits;
313    Stats::Scalar CP_TCPStTransfers;
314    Stats::Scalar CP_TCCStHits;
315    Stats::Scalar CP_StMiss;
316
317    //! Histogram for number of outstanding requests per cycle.
318    Stats::Histogram m_outstandReqHist;
319
320    //! Histogram for holding latency profile of all requests.
321    Stats::Histogram m_latencyHist;
322    std::vector<Stats::Histogram *> m_typeLatencyHist;
323
324    //! Histogram for holding latency profile of all requests that
325    //! miss in the controller connected to this sequencer.
326    Stats::Histogram m_missLatencyHist;
327    std::vector<Stats::Histogram *> m_missTypeLatencyHist;
328
329    //! Histograms for profiling the latencies for requests that
330    //! required external messages.
331    std::vector<Stats::Histogram *> m_missMachLatencyHist;
332    std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
333
334    //! Histograms for recording the breakdown of miss latency
335    std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
336    std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
337    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
338    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
339
340private:
341    // Private copy constructor and assignment operator
342    GPUCoalescer(const GPUCoalescer& obj);
343    GPUCoalescer& operator=(const GPUCoalescer& obj);
344};
345
346inline std::ostream&
347operator<<(std::ostream& out, const GPUCoalescer& obj)
348{
349    obj.print(out);
350    out << std::flush;
351    return out;
352}
353
354#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
355