GPUCoalescer.hh revision 11660:cfa97c37117a
1/* 2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
 *
 * Author: Sooraj Puthoor
 */

#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__

#include <iostream>
#include <unordered_map>

#include "base/statistics.hh"
#include "mem/protocol/HSAScope.hh"
#include "mem/protocol/HSASegment.hh"
#include "mem/protocol/PrefetchBit.hh"
#include "mem/protocol/RubyAccessMode.hh"
#include "mem/protocol/RubyRequestType.hh"
#include "mem/protocol/SequencerRequestType.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/system/RubyPort.hh"

class DataBlock;
class CacheMsg;
class MachineID;
class CacheMemory;

class RubyGPUCoalescerParams;

// Translate the scope/segment information carried by an m5 Request into the
// corresponding HSA enums. Defined elsewhere; presumably derived from the
// request's flags — confirm against the .cc implementation.
HSAScope reqScopeToHSAScope(Request* req);
HSASegment reqSegmentToHSASegment(Request* req);

// Bookkeeping record for one outstanding coalesced memory request:
// the packet it came from, the Ruby request type it was issued as, and
// the cycle at which it was issued (used for latency stats and
// deadlock detection).
struct GPUCoalescerRequest
{
    PacketPtr pkt;
    RubyRequestType m_type;
    Cycles issue_time;

    GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
                        Cycles _issue_time)
        : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
    {}
};

std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);

// GPU-side analogue of the Ruby Sequencer: accepts memory packets from the
// GPU, coalesces requests to the same address (see reqCoalescer below), and
// issues them into the Ruby cache hierarchy. VIPERCoalescer inherits from
// this class and overrides the virtual makeRequest()/issueRequest() hooks.
class GPUCoalescer : public RubyPort
{
  public:
    typedef RubyGPUCoalescerParams Params;
    GPUCoalescer(const Params *);
    ~GPUCoalescer();

    // Public Methods
    void wakeup(); // Used only for deadlock detection

    void printProgress(std::ostream& out) const;
    void resetStats();
    void collateStats();
    void regStats();

    // Completion callbacks invoked by the protocol when a write finishes.
    // The overloads form a ladder: each adds more timing detail (responding
    // machine, per-phase request timestamps, region flag) for the latency
    // histograms recorded by recordMissLatency().
    void writeCallback(Addr address, DataBlock& data);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data,
                       Cycles initialRequestTime,
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime,
                       bool isRegion);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data,
                       Cycles initialRequestTime,
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime);

    // Read-completion callbacks; same overload ladder as writeCallback().
    void readCallback(Addr address, DataBlock& data);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data,
                      Cycles initialRequestTime,
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data,
                      Cycles initialRequestTime,
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime,
                      bool isRegion);
    /* atomics need their own callback because the data
       might be const coming from SLICC */
    void atomicCallback(Addr address,
                        MachineType mach,
                        const DataBlock& data);

    // Stat hooks for command-processor (CP) read/write completions.
    void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
    void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);

    // Alternate implementations in VIPER Coalescer
    virtual RequestStatus makeRequest(PacketPtr pkt);

    int outstandingCount() const { return m_outstanding_count; }

    bool
    isDeadlockEventScheduled() const
    {
        return deadlockCheckEvent.scheduled();
    }

    void
    descheduleDeadlockEvent()
    {
        deschedule(deadlockCheckEvent);
    }

    bool empty() const;

    void print(std::ostream& out) const;
    void checkCoherence(Addr address);

    void markRemoved();
    void removeRequest(GPUCoalescerRequest* request);
    void evictionCallback(Addr address);
    void completeIssue();

    // Track an outstanding kernel-end packet for the given wavefront
    // (stored in kernelEndList; see kernelCallback()).
    void insertKernel(int wavefront_id, PacketPtr pkt);

    void recordRequestType(SequencerRequestType requestType);

    // Accessors exposing the stat histograms below, so SLICC-generated
    // controller code can record samples into them.
    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }

    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
    Stats::Histogram& getTypeLatencyHist(uint32_t t)
    { return *m_typeLatencyHist[t]; }

    Stats::Histogram& getMissLatencyHist()
    { return m_missLatencyHist; }
    Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
    { return *m_missTypeLatencyHist[t]; }

    Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
    { return *m_missMachLatencyHist[t]; }

    Stats::Histogram&
    getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
    { return *m_missTypeMachLatencyHist[r][t]; }

    Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
    { return *m_IssueToInitialDelayHist[t]; }

    Stats::Histogram&
    getInitialToForwardDelayHist(const MachineType t) const
    { return *m_InitialToForwardDelayHist[t]; }

    Stats::Histogram&
    getForwardRequestToFirstResponseHist(const MachineType t) const
    { return *m_ForwardToFirstResponseDelayHist[t]; }

    Stats::Histogram&
    getFirstResponseToCompletionDelayHist(const MachineType t) const
    { return *m_FirstResponseToCompletionDelayHist[t]; }

    // Changed to protected to enable inheritance by VIPER Coalescer
  protected:
    bool tryCacheAccess(Addr addr, RubyRequestType type,
                        Addr pc, RubyAccessMode access_mode,
                        int size, DataBlock*& data_ptr);
    // Alternate implementations in VIPER Coalescer
    virtual void issueRequest(PacketPtr pkt, RubyRequestType type);

    // Completion path for a kernel-end packet tracked by insertKernel().
    // NOTE(review): parameter name has a typo ("wavfront_id"); harmless in
    // a declaration, but worth aligning with the definition.
    void kernelCallback(int wavfront_id);

    // Common completion path shared by the read/write callbacks above.
    void hitCallback(GPUCoalescerRequest* request,
                     MachineType mach,
                     DataBlock& data,
                     bool success,
                     Cycles initialRequestTime,
                     Cycles forwardRequestTime,
                     Cycles firstResponseTime,
                     bool isRegion);
    // Record the per-phase latency breakdown of a miss into the
    // histograms declared at the bottom of this class.
    void recordMissLatency(GPUCoalescerRequest* request,
                           MachineType mach,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool success, bool isRegion);
    void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
    PacketPtr mapAddrToPkt(Addr address);


    RequestStatus getRequestStatus(PacketPtr pkt,
                                   RubyRequestType request_type);
    bool insertRequest(PacketPtr pkt, RubyRequestType request_type);


    // Load-linked/store-conditional handling for the given request.
    bool handleLlsc(Addr address, GPUCoalescerRequest* request);

    // Copy constructor and assignment operator declared without visible
    // definitions to disable copying (pre-C++11 idiom). NOTE(review): the
    // original comment said "Private", but this section is protected.
    GPUCoalescer(const GPUCoalescer& obj);
    GPUCoalescer& operator=(const GPUCoalescer& obj);

    // Event used to schedule completeIssue() — issuing coalesced requests
    // from the event queue rather than inline.
    class IssueEvent : public Event
    {
      private:
        GPUCoalescer *seq;
      public:
        IssueEvent(GPUCoalescer *_seq);
        void process();
        const char *description() const;
    };

    IssueEvent issueEvent;


    // Changed to protected to enable inheritance by VIPER Coalescer
  protected:
    // Deadlock detection: max in-flight requests allowed, and the cycle
    // threshold after which an outstanding request is considered stuck.
    int m_max_outstanding_requests;
    int m_deadlock_threshold;

    CacheMemory* m_dataCache_ptr;
    CacheMemory* m_instCache_ptr;

    // The cache access latency for this GPU data cache. This is assessed at the
    // beginning of each access. This should be very similar to the
    // implementation in Sequencer() as this is very much like a Sequencer
    Cycles m_data_cache_hit_latency;

    // We need to track both the primary and secondary request types.
    // The secondary request type comprises a subset of RubyRequestTypes that
    // are understood by the L1 Controller. A primary request type can be any
    // RubyRequestType.
    enum {PrimaryType, SecondaryType};
    // Per-address list of (packet, [primary, secondary] request types)
    // awaiting coalesced issue; newRequests holds addresses not yet issued.
    typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc;
    typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable;
    CoalescingTable reqCoalescer;
    std::vector<Addr> newRequests;

    // Outstanding (issued, not yet completed) requests, keyed by address,
    // split into separate read and write tables.
    typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
    RequestTable m_writeRequestTable;
    RequestTable m_readRequestTable;
    // Global outstanding request count, across all request tables
    int m_outstanding_count;
    bool m_deadlock_check_scheduled;
    // Kernel-end packets awaiting completion, keyed by wavefront id
    // (see insertKernel()/kernelCallback()).
    std::unordered_map<int, PacketPtr> kernelEndList;
    std::vector<int> newKernelEnds;

    // Counters for cycles spent stalled on same-address ordering conflicts.
    int m_store_waiting_on_load_cycles;
    int m_store_waiting_on_store_cycles;
    int m_load_waiting_on_store_cycles;
    int m_load_waiting_on_load_cycles;

    bool m_runningGarnetStandalone;

    // Periodic event that calls wakeup() to run the deadlock check.
    class GPUCoalescerWakeupEvent : public Event
    {
      private:
        GPUCoalescer *m_GPUCoalescer_ptr;

      public:
        GPUCoalescerWakeupEvent(GPUCoalescer *_seq) :
            m_GPUCoalescer_ptr(_seq) {}
        void process() { m_GPUCoalescer_ptr->wakeup(); }
        const char *description() const
        {
            return "GPUCoalescer deadlock check";
        }
    };

    GPUCoalescerWakeupEvent deadlockCheckEvent;
    bool assumingRfOCoherence;

    // m5 style stats for TCP hit/miss counts
    Stats::Scalar GPU_TCPLdHits;
    Stats::Scalar GPU_TCPLdTransfers;
    Stats::Scalar GPU_TCCLdHits;
    Stats::Scalar GPU_LdMiss;

    Stats::Scalar GPU_TCPStHits;
    Stats::Scalar GPU_TCPStTransfers;
    Stats::Scalar GPU_TCCStHits;
    Stats::Scalar GPU_StMiss;

    // Same hit/miss counters, but for command-processor (CP) accesses.
    Stats::Scalar CP_TCPLdHits;
    Stats::Scalar CP_TCPLdTransfers;
    Stats::Scalar CP_TCCLdHits;
    Stats::Scalar CP_LdMiss;

    Stats::Scalar CP_TCPStHits;
    Stats::Scalar CP_TCPStTransfers;
    Stats::Scalar CP_TCCStHits;
    Stats::Scalar CP_StMiss;

    //! Histogram for number of outstanding requests per cycle.
    Stats::Histogram m_outstandReqHist;

    //! Histogram for holding latency profile of all requests.
    Stats::Histogram m_latencyHist;
    std::vector<Stats::Histogram *> m_typeLatencyHist;

    //! Histogram for holding latency profile of all requests that
    //! miss in the controller connected to this sequencer.
    Stats::Histogram m_missLatencyHist;
    std::vector<Stats::Histogram *> m_missTypeLatencyHist;

    //! Histograms for profiling the latencies for requests that
    //! required external messages.
    std::vector<Stats::Histogram *> m_missMachLatencyHist;
    std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;

    //! Histograms for recording the breakdown of miss latency
    std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
    std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
};

// Stream a GPUCoalescer via its print() method, flushing the stream.
inline std::ostream&
operator<<(std::ostream& out, const GPUCoalescer& obj)
{
    obj.print(out);
    out << std::flush;
    return out;
}

#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__