/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
32 * 33 * Authors: Sooraj Puthoor 34 */ 35 36#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ 37#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ 38 39#include <iostream> 40#include <unordered_map> 41 42#include "base/statistics.hh" 43#include "mem/request.hh" 44#include "mem/ruby/common/Address.hh" 45#include "mem/ruby/common/Consumer.hh" 46#include "mem/ruby/protocol/HSAScope.hh" 47#include "mem/ruby/protocol/HSASegment.hh" 48#include "mem/ruby/protocol/PrefetchBit.hh" 49#include "mem/ruby/protocol/RubyAccessMode.hh" 50#include "mem/ruby/protocol/RubyRequestType.hh" 51#include "mem/ruby/protocol/SequencerRequestType.hh" 52#include "mem/ruby/system/Sequencer.hh" 53 54class DataBlock; 55class CacheMsg; 56class MachineID; 57class CacheMemory; 58 59class RubyGPUCoalescerParams; 60 61HSAScope reqScopeToHSAScope(const RequestPtr &req); 62HSASegment reqSegmentToHSASegment(const RequestPtr &req); 63 64struct GPUCoalescerRequest 65{ 66 PacketPtr pkt; 67 RubyRequestType m_type; 68 Cycles issue_time; 69 70 GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type, 71 Cycles _issue_time) 72 : pkt(_pkt), m_type(_m_type), issue_time(_issue_time) 73 {} 74}; 75 76class RequestDesc 77{ 78 public: 79 RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type) 80 : pkt(pkt), primaryType(p_type), secondaryType(s_type) 81 { 82 } 83 84 RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL), 85 secondaryType(RubyRequestType_NULL) 86 { 87 } 88 89 PacketPtr pkt; 90 RubyRequestType primaryType; 91 RubyRequestType secondaryType; 92}; 93 94std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj); 95 96class GPUCoalescer : public RubyPort 97{ 98 public: 99 typedef RubyGPUCoalescerParams Params; 100 GPUCoalescer(const Params *); 101 ~GPUCoalescer(); 102 103 // Public Methods 104 void wakeup(); // Used only for deadlock detection 105 106 void printProgress(std::ostream& out) const; 107 void resetStats(); 108 void collateStats(); 109 void regStats(); 110 111 
void writeCallback(Addr address, DataBlock& data); 112 113 void writeCallback(Addr address, 114 MachineType mach, 115 DataBlock& data); 116 117 void writeCallback(Addr address, 118 MachineType mach, 119 DataBlock& data, 120 Cycles initialRequestTime, 121 Cycles forwardRequestTime, 122 Cycles firstResponseTime, 123 bool isRegion); 124 125 void writeCallback(Addr address, 126 MachineType mach, 127 DataBlock& data, 128 Cycles initialRequestTime, 129 Cycles forwardRequestTime, 130 Cycles firstResponseTime); 131 132 void readCallback(Addr address, DataBlock& data); 133 134 void readCallback(Addr address, 135 MachineType mach, 136 DataBlock& data); 137 138 void readCallback(Addr address, 139 MachineType mach, 140 DataBlock& data, 141 Cycles initialRequestTime, 142 Cycles forwardRequestTime, 143 Cycles firstResponseTime); 144 145 void readCallback(Addr address, 146 MachineType mach, 147 DataBlock& data, 148 Cycles initialRequestTime, 149 Cycles forwardRequestTime, 150 Cycles firstResponseTime, 151 bool isRegion); 152 /* atomics need their own callback because the data 153 might be const coming from SLICC */ 154 void atomicCallback(Addr address, 155 MachineType mach, 156 const DataBlock& data); 157 158 void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); 159 void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); 160 161 // Alternate implementations in VIPER Coalescer 162 virtual RequestStatus makeRequest(PacketPtr pkt); 163 164 int outstandingCount() const { return m_outstanding_count; } 165 166 bool 167 isDeadlockEventScheduled() const 168 { 169 return deadlockCheckEvent.scheduled(); 170 } 171 172 void 173 descheduleDeadlockEvent() 174 { 175 deschedule(deadlockCheckEvent); 176 } 177 178 bool empty() const; 179 180 void print(std::ostream& out) const; 181 void checkCoherence(Addr address); 182 183 void markRemoved(); 184 void removeRequest(GPUCoalescerRequest* request); 185 void evictionCallback(Addr address); 186 void completeIssue(); 
187 188 void insertKernel(int wavefront_id, PacketPtr pkt); 189 190 void recordRequestType(SequencerRequestType requestType); 191 Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } 192 193 Stats::Histogram& getLatencyHist() { return m_latencyHist; } 194 Stats::Histogram& getTypeLatencyHist(uint32_t t) 195 { return *m_typeLatencyHist[t]; } 196 197 Stats::Histogram& getMissLatencyHist() 198 { return m_missLatencyHist; } 199 Stats::Histogram& getMissTypeLatencyHist(uint32_t t) 200 { return *m_missTypeLatencyHist[t]; } 201 202 Stats::Histogram& getMissMachLatencyHist(uint32_t t) const 203 { return *m_missMachLatencyHist[t]; } 204 205 Stats::Histogram& 206 getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const 207 { return *m_missTypeMachLatencyHist[r][t]; } 208 209 Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const 210 { return *m_IssueToInitialDelayHist[t]; } 211 212 Stats::Histogram& 213 getInitialToForwardDelayHist(const MachineType t) const 214 { return *m_InitialToForwardDelayHist[t]; } 215 216 Stats::Histogram& 217 getForwardRequestToFirstResponseHist(const MachineType t) const 218 { return *m_ForwardToFirstResponseDelayHist[t]; } 219 220 Stats::Histogram& 221 getFirstResponseToCompletionDelayHist(const MachineType t) const 222 { return *m_FirstResponseToCompletionDelayHist[t]; } 223 224 // Changed to protected to enable inheritance by VIPER Coalescer 225 protected: 226 bool tryCacheAccess(Addr addr, RubyRequestType type, 227 Addr pc, RubyAccessMode access_mode, 228 int size, DataBlock*& data_ptr); 229 // Alternate implementations in VIPER Coalescer 230 virtual void issueRequest(PacketPtr pkt, RubyRequestType type); 231 232 void kernelCallback(int wavfront_id); 233 234 void hitCallback(GPUCoalescerRequest* request, 235 MachineType mach, 236 DataBlock& data, 237 bool success, 238 Cycles initialRequestTime, 239 Cycles forwardRequestTime, 240 Cycles firstResponseTime, 241 bool isRegion); 242 void recordMissLatency(GPUCoalescerRequest* 
request, 243 MachineType mach, 244 Cycles initialRequestTime, 245 Cycles forwardRequestTime, 246 Cycles firstResponseTime, 247 bool success, bool isRegion); 248 void completeHitCallback(std::vector<PacketPtr> & mylist, int len); 249 PacketPtr mapAddrToPkt(Addr address); 250 251 252 RequestStatus getRequestStatus(PacketPtr pkt, 253 RubyRequestType request_type); 254 bool insertRequest(PacketPtr pkt, RubyRequestType request_type); 255 256 bool handleLlsc(Addr address, GPUCoalescerRequest* request); 257 258 EventFunctionWrapper issueEvent; 259 260 261 // Changed to protected to enable inheritance by VIPER Coalescer 262 protected: 263 int m_max_outstanding_requests; 264 int m_deadlock_threshold; 265 266 CacheMemory* m_dataCache_ptr; 267 CacheMemory* m_instCache_ptr; 268 269 // We need to track both the primary and secondary request types. 270 // The secondary request type comprises a subset of RubyRequestTypes that 271 // are understood by the L1 Controller. A primary request type can be any 272 // RubyRequestType. 
273 typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable; 274 CoalescingTable reqCoalescer; 275 std::vector<Addr> newRequests; 276 277 typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable; 278 RequestTable m_writeRequestTable; 279 RequestTable m_readRequestTable; 280 // Global outstanding request count, across all request tables 281 int m_outstanding_count; 282 bool m_deadlock_check_scheduled; 283 std::unordered_map<int, PacketPtr> kernelEndList; 284 std::vector<int> newKernelEnds; 285 286 int m_store_waiting_on_load_cycles; 287 int m_store_waiting_on_store_cycles; 288 int m_load_waiting_on_store_cycles; 289 int m_load_waiting_on_load_cycles; 290 291 bool m_runningGarnetStandalone; 292 293 EventFunctionWrapper deadlockCheckEvent; 294 bool assumingRfOCoherence; 295 296 // m5 style stats for TCP hit/miss counts 297 Stats::Scalar GPU_TCPLdHits; 298 Stats::Scalar GPU_TCPLdTransfers; 299 Stats::Scalar GPU_TCCLdHits; 300 Stats::Scalar GPU_LdMiss; 301 302 Stats::Scalar GPU_TCPStHits; 303 Stats::Scalar GPU_TCPStTransfers; 304 Stats::Scalar GPU_TCCStHits; 305 Stats::Scalar GPU_StMiss; 306 307 Stats::Scalar CP_TCPLdHits; 308 Stats::Scalar CP_TCPLdTransfers; 309 Stats::Scalar CP_TCCLdHits; 310 Stats::Scalar CP_LdMiss; 311 312 Stats::Scalar CP_TCPStHits; 313 Stats::Scalar CP_TCPStTransfers; 314 Stats::Scalar CP_TCCStHits; 315 Stats::Scalar CP_StMiss; 316 317 //! Histogram for number of outstanding requests per cycle. 318 Stats::Histogram m_outstandReqHist; 319 320 //! Histogram for holding latency profile of all requests. 321 Stats::Histogram m_latencyHist; 322 std::vector<Stats::Histogram *> m_typeLatencyHist; 323 324 //! Histogram for holding latency profile of all requests that 325 //! miss in the controller connected to this sequencer. 326 Stats::Histogram m_missLatencyHist; 327 std::vector<Stats::Histogram *> m_missTypeLatencyHist; 328 329 //! Histograms for profiling the latencies for requests that 330 //! required external messages. 
331 std::vector<Stats::Histogram *> m_missMachLatencyHist; 332 std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist; 333 334 //! Histograms for recording the breakdown of miss latency 335 std::vector<Stats::Histogram *> m_IssueToInitialDelayHist; 336 std::vector<Stats::Histogram *> m_InitialToForwardDelayHist; 337 std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist; 338 std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist; 339 340private: 341 // Private copy constructor and assignment operator 342 GPUCoalescer(const GPUCoalescer& obj); 343 GPUCoalescer& operator=(const GPUCoalescer& obj); 344}; 345 346inline std::ostream& 347operator<<(std::ostream& out, const GPUCoalescer& obj) 348{ 349 obj.print(out); 350 out << std::flush; 351 return out; 352} 353 354#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ 355