GPUCoalescer.cc (13399:98f54e365584) GPUCoalescer.cc (13974:af47a3ae0f6b)
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: Sooraj Puthoor
34 */
35
36#include "base/logging.hh"
37#include "base/str.hh"
38#include "config/the_isa.hh"
39
40#if THE_ISA == X86_ISA
41#include "arch/x86/insts/microldstop.hh"
42
43#endif // X86_ISA
44#include "mem/ruby/system/GPUCoalescer.hh"
45
46#include "cpu/testers/rubytest/RubyTester.hh"
47#include "debug/GPUCoalescer.hh"
48#include "debug/MemoryAccess.hh"
49#include "debug/ProtocolTrace.hh"
50#include "debug/RubyPort.hh"
51#include "debug/RubyStats.hh"
52#include "gpu-compute/shader.hh"
53#include "mem/packet.hh"
54#include "mem/ruby/common/DataBlock.hh"
55#include "mem/ruby/common/SubBlock.hh"
56#include "mem/ruby/network/MessageBuffer.hh"
57#include "mem/ruby/profiler/Profiler.hh"
58#include "mem/ruby/slicc_interface/AbstractController.hh"
59#include "mem/ruby/slicc_interface/RubyRequest.hh"
60#include "mem/ruby/structures/CacheMemory.hh"
61#include "mem/ruby/system/RubySystem.hh"
62#include "params/RubyGPUCoalescer.hh"
63
64using namespace std;
65
66GPUCoalescer *
67RubyGPUCoalescerParams::create()
68{
69 return new GPUCoalescer(this);
70}
71
72HSAScope
73reqScopeToHSAScope(const RequestPtr &req)
74{
75 HSAScope accessScope = HSAScope_UNSPECIFIED;
76 if (req->isScoped()) {
77 if (req->isWavefrontScope()) {
78 accessScope = HSAScope_WAVEFRONT;
79 } else if (req->isWorkgroupScope()) {
80 accessScope = HSAScope_WORKGROUP;
81 } else if (req->isDeviceScope()) {
82 accessScope = HSAScope_DEVICE;
83 } else if (req->isSystemScope()) {
84 accessScope = HSAScope_SYSTEM;
85 } else {
86 fatal("Bad scope type");
87 }
88 }
89 return accessScope;
90}
91
92HSASegment
93reqSegmentToHSASegment(const RequestPtr &req)
94{
95 HSASegment accessSegment = HSASegment_GLOBAL;
96
97 if (req->isGlobalSegment()) {
98 accessSegment = HSASegment_GLOBAL;
99 } else if (req->isGroupSegment()) {
100 accessSegment = HSASegment_GROUP;
101 } else if (req->isPrivateSegment()) {
102 accessSegment = HSASegment_PRIVATE;
103 } else if (req->isKernargSegment()) {
104 accessSegment = HSASegment_KERNARG;
105 } else if (req->isReadonlySegment()) {
106 accessSegment = HSASegment_READONLY;
107 } else if (req->isSpillSegment()) {
108 accessSegment = HSASegment_SPILL;
109 } else if (req->isArgSegment()) {
110 accessSegment = HSASegment_ARG;
111 } else {
112 fatal("Bad segment type");
113 }
114
115 return accessSegment;
116}
117
118GPUCoalescer::GPUCoalescer(const Params *p)
119 : RubyPort(p),
120 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
121 false, Event::Progress_Event_Pri),
122 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
123{
124 m_store_waiting_on_load_cycles = 0;
125 m_store_waiting_on_store_cycles = 0;
126 m_load_waiting_on_store_cycles = 0;
127 m_load_waiting_on_load_cycles = 0;
128
129 m_outstanding_count = 0;
130
131 m_max_outstanding_requests = 0;
132 m_deadlock_threshold = 0;
133 m_instCache_ptr = nullptr;
134 m_dataCache_ptr = nullptr;
135
136 m_instCache_ptr = p->icache;
137 m_dataCache_ptr = p->dcache;
138 m_max_outstanding_requests = p->max_outstanding_requests;
139 m_deadlock_threshold = p->deadlock_threshold;
140
141 assert(m_max_outstanding_requests > 0);
142 assert(m_deadlock_threshold > 0);
143 assert(m_instCache_ptr);
144 assert(m_dataCache_ptr);
145
146 m_data_cache_hit_latency = p->dcache_hit_latency;
147
148 m_runningGarnetStandalone = p->garnet_standalone;
149 assumingRfOCoherence = p->assume_rfo;
150}
151
152GPUCoalescer::~GPUCoalescer()
153{
154}
155
156void
157GPUCoalescer::wakeup()
158{
159 // Check for deadlock of any of the requests
160 Cycles current_time = curCycle();
161
162 // Check across all outstanding requests
163 int total_outstanding = 0;
164
165 RequestTable::iterator read = m_readRequestTable.begin();
166 RequestTable::iterator read_end = m_readRequestTable.end();
167 for (; read != read_end; ++read) {
168 GPUCoalescerRequest* request = read->second;
169 if (current_time - request->issue_time < m_deadlock_threshold)
170 continue;
171
172 panic("Possible Deadlock detected. Aborting!\n"
173 "version: %d request.paddr: 0x%x m_readRequestTable: %d "
174 "current time: %u issue_time: %d difference: %d\n", m_version,
175 request->pkt->getAddr(), m_readRequestTable.size(),
176 current_time * clockPeriod(), request->issue_time * clockPeriod(),
177 (current_time - request->issue_time)*clockPeriod());
178 }
179
180 RequestTable::iterator write = m_writeRequestTable.begin();
181 RequestTable::iterator write_end = m_writeRequestTable.end();
182 for (; write != write_end; ++write) {
183 GPUCoalescerRequest* request = write->second;
184 if (current_time - request->issue_time < m_deadlock_threshold)
185 continue;
186
187 panic("Possible Deadlock detected. Aborting!\n"
188 "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
189 "current time: %u issue_time: %d difference: %d\n", m_version,
190 request->pkt->getAddr(), m_writeRequestTable.size(),
191 current_time * clockPeriod(), request->issue_time * clockPeriod(),
192 (current_time - request->issue_time) * clockPeriod());
193 }
194
195 total_outstanding += m_writeRequestTable.size();
196 total_outstanding += m_readRequestTable.size();
197
198 assert(m_outstanding_count == total_outstanding);
199
200 if (m_outstanding_count > 0) {
201 // If there are still outstanding requests, keep checking
202 schedule(deadlockCheckEvent,
203 m_deadlock_threshold * clockPeriod() +
204 curTick());
205 }
206}
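// A minimal, standalone sketch (not part of gem5) of the watchdog pattern
// used by wakeup() above: each outstanding request remembers its issue
// cycle, a periodic check panics if any request has been outstanding longer
// than a configurable threshold, and the caller re-schedules the check while
// requests remain. The names SimpleWatchdog and checkDeadlock are
// illustrative only.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <unordered_map>

struct SimpleWatchdog
{
    uint64_t deadlockThreshold = 0;                      // max age in cycles
    std::unordered_map<uint64_t, uint64_t> issueCycle;   // line addr -> cycle

    void record(uint64_t lineAddr, uint64_t now) { issueCycle[lineAddr] = now; }
    void complete(uint64_t lineAddr) { issueCycle.erase(lineAddr); }

    // Mirrors the loops in GPUCoalescer::wakeup(): abort if any request has
    // aged past the threshold.
    void checkDeadlock(uint64_t now) const
    {
        for (const auto &entry : issueCycle) {
            if (now - entry.second >= deadlockThreshold) {
                std::fprintf(stderr,
                             "Possible deadlock: addr 0x%llx outstanding for"
                             " %llu cycles\n",
                             (unsigned long long)entry.first,
                             (unsigned long long)(now - entry.second));
                std::abort();
            }
        }
    }
};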
207
208void
209GPUCoalescer::resetStats()
210{
211 m_latencyHist.reset();
212 m_missLatencyHist.reset();
213 for (int i = 0; i < RubyRequestType_NUM; i++) {
214 m_typeLatencyHist[i]->reset();
215 m_missTypeLatencyHist[i]->reset();
216 for (int j = 0; j < MachineType_NUM; j++) {
217 m_missTypeMachLatencyHist[i][j]->reset();
218 }
219 }
220
221 for (int i = 0; i < MachineType_NUM; i++) {
222 m_missMachLatencyHist[i]->reset();
223
224 m_IssueToInitialDelayHist[i]->reset();
225 m_InitialToForwardDelayHist[i]->reset();
226 m_ForwardToFirstResponseDelayHist[i]->reset();
227 m_FirstResponseToCompletionDelayHist[i]->reset();
228 }
229}
230
231void
232GPUCoalescer::printProgress(ostream& out) const
233{
234}
235
236RequestStatus
237GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
238{
239 Addr line_addr = makeLineAddress(pkt->getAddr());
240
241 if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
242 return RequestStatus_BufferFull;
243 }
244
245 if (m_controller->isBlocked(line_addr) &&
246 request_type != RubyRequestType_Locked_RMW_Write) {
247 return RequestStatus_Aliased;
248 }
249
250 if ((request_type == RubyRequestType_ST) ||
251 (request_type == RubyRequestType_ATOMIC) ||
252 (request_type == RubyRequestType_ATOMIC_RETURN) ||
253 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
254 (request_type == RubyRequestType_RMW_Read) ||
255 (request_type == RubyRequestType_RMW_Write) ||
256 (request_type == RubyRequestType_Load_Linked) ||
257 (request_type == RubyRequestType_Store_Conditional) ||
258 (request_type == RubyRequestType_Locked_RMW_Read) ||
259 (request_type == RubyRequestType_Locked_RMW_Write) ||
260 (request_type == RubyRequestType_FLUSH)) {
261
262 // Check if there is any outstanding read request for the same
263 // cache line.
264 if (m_readRequestTable.count(line_addr) > 0) {
265 m_store_waiting_on_load_cycles++;
266 return RequestStatus_Aliased;
267 }
268
269 if (m_writeRequestTable.count(line_addr) > 0) {
270 // There is an outstanding write request for the cache line
271 m_store_waiting_on_store_cycles++;
272 return RequestStatus_Aliased;
273 }
274 } else {
275 // Check if there is any outstanding write request for the same
276 // cache line.
277 if (m_writeRequestTable.count(line_addr) > 0) {
278 m_load_waiting_on_store_cycles++;
279 return RequestStatus_Aliased;
280 }
281
282 if (m_readRequestTable.count(line_addr) > 0) {
283 // There is an outstanding read request for the cache line
284 m_load_waiting_on_load_cycles++;
285 return RequestStatus_Aliased;
286 }
287 }
288
289 return RequestStatus_Ready;
290
291}
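// A minimal, standalone sketch (not gem5 code) of the aliasing rule that
// getRequestStatus() above enforces: store-like requests live in a write
// table and loads in a read table, and a new request to a cache line that
// already has an outstanding entry in either table is reported as aliased so
// it will be retried later; only the statistics differ between the two
// paths. AliasChecker and SimpleStatus are illustrative names.
#include <cstdint>
#include <unordered_set>

enum class SimpleStatus { Ready, Aliased };

struct AliasChecker
{
    std::unordered_set<uint64_t> readLines;   // lines with outstanding loads
    std::unordered_set<uint64_t> writeLines;  // lines with outstanding stores
    uint64_t storeWaitingCycles = 0;          // stores blocked behind others
    uint64_t loadWaitingCycles = 0;           // loads blocked behind others

    SimpleStatus status(uint64_t lineAddr, bool isStoreLike)
    {
        if (readLines.count(lineAddr) || writeLines.count(lineAddr)) {
            // Coalescing across time is not permitted, so any outstanding
            // request to the same line forces the new one to retry.
            (isStoreLike ? storeWaitingCycles : loadWaitingCycles)++;
            return SimpleStatus::Aliased;
        }
        return SimpleStatus::Ready;
    }
};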
292
293
294
295// sets the kernelEndList
296void
297GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
298{
299 // It is not clear whether this can ever happen, but be careful
300 // here so that an unexpected duplicate entry does not become
301 // a simulator hang in the future.
302 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndList\n", wavefront_id);
303 assert(kernelEndList.count(wavefront_id) == 0);
304
305 kernelEndList[wavefront_id] = pkt;
306 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
307 kernelEndList.size());
308}
309
310
311 // Insert the request in the correct request table. Return true if
312// the entry was already present.
313bool
314GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
315{
316 assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
317 pkt->req->isLockedRMW() ||
318 !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
319
320 int total_outstanding M5_VAR_USED =
321 m_writeRequestTable.size() + m_readRequestTable.size();
322
323 assert(m_outstanding_count == total_outstanding);
324
325 // See if we should schedule a deadlock check
326 if (!deadlockCheckEvent.scheduled()) {
327 schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
328 }
329
330 Addr line_addr = makeLineAddress(pkt->getAddr());
331 if ((request_type == RubyRequestType_ST) ||
332 (request_type == RubyRequestType_ATOMIC) ||
333 (request_type == RubyRequestType_ATOMIC_RETURN) ||
334 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
335 (request_type == RubyRequestType_RMW_Read) ||
336 (request_type == RubyRequestType_RMW_Write) ||
337 (request_type == RubyRequestType_Load_Linked) ||
338 (request_type == RubyRequestType_Store_Conditional) ||
339 (request_type == RubyRequestType_Locked_RMW_Read) ||
340 (request_type == RubyRequestType_Locked_RMW_Write) ||
341 (request_type == RubyRequestType_FLUSH)) {
342
343 pair<RequestTable::iterator, bool> r =
344 m_writeRequestTable.insert(RequestTable::value_type(line_addr,
345 (GPUCoalescerRequest*) NULL));
346 if (r.second) {
347 RequestTable::iterator i = r.first;
348 i->second = new GPUCoalescerRequest(pkt, request_type,
349 curCycle());
350 DPRINTF(GPUCoalescer,
351 "Inserting write request for paddr %#x for type %d\n",
352 pkt->req->getPaddr(), i->second->m_type);
353 m_outstanding_count++;
354 } else {
355 return true;
356 }
357 } else {
358 pair<RequestTable::iterator, bool> r =
359 m_readRequestTable.insert(RequestTable::value_type(line_addr,
360 (GPUCoalescerRequest*) NULL));
361
362 if (r.second) {
363 RequestTable::iterator i = r.first;
364 i->second = new GPUCoalescerRequest(pkt, request_type,
365 curCycle());
366 DPRINTF(GPUCoalescer,
367 "Inserting read request for paddr %#x for type %d\n",
368 pkt->req->getPaddr(), i->second->m_type);
369 m_outstanding_count++;
370 } else {
371 return true;
372 }
373 }
374
375 m_outstandReqHist.sample(m_outstanding_count);
376
377 total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
378 assert(m_outstanding_count == total_outstanding);
379
380 return false;
381}
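// A standalone sketch (not gem5 code) of the insert-if-absent idiom that
// insertRequest() above relies on: std::map::insert/emplace returns a pair
// whose bool member says whether a new entry was created, which is how the
// coalescer detects that a request for the line is already outstanding.
// trackRequest and its parameters are illustrative.
#include <cstdint>
#include <map>
#include <string>

bool trackRequest(std::map<uint64_t, std::string> &table,
                  uint64_t lineAddr, const std::string &desc)
{
    // emplace constructs the entry only if the key is absent.
    auto result = table.emplace(lineAddr, desc);
    // Return true if an entry was already present, mirroring the early
    // "return true" paths in GPUCoalescer::insertRequest().
    return !result.second;
}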
382
383void
384GPUCoalescer::markRemoved()
385{
386 m_outstanding_count--;
387 assert(m_outstanding_count ==
388 m_writeRequestTable.size() + m_readRequestTable.size());
389}
390
391void
392GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
393{
394 assert(m_outstanding_count ==
395 m_writeRequestTable.size() + m_readRequestTable.size());
396
397 Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
398 if ((srequest->m_type == RubyRequestType_ST) ||
399 (srequest->m_type == RubyRequestType_RMW_Read) ||
400 (srequest->m_type == RubyRequestType_RMW_Write) ||
401 (srequest->m_type == RubyRequestType_Load_Linked) ||
402 (srequest->m_type == RubyRequestType_Store_Conditional) ||
403 (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
404 (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
405 m_writeRequestTable.erase(line_addr);
406 } else {
407 m_readRequestTable.erase(line_addr);
408 }
409
410 markRemoved();
411}
412
413bool
414GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
415{
416 //
417 // The success flag indicates whether the LLSC operation was successful.
418 // LL ops will always succeed, but SC may fail if the cache line is no
419 // longer locked.
420 //
421 bool success = true;
422 if (request->m_type == RubyRequestType_Store_Conditional) {
423 if (!m_dataCache_ptr->isLocked(address, m_version)) {
424 //
425 // For failed SC requests, indicate the failure to the cpu by
426 // setting the extra data to zero.
427 //
428 request->pkt->req->setExtraData(0);
429 success = false;
430 } else {
431 //
432 // For successful SC requests, indicate the success to the cpu by
433 // setting the extra data to one.
434 //
435 request->pkt->req->setExtraData(1);
436 }
437 //
438 // Independent of success, all SC operations must clear the lock
439 //
440 m_dataCache_ptr->clearLocked(address);
441 } else if (request->m_type == RubyRequestType_Load_Linked) {
442 //
443 // Note: To fully follow Alpha LLSC semantics, should the LL clear any
444 // previously locked cache lines?
445 //
446 m_dataCache_ptr->setLocked(address, m_version);
447 } else if ((m_dataCache_ptr->isTagPresent(address)) &&
448 (m_dataCache_ptr->isLocked(address, m_version))) {
449 //
450 // Normal writes should clear the locked address
451 //
452 m_dataCache_ptr->clearLocked(address);
453 }
454 return success;
455}
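// A minimal, standalone sketch (not gem5 code) of the LL/SC bookkeeping that
// handleLlsc() above performs against the data cache: a load-linked records
// a (line, context) reservation, a store-conditional succeeds only if that
// reservation is still held by the same context, and both SC and ordinary
// stores clear it. LlscRegistry and its members are illustrative names.
#include <cstdint>
#include <unordered_map>

struct LlscRegistry
{
    // line address -> id of the context holding the reservation
    std::unordered_map<uint64_t, int> locked;

    void loadLinked(uint64_t lineAddr, int contextId)
    {
        locked[lineAddr] = contextId;
    }

    // Returns true if the store-conditional succeeds. The reservation is
    // cleared regardless of the outcome, as in handleLlsc().
    bool storeConditional(uint64_t lineAddr, int contextId)
    {
        auto it = locked.find(lineAddr);
        bool success = (it != locked.end() && it->second == contextId);
        locked.erase(lineAddr);
        return success;
    }

    // An ordinary write to a reserved line also clears the reservation.
    void plainStore(uint64_t lineAddr) { locked.erase(lineAddr); }
};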
456
457void
458GPUCoalescer::writeCallback(Addr address, DataBlock& data)
459{
460 writeCallback(address, MachineType_NULL, data);
461}
462
463void
464GPUCoalescer::writeCallback(Addr address,
465 MachineType mach,
466 DataBlock& data)
467{
468 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
469}
470
471void
472GPUCoalescer::writeCallback(Addr address,
473 MachineType mach,
474 DataBlock& data,
475 Cycles initialRequestTime,
476 Cycles forwardRequestTime,
477 Cycles firstResponseTime)
478{
479 writeCallback(address, mach, data,
480 initialRequestTime, forwardRequestTime, firstResponseTime,
481 false);
482}
483
484void
485GPUCoalescer::writeCallback(Addr address,
486 MachineType mach,
487 DataBlock& data,
488 Cycles initialRequestTime,
489 Cycles forwardRequestTime,
490 Cycles firstResponseTime,
491 bool isRegion)
492{
493 assert(address == makeLineAddress(address));
494
495 DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
496 assert(m_writeRequestTable.count(makeLineAddress(address)));
497
498 RequestTable::iterator i = m_writeRequestTable.find(address);
499 assert(i != m_writeRequestTable.end());
500 GPUCoalescerRequest* request = i->second;
501
502 m_writeRequestTable.erase(i);
503 markRemoved();
504
505 assert((request->m_type == RubyRequestType_ST) ||
506 (request->m_type == RubyRequestType_ATOMIC) ||
507 (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
508 (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
509 (request->m_type == RubyRequestType_RMW_Read) ||
510 (request->m_type == RubyRequestType_RMW_Write) ||
511 (request->m_type == RubyRequestType_Load_Linked) ||
512 (request->m_type == RubyRequestType_Store_Conditional) ||
513 (request->m_type == RubyRequestType_Locked_RMW_Read) ||
514 (request->m_type == RubyRequestType_Locked_RMW_Write) ||
515 (request->m_type == RubyRequestType_FLUSH));
516
517
518 //
519 // For Alpha, properly handle LL, SC, and write requests with respect to
520 // locked cache blocks.
521 //
522 // Not valid for Garnet_standalone protocol
523 //
524 bool success = true;
525 if (!m_runningGarnetStandalone)
526 success = handleLlsc(address, request);
527
528 if (request->m_type == RubyRequestType_Locked_RMW_Read) {
529 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
530 } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
531 m_controller->unblock(address);
532 }
533
534 hitCallback(request, mach, data, success,
535 request->issue_time, forwardRequestTime, firstResponseTime,
536 isRegion);
537}
538
539void
540GPUCoalescer::readCallback(Addr address, DataBlock& data)
541{
542 readCallback(address, MachineType_NULL, data);
543}
544
545void
546GPUCoalescer::readCallback(Addr address,
547 MachineType mach,
548 DataBlock& data)
549{
550 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
551}
552
553void
554GPUCoalescer::readCallback(Addr address,
555 MachineType mach,
556 DataBlock& data,
557 Cycles initialRequestTime,
558 Cycles forwardRequestTime,
559 Cycles firstResponseTime)
560{
561
562 readCallback(address, mach, data,
563 initialRequestTime, forwardRequestTime, firstResponseTime,
564 false);
565}
566
567void
568GPUCoalescer::readCallback(Addr address,
569 MachineType mach,
570 DataBlock& data,
571 Cycles initialRequestTime,
572 Cycles forwardRequestTime,
573 Cycles firstResponseTime,
574 bool isRegion)
575{
576 assert(address == makeLineAddress(address));
577 assert(m_readRequestTable.count(makeLineAddress(address)));
578
579 DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
580 RequestTable::iterator i = m_readRequestTable.find(address);
581 assert(i != m_readRequestTable.end());
582 GPUCoalescerRequest* request = i->second;
583
584 m_readRequestTable.erase(i);
585 markRemoved();
586
587 assert((request->m_type == RubyRequestType_LD) ||
588 (request->m_type == RubyRequestType_IFETCH));
589
590 hitCallback(request, mach, data, true,
591 request->issue_time, forwardRequestTime, firstResponseTime,
592 isRegion);
593}
594
595void
596GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
597 MachineType mach,
598 DataBlock& data,
599 bool success,
600 Cycles initialRequestTime,
601 Cycles forwardRequestTime,
602 Cycles firstResponseTime,
603 bool isRegion)
604{
605 PacketPtr pkt = srequest->pkt;
606 Addr request_address = pkt->getAddr();
607 Addr request_line_address = makeLineAddress(request_address);
608
609 RubyRequestType type = srequest->m_type;
610
611 // Set this cache entry to the most recently used
612 if (type == RubyRequestType_IFETCH) {
613 if (m_instCache_ptr->isTagPresent(request_line_address))
614 m_instCache_ptr->setMRU(request_line_address);
615 } else {
616 if (m_dataCache_ptr->isTagPresent(request_line_address))
617 m_dataCache_ptr->setMRU(request_line_address);
618 }
619
620 recordMissLatency(srequest, mach,
621 initialRequestTime,
622 forwardRequestTime,
623 firstResponseTime,
624 success, isRegion);
625 // update the data
626 //
627 // MUST DO THIS FOR EACH REQUEST IN THE COALESCER
628 int len = reqCoalescer[request_line_address].size();
629 std::vector<PacketPtr> mylist;
630 for (int i = 0; i < len; ++i) {
631 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
632 assert(type == reqCoalescer[request_line_address][i].primaryType);
633 request_address = pkt->getAddr();
634 request_line_address = makeLineAddress(pkt->getAddr());
635 if (pkt->getPtr<uint8_t>()) {
636 if ((type == RubyRequestType_LD) ||
637 (type == RubyRequestType_ATOMIC) ||
638 (type == RubyRequestType_ATOMIC_RETURN) ||
639 (type == RubyRequestType_IFETCH) ||
640 (type == RubyRequestType_RMW_Read) ||
641 (type == RubyRequestType_Locked_RMW_Read) ||
642 (type == RubyRequestType_Load_Linked)) {
643 pkt->setData(
644 data.getData(getOffset(request_address), pkt->getSize()));
645 } else {
646 data.setData(pkt->getPtr<uint8_t>(),
647 getOffset(request_address), pkt->getSize());
648 }
649 } else {
650 DPRINTF(MemoryAccess,
651 "WARNING. Data not transfered from Ruby to M5 for type " \
652 "%s\n",
653 RubyRequestType_to_string(type));
654 }
655
656 // If using the RubyTester, update the RubyTester sender state's
657 // subBlock with the received data. The tester will later access
658 // this state.
659 // Note: RubyPort will access its sender state before the
660 // RubyTester.
661 if (m_usingRubyTester) {
662 RubyPort::SenderState *requestSenderState =
663 safe_cast<RubyPort::SenderState*>(pkt->senderState);
664 RubyTester::SenderState* testerSenderState =
665 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
666 testerSenderState->subBlock.mergeFrom(data);
667 }
668
669 mylist.push_back(pkt);
670 }
671 delete srequest;
672 reqCoalescer.erase(request_line_address);
673 assert(!reqCoalescer.count(request_line_address));
674
675
676
677 completeHitCallback(mylist, len);
678}
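// A standalone sketch (not gem5 code) of the per-packet data movement in
// hitCallback() above: every access coalesced onto the completed cache line
// either copies its bytes out of the line's data block (loads, RMW reads,
// atomics that return data) or copies its bytes into it (stores). The
// CoalescedAccess type and serviceLine function are illustrative; a real
// Ruby line is RubySystem::getBlockSizeBytes() wide.
#include <cstdint>
#include <cstring>
#include <vector>

struct CoalescedAccess
{
    uint32_t offset;   // byte offset within the cache line
    uint32_t size;     // access size in bytes
    bool isRead;       // true for load-like accesses
    uint8_t *buffer;   // the packet's data pointer
};

void serviceLine(uint8_t *line, const std::vector<CoalescedAccess> &accesses)
{
    for (const auto &a : accesses) {
        if (a.isRead) {
            std::memcpy(a.buffer, line + a.offset, a.size);  // line -> packet
        } else {
            std::memcpy(line + a.offset, a.buffer, a.size);  // packet -> line
        }
    }
}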
679
680bool
681GPUCoalescer::empty() const
682{
683 return m_writeRequestTable.empty() && m_readRequestTable.empty();
684}
685
686 // Analyzes the packet to see if this request can be coalesced.
687 // If the request can be coalesced, it is added to the reqCoalescer table
688 // and makeRequest returns RequestStatus_Issued.
689 // If this is the first request to a cacheline, the request is added to both
690 // the newRequests queue and the reqCoalescer table; makeRequest
691 // returns RequestStatus_Issued.
692 // If there is a pending request to this cacheline and this request
693 // can't be coalesced, RequestStatus_Aliased is returned and
694 // the packet needs to be reissued.
695RequestStatus
696GPUCoalescer::makeRequest(PacketPtr pkt)
697{
698 // Check for GPU Barrier Kernel End or Kernel Begin
699 // Leave these to be handled by the child class
700 // Kernel End/Barrier = isFlush + isRelease
701 // Kernel Begin = isFlush + isAcquire
702 if (pkt->req->isKernel()) {
703 if (pkt->req->isAcquire()) {
704 // This is a Kernel Begin; leave handling to the
705 // virtual xCoalescer::makeRequest.
706 return RequestStatus_Issued;
707 } else if (pkt->req->isRelease()) {
708 // This is a Kernel End; leave handling to the
709 // virtual xCoalescer::makeRequest.
710 // If we are here then we didn't call
711 // a virtual version of this function,
712 // so we will also schedule the callback.
713 int wf_id = 0;
714 if (pkt->req->hasContextId()) {
715 wf_id = pkt->req->contextId();
716 }
717 insertKernel(wf_id, pkt);
718 newKernelEnds.push_back(wf_id);
719 if (!issueEvent.scheduled()) {
720 schedule(issueEvent, curTick());
721 }
722 return RequestStatus_Issued;
723 }
724 }
725
726 // If the number of outstanding requests is greater than the max allowed,
727 // return RequestStatus_BufferFull. This logic can be extended to
728 // support proper backpressure.
729 if (m_outstanding_count >= m_max_outstanding_requests) {
730 return RequestStatus_BufferFull;
731 }
732
733 RubyRequestType primary_type = RubyRequestType_NULL;
734 RubyRequestType secondary_type = RubyRequestType_NULL;
735
736 if (pkt->isLLSC()) {
737 //
738 // Alpha LL/SC instructions need to be handled carefully by the cache
739 // coherence protocol to ensure they follow the proper semantics. In
740 // particular, by identifying the operations as atomic, the protocol
741 // should understand that migratory sharing optimizations should not
742 // be performed (i.e. a load between the LL and SC should not steal
743 // away exclusive permission).
744 //
745 if (pkt->isWrite()) {
746 primary_type = RubyRequestType_Store_Conditional;
747 } else {
748 assert(pkt->isRead());
749 primary_type = RubyRequestType_Load_Linked;
750 }
751 secondary_type = RubyRequestType_ATOMIC;
752 } else if (pkt->req->isLockedRMW()) {
753 //
754 // x86 locked instructions are translated to store cache coherence
755 // requests because these requests should always be treated as read
756 // exclusive operations and should leverage any migratory sharing
757 // optimization built into the protocol.
758 //
759 if (pkt->isWrite()) {
760 primary_type = RubyRequestType_Locked_RMW_Write;
761 } else {
762 assert(pkt->isRead());
763 primary_type = RubyRequestType_Locked_RMW_Read;
764 }
765 secondary_type = RubyRequestType_ST;
766 } else if (pkt->isAtomicOp()) {
767 //
768 // GPU Atomic Operation
769 //
770 primary_type = RubyRequestType_ATOMIC;
771 secondary_type = RubyRequestType_ATOMIC;
772 } else {
773 if (pkt->isRead()) {
774 if (pkt->req->isInstFetch()) {
775 primary_type = secondary_type = RubyRequestType_IFETCH;
776 } else {
777#if THE_ISA == X86_ISA
778 uint32_t flags = pkt->req->getFlags();
779 bool storeCheck = flags &
780 (TheISA::StoreCheck << TheISA::FlagShift);
781#else
782 bool storeCheck = false;
783#endif // X86_ISA
784 if (storeCheck) {
785 primary_type = RubyRequestType_RMW_Read;
786 secondary_type = RubyRequestType_ST;
787 } else {
788 primary_type = secondary_type = RubyRequestType_LD;
789 }
790 }
791 } else if (pkt->isWrite()) {
792 //
793 // Note: M5 packets do not differentiate ST from RMW_Write
794 //
795 primary_type = secondary_type = RubyRequestType_ST;
796 } else if (pkt->isFlush()) {
797 primary_type = secondary_type = RubyRequestType_FLUSH;
798 } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
799 if (assumingRfOCoherence) {
800 // If we reached here, this request must be a memFence.
801 // Since the protocol implements RfO, the coalescer can
802 // assume sequential consistency and schedule the callback
803 // immediately.
804 // Currently the code implements fence callbacks
805 // by reusing the mechanism for kernel completions.
806 // This should be fixed.
807 int wf_id = 0;
808 if (pkt->req->hasContextId()) {
809 wf_id = pkt->req->contextId();
810 }
811 insertKernel(wf_id, pkt);
812 newKernelEnds.push_back(wf_id);
813 if (!issueEvent.scheduled()) {
814 schedule(issueEvent, curTick());
815 }
816 return RequestStatus_Issued;
817 } else {
818 // If not RfO, return issued here and let the child coalescer
819 // take care of it.
820 return RequestStatus_Issued;
821 }
822 } else {
823 panic("Unsupported ruby packet type\n");
824 }
825 }
826
827 // Check if there is any pending request to this cache line from
828 // previous cycles.
829 // If there is a pending request, return aliased. Since coalescing
830 // across time is not permitted, aliased requests are not coalesced.
831 // If a request for this address has already been issued, we must block
832 RequestStatus status = getRequestStatus(pkt, primary_type);
833 if (status != RequestStatus_Ready)
834 return status;
835
836 Addr line_addr = makeLineAddress(pkt->getAddr());
837
838 // Check if this request can be coalesced with previous
839 // requests from this cycle.
840 if (!reqCoalescer.count(line_addr)) {
841 // This is the first access to this cache line.
842 // A new request to the memory subsystem has to be
843 // made in the next cycle for this cache line, so
844 // add this line addr to the "newRequests" queue
845 newRequests.push_back(line_addr);
846
847 // There was a request to this cache line in this cycle,
848 // let us see if we can coalesce this request with the previous
849 // requests from this cycle
850 } else if (primary_type !=
851 reqCoalescer[line_addr][0].primaryType) {
852 // can't coalesce loads, stores and atomics!
853 return RequestStatus_Aliased;
854 } else if (pkt->req->isLockedRMW() ||
855 reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
856 // can't coalesce locked accesses, but can coalesce atomics!
857 return RequestStatus_Aliased;
858 } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
859 pkt->req->contextId() !=
860 reqCoalescer[line_addr][0].pkt->req->contextId()) {
861 // can't coalesce releases from different wavefronts
862 return RequestStatus_Aliased;
863 }
864
865 // in addition to the packet, we need to save both request types
866 reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
867 if (!issueEvent.scheduled())
868 schedule(issueEvent, curTick());
869 // TODO: issue hardware prefetches here
870 return RequestStatus_Issued;
871}
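// A minimal, standalone sketch (not gem5 code) of the same-cycle coalescing
// rule in makeRequest() above: the first access to a line opens a new
// coalescing slot and queues the line for issue next cycle; later accesses
// in the same cycle join the slot only if they have the same primary request
// type and neither side is a locked RMW, otherwise they are reported as
// aliased and must be reissued. (The extra check on releases from different
// wavefronts is omitted here.) All names below are illustrative.
#include <cstdint>
#include <map>
#include <vector>

enum class ReqType { Load, Store, Atomic };
enum class CoalesceResult { Issued, Aliased };

struct PendingAccess { ReqType type; bool lockedRMW; };

CoalesceResult
tryCoalesce(std::map<uint64_t, std::vector<PendingAccess>> &table,
            std::vector<uint64_t> &newLines,
            uint64_t lineAddr, PendingAccess req)
{
    auto it = table.find(lineAddr);
    if (it == table.end()) {
        // First access to this line this cycle: a new memory request will
        // be issued for it in the next cycle.
        newLines.push_back(lineAddr);
    } else if (req.type != it->second.front().type ||
               req.lockedRMW || it->second.front().lockedRMW) {
        // Loads, stores and atomics cannot coalesce with each other, and
        // locked accesses never coalesce.
        return CoalesceResult::Aliased;
    }
    table[lineAddr].push_back(req);
    return CoalesceResult::Issued;
}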
872
873void
874GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
875{
876
877 int proc_id = -1;
878 if (pkt != NULL && pkt->req->hasContextId()) {
879 proc_id = pkt->req->contextId();
880 }
881
882 // If valid, copy the pc to the ruby request
883 Addr pc = 0;
884 if (pkt->req->hasPC()) {
885 pc = pkt->req->getPC();
886 }
887
888 // At the moment, setting scopes only counts
889 // for GPU spill space accesses,
890 // which is pkt->req->isStack().
891 // This scope is REPLACE since it
892 // does not need to be flushed at the end
893 // of a kernel. Private and local may need
894 // to be visible at the end of the kernel.
895 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
896 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
897
898 Addr line_addr = makeLineAddress(pkt->getAddr());
899
900 // Creating WriteMask that records written bytes
901 // and atomic operations. This enables partial writes
902 // and partial reads of those writes
903 DataBlock dataBlock;
904 dataBlock.clear();
905 uint32_t blockSize = RubySystem::getBlockSizeBytes();
906 std::vector<bool> accessMask(blockSize,false);
907 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
908 uint32_t tableSize = reqCoalescer[line_addr].size();
909 for (int i = 0; i < tableSize; i++) {
910 PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
911 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
912 uint32_t tmpSize = tmpPkt->getSize();
913 if (tmpPkt->isAtomicOp()) {
914 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
915 tmpPkt->getAtomicOp());
916 atomicOps.push_back(tmpAtomicOp);
917 } else if (tmpPkt->isWrite()) {
918 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
919 tmpOffset, tmpSize);
920 }
921 for (int j = 0; j < tmpSize; j++) {
922 accessMask[tmpOffset + j] = true;
923 }
924 }
925 std::shared_ptr<RubyRequest> msg;
926 if (pkt->isAtomicOp()) {
927 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
928 pkt->getPtr<uint8_t>(),
929 pkt->getSize(), pc, secondary_type,
930 RubyAccessMode_Supervisor, pkt,
931 PrefetchBit_No, proc_id, 100,
932 blockSize, accessMask,
933 dataBlock, atomicOps,
934 accessScope, accessSegment);
935 } else {
936 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
937 pkt->getPtr<uint8_t>(),
938 pkt->getSize(), pc, secondary_type,
939 RubyAccessMode_Supervisor, pkt,
940 PrefetchBit_No, proc_id, 100,
941 blockSize, accessMask,
942 dataBlock,
943 accessScope, accessSegment);
944 }
945 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
946 curTick(), m_version, "Coal", "Begin", "", "",
947 printAddress(msg->getPhysicalAddress()),
948 RubyRequestType_to_string(secondary_type));
949
950 fatal_if(secondary_type == RubyRequestType_IFETCH,
951 "there should not be any I-Fetch requests in the GPU Coalescer");
952
146 m_runningGarnetStandalone = p->garnet_standalone;
147 assumingRfOCoherence = p->assume_rfo;
148}
149
150GPUCoalescer::~GPUCoalescer()
151{
152}
153
154void
155GPUCoalescer::wakeup()
156{
157 // Check for deadlock of any of the requests
158 Cycles current_time = curCycle();
159
160 // Check across all outstanding requests
161 int total_outstanding = 0;
162
163 RequestTable::iterator read = m_readRequestTable.begin();
164 RequestTable::iterator read_end = m_readRequestTable.end();
165 for (; read != read_end; ++read) {
166 GPUCoalescerRequest* request = read->second;
167 if (current_time - request->issue_time < m_deadlock_threshold)
168 continue;
169
170 panic("Possible Deadlock detected. Aborting!\n"
171 "version: %d request.paddr: 0x%x m_readRequestTable: %d "
172 "current time: %u issue_time: %d difference: %d\n", m_version,
173 request->pkt->getAddr(), m_readRequestTable.size(),
174 current_time * clockPeriod(), request->issue_time * clockPeriod(),
175 (current_time - request->issue_time)*clockPeriod());
176 }
177
178 RequestTable::iterator write = m_writeRequestTable.begin();
179 RequestTable::iterator write_end = m_writeRequestTable.end();
180 for (; write != write_end; ++write) {
181 GPUCoalescerRequest* request = write->second;
182 if (current_time - request->issue_time < m_deadlock_threshold)
183 continue;
184
185 panic("Possible Deadlock detected. Aborting!\n"
186 "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
187 "current time: %u issue_time: %d difference: %d\n", m_version,
188 request->pkt->getAddr(), m_writeRequestTable.size(),
189 current_time * clockPeriod(), request->issue_time * clockPeriod(),
190 (current_time - request->issue_time) * clockPeriod());
191 }
192
193 total_outstanding += m_writeRequestTable.size();
194 total_outstanding += m_readRequestTable.size();
195
196 assert(m_outstanding_count == total_outstanding);
197
198 if (m_outstanding_count > 0) {
199 // If there are still outstanding requests, keep checking
200 schedule(deadlockCheckEvent,
201 m_deadlock_threshold * clockPeriod() +
202 curTick());
203 }
204}
205
206void
207GPUCoalescer::resetStats()
208{
209 m_latencyHist.reset();
210 m_missLatencyHist.reset();
211 for (int i = 0; i < RubyRequestType_NUM; i++) {
212 m_typeLatencyHist[i]->reset();
213 m_missTypeLatencyHist[i]->reset();
214 for (int j = 0; j < MachineType_NUM; j++) {
215 m_missTypeMachLatencyHist[i][j]->reset();
216 }
217 }
218
219 for (int i = 0; i < MachineType_NUM; i++) {
220 m_missMachLatencyHist[i]->reset();
221
222 m_IssueToInitialDelayHist[i]->reset();
223 m_InitialToForwardDelayHist[i]->reset();
224 m_ForwardToFirstResponseDelayHist[i]->reset();
225 m_FirstResponseToCompletionDelayHist[i]->reset();
226 }
227}
228
229void
230GPUCoalescer::printProgress(ostream& out) const
231{
232}
233
234RequestStatus
235GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
236{
237 Addr line_addr = makeLineAddress(pkt->getAddr());
238
239 if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
240 return RequestStatus_BufferFull;
241 }
242
243 if (m_controller->isBlocked(line_addr) &&
244 request_type != RubyRequestType_Locked_RMW_Write) {
245 return RequestStatus_Aliased;
246 }
247
248 if ((request_type == RubyRequestType_ST) ||
249 (request_type == RubyRequestType_ATOMIC) ||
250 (request_type == RubyRequestType_ATOMIC_RETURN) ||
251 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
252 (request_type == RubyRequestType_RMW_Read) ||
253 (request_type == RubyRequestType_RMW_Write) ||
254 (request_type == RubyRequestType_Load_Linked) ||
255 (request_type == RubyRequestType_Store_Conditional) ||
256 (request_type == RubyRequestType_Locked_RMW_Read) ||
257 (request_type == RubyRequestType_Locked_RMW_Write) ||
258 (request_type == RubyRequestType_FLUSH)) {
259
260 // Check if there is any outstanding read request for the same
261 // cache line.
262 if (m_readRequestTable.count(line_addr) > 0) {
263 m_store_waiting_on_load_cycles++;
264 return RequestStatus_Aliased;
265 }
266
267 if (m_writeRequestTable.count(line_addr) > 0) {
268 // There is an outstanding write request for the cache line
269 m_store_waiting_on_store_cycles++;
270 return RequestStatus_Aliased;
271 }
272 } else {
273 // Check if there is any outstanding write request for the same
274 // cache line.
275 if (m_writeRequestTable.count(line_addr) > 0) {
276 m_load_waiting_on_store_cycles++;
277 return RequestStatus_Aliased;
278 }
279
280 if (m_readRequestTable.count(line_addr) > 0) {
281 // There is an outstanding read request for the cache line
282 m_load_waiting_on_load_cycles++;
283 return RequestStatus_Aliased;
284 }
285 }
286
287 return RequestStatus_Ready;
288
289}
290
291
292
293// sets the kernelEndList
294void
295GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
296{
297 // Don't know if this will happen or is possible
298 // but I just want to be careful and not have it become
299 // simulator hang in the future
300 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
301 assert(kernelEndList.count(wavefront_id) == 0);
302
303 kernelEndList[wavefront_id] = pkt;
304 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
305 kernelEndList.size());
306}
307
308
309// Insert the request on the correct request table. Return true if
310// the entry was already present.
311bool
312GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
313{
314 assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
315 pkt->req->isLockedRMW() ||
316 !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
317
318 int total_outstanding M5_VAR_USED =
319 m_writeRequestTable.size() + m_readRequestTable.size();
320
321 assert(m_outstanding_count == total_outstanding);
322
323 // See if we should schedule a deadlock check
324 if (!deadlockCheckEvent.scheduled()) {
325 schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
326 }
327
328 Addr line_addr = makeLineAddress(pkt->getAddr());
329 if ((request_type == RubyRequestType_ST) ||
330 (request_type == RubyRequestType_ATOMIC) ||
331 (request_type == RubyRequestType_ATOMIC_RETURN) ||
332 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
333 (request_type == RubyRequestType_RMW_Read) ||
334 (request_type == RubyRequestType_RMW_Write) ||
335 (request_type == RubyRequestType_Load_Linked) ||
336 (request_type == RubyRequestType_Store_Conditional) ||
337 (request_type == RubyRequestType_Locked_RMW_Read) ||
338 (request_type == RubyRequestType_Locked_RMW_Write) ||
339 (request_type == RubyRequestType_FLUSH)) {
340
341 pair<RequestTable::iterator, bool> r =
342 m_writeRequestTable.insert(RequestTable::value_type(line_addr,
343 (GPUCoalescerRequest*) NULL));
344 if (r.second) {
345 RequestTable::iterator i = r.first;
346 i->second = new GPUCoalescerRequest(pkt, request_type,
347 curCycle());
348 DPRINTF(GPUCoalescer,
349 "Inserting write request for paddr %#x for type %d\n",
350 pkt->req->getPaddr(), i->second->m_type);
351 m_outstanding_count++;
352 } else {
353 return true;
354 }
355 } else {
356 pair<RequestTable::iterator, bool> r =
357 m_readRequestTable.insert(RequestTable::value_type(line_addr,
358 (GPUCoalescerRequest*) NULL));
359
360 if (r.second) {
361 RequestTable::iterator i = r.first;
362 i->second = new GPUCoalescerRequest(pkt, request_type,
363 curCycle());
364 DPRINTF(GPUCoalescer,
365 "Inserting read request for paddr %#x for type %d\n",
366 pkt->req->getPaddr(), i->second->m_type);
367 m_outstanding_count++;
368 } else {
369 return true;
370 }
371 }
372
373 m_outstandReqHist.sample(m_outstanding_count);
374
375 total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
376 assert(m_outstanding_count == total_outstanding);
377
378 return false;
379}
380
381void
382GPUCoalescer::markRemoved()
383{
384 m_outstanding_count--;
385 assert(m_outstanding_count ==
386 m_writeRequestTable.size() + m_readRequestTable.size());
387}
388
389void
390GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
391{
392 assert(m_outstanding_count ==
393 m_writeRequestTable.size() + m_readRequestTable.size());
394
395 Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
396 if ((srequest->m_type == RubyRequestType_ST) ||
397 (srequest->m_type == RubyRequestType_RMW_Read) ||
398 (srequest->m_type == RubyRequestType_RMW_Write) ||
399 (srequest->m_type == RubyRequestType_Load_Linked) ||
400 (srequest->m_type == RubyRequestType_Store_Conditional) ||
401 (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
402 (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
403 m_writeRequestTable.erase(line_addr);
404 } else {
405 m_readRequestTable.erase(line_addr);
406 }
407
408 markRemoved();
409}
410
411bool
412GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
413{
414 //
415 // The success flag indicates whether the LLSC operation was successful.
416 // LL ops will always succeed, but SC may fail if the cache line is no
417 // longer locked.
418 //
419 bool success = true;
420 if (request->m_type == RubyRequestType_Store_Conditional) {
421 if (!m_dataCache_ptr->isLocked(address, m_version)) {
422 //
423 // For failed SC requests, indicate the failure to the cpu by
424 // setting the extra data to zero.
425 //
426 request->pkt->req->setExtraData(0);
427 success = false;
428 } else {
429 //
430 // For successful SC requests, indicate the success to the cpu by
431 // setting the extra data to one.
432 //
433 request->pkt->req->setExtraData(1);
434 }
435 //
436 // Independent of success, all SC operations must clear the lock
437 //
438 m_dataCache_ptr->clearLocked(address);
439 } else if (request->m_type == RubyRequestType_Load_Linked) {
440 //
441 // Note: To fully follow Alpha LLSC semantics, should the LL clear any
442 // previously locked cache lines?
443 //
444 m_dataCache_ptr->setLocked(address, m_version);
445 } else if ((m_dataCache_ptr->isTagPresent(address)) &&
446 (m_dataCache_ptr->isLocked(address, m_version))) {
447 //
448 // Normal writes should clear the locked address
449 //
450 m_dataCache_ptr->clearLocked(address);
451 }
452 return success;
453}
454
455void
456GPUCoalescer::writeCallback(Addr address, DataBlock& data)
457{
458 writeCallback(address, MachineType_NULL, data);
459}
460
461void
462GPUCoalescer::writeCallback(Addr address,
463 MachineType mach,
464 DataBlock& data)
465{
466 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
467}
468
469void
470GPUCoalescer::writeCallback(Addr address,
471 MachineType mach,
472 DataBlock& data,
473 Cycles initialRequestTime,
474 Cycles forwardRequestTime,
475 Cycles firstResponseTime)
476{
477 writeCallback(address, mach, data,
478 initialRequestTime, forwardRequestTime, firstResponseTime,
479 false);
480}
481
482void
483GPUCoalescer::writeCallback(Addr address,
484 MachineType mach,
485 DataBlock& data,
486 Cycles initialRequestTime,
487 Cycles forwardRequestTime,
488 Cycles firstResponseTime,
489 bool isRegion)
490{
491 assert(address == makeLineAddress(address));
492
493 DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
494 assert(m_writeRequestTable.count(makeLineAddress(address)));
495
496 RequestTable::iterator i = m_writeRequestTable.find(address);
497 assert(i != m_writeRequestTable.end());
498 GPUCoalescerRequest* request = i->second;
499
500 m_writeRequestTable.erase(i);
501 markRemoved();
502
503 assert((request->m_type == RubyRequestType_ST) ||
504 (request->m_type == RubyRequestType_ATOMIC) ||
505 (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
506 (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
507 (request->m_type == RubyRequestType_RMW_Read) ||
508 (request->m_type == RubyRequestType_RMW_Write) ||
509 (request->m_type == RubyRequestType_Load_Linked) ||
510 (request->m_type == RubyRequestType_Store_Conditional) ||
511 (request->m_type == RubyRequestType_Locked_RMW_Read) ||
512 (request->m_type == RubyRequestType_Locked_RMW_Write) ||
513 (request->m_type == RubyRequestType_FLUSH));
514
515
516 //
517 // For Alpha, properly handle LL, SC, and write requests with respect to
518 // locked cache blocks.
519 //
520 // Not valid for Garnet_standalone protocl
521 //
522 bool success = true;
523 if (!m_runningGarnetStandalone)
524 success = handleLlsc(address, request);
525
526 if (request->m_type == RubyRequestType_Locked_RMW_Read) {
527 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
528 } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
529 m_controller->unblock(address);
530 }
531
532 hitCallback(request, mach, data, success,
533 request->issue_time, forwardRequestTime, firstResponseTime,
534 isRegion);
535}
536
537void
538GPUCoalescer::readCallback(Addr address, DataBlock& data)
539{
540 readCallback(address, MachineType_NULL, data);
541}
542
543void
544GPUCoalescer::readCallback(Addr address,
545 MachineType mach,
546 DataBlock& data)
547{
548 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
549}
550
551void
552GPUCoalescer::readCallback(Addr address,
553 MachineType mach,
554 DataBlock& data,
555 Cycles initialRequestTime,
556 Cycles forwardRequestTime,
557 Cycles firstResponseTime)
558{
559
560 readCallback(address, mach, data,
561 initialRequestTime, forwardRequestTime, firstResponseTime,
562 false);
563}
564
565void
566GPUCoalescer::readCallback(Addr address,
567 MachineType mach,
568 DataBlock& data,
569 Cycles initialRequestTime,
570 Cycles forwardRequestTime,
571 Cycles firstResponseTime,
572 bool isRegion)
573{
574 assert(address == makeLineAddress(address));
575 assert(m_readRequestTable.count(makeLineAddress(address)));
576
577 DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
578 RequestTable::iterator i = m_readRequestTable.find(address);
579 assert(i != m_readRequestTable.end());
580 GPUCoalescerRequest* request = i->second;
581
582 m_readRequestTable.erase(i);
583 markRemoved();
584
585 assert((request->m_type == RubyRequestType_LD) ||
586 (request->m_type == RubyRequestType_IFETCH));
587
588 hitCallback(request, mach, data, true,
589 request->issue_time, forwardRequestTime, firstResponseTime,
590 isRegion);
591}
592
593void
594GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
595 MachineType mach,
596 DataBlock& data,
597 bool success,
598 Cycles initialRequestTime,
599 Cycles forwardRequestTime,
600 Cycles firstResponseTime,
601 bool isRegion)
602{
603 PacketPtr pkt = srequest->pkt;
604 Addr request_address = pkt->getAddr();
605 Addr request_line_address = makeLineAddress(request_address);
606
607 RubyRequestType type = srequest->m_type;
608
609 // Set this cache entry to the most recently used
610 if (type == RubyRequestType_IFETCH) {
611 if (m_instCache_ptr->isTagPresent(request_line_address))
612 m_instCache_ptr->setMRU(request_line_address);
613 } else {
614 if (m_dataCache_ptr->isTagPresent(request_line_address))
615 m_dataCache_ptr->setMRU(request_line_address);
616 }
617
618 recordMissLatency(srequest, mach,
619 initialRequestTime,
620 forwardRequestTime,
621 firstResponseTime,
622 success, isRegion);
623 // update the data
624 //
625 // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER
626 int len = reqCoalescer[request_line_address].size();
627 std::vector<PacketPtr> mylist;
628 for (int i = 0; i < len; ++i) {
629 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
630 assert(type == reqCoalescer[request_line_address][i].primaryType);
631 request_address = pkt->getAddr();
632 request_line_address = makeLineAddress(pkt->getAddr());
633 if (pkt->getPtr<uint8_t>()) {
634 if ((type == RubyRequestType_LD) ||
635 (type == RubyRequestType_ATOMIC) ||
636 (type == RubyRequestType_ATOMIC_RETURN) ||
637 (type == RubyRequestType_IFETCH) ||
638 (type == RubyRequestType_RMW_Read) ||
639 (type == RubyRequestType_Locked_RMW_Read) ||
640 (type == RubyRequestType_Load_Linked)) {
641 pkt->setData(
642 data.getData(getOffset(request_address), pkt->getSize()));
643 } else {
644 data.setData(pkt->getPtr<uint8_t>(),
645 getOffset(request_address), pkt->getSize());
646 }
647 } else {
648 DPRINTF(MemoryAccess,
649 "WARNING. Data not transfered from Ruby to M5 for type " \
650 "%s\n",
651 RubyRequestType_to_string(type));
652 }
653
654 // If using the RubyTester, update the RubyTester sender state's
655 // subBlock with the recieved data. The tester will later access
656 // this state.
657 // Note: RubyPort will access it's sender state before the
658 // RubyTester.
659 if (m_usingRubyTester) {
660 RubyPort::SenderState *requestSenderState =
661 safe_cast<RubyPort::SenderState*>(pkt->senderState);
662 RubyTester::SenderState* testerSenderState =
663 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
664 testerSenderState->subBlock.mergeFrom(data);
665 }
666
667 mylist.push_back(pkt);
668 }
669 delete srequest;
670 reqCoalescer.erase(request_line_address);
671 assert(!reqCoalescer.count(request_line_address));
672
673
674
675 completeHitCallback(mylist, len);
676}
677
678bool
679GPUCoalescer::empty() const
680{
681 return m_writeRequestTable.empty() && m_readRequestTable.empty();
682}
683
684// Analyzes the packet to see if this request can be coalesced.
685// If request can be coalesced, this request is added to the reqCoalescer table
686// and makeRequest returns RequestStatus_Issued;
687// If this is the first request to a cacheline, request is added to both
688// newRequests queue and to the reqCoalescer table; makeRequest
689// returns RequestStatus_Issued.
690// If there is a pending request to this cacheline and this request
691// can't be coalesced, RequestStatus_Aliased is returned and
692// the packet needs to be reissued.
693RequestStatus
694GPUCoalescer::makeRequest(PacketPtr pkt)
695{
696 // Check for GPU Barrier Kernel End or Kernel Begin
697 // Leave these to be handled by the child class
698 // Kernel End/Barrier = isFlush + isRelease
699 // Kernel Begin = isFlush + isAcquire
700 if (pkt->req->isKernel()) {
701 if (pkt->req->isAcquire()){
702 // This is a Kernel Begin leave handling to
703 // virtual xCoalescer::makeRequest
704 return RequestStatus_Issued;
705 }else if (pkt->req->isRelease()) {
706 // This is a Kernel End leave handling to
707 // virtual xCoalescer::makeRequest
708 // If we are here then we didn't call
709 // a virtual version of this function
710 // so we will also schedule the callback
711 int wf_id = 0;
712 if (pkt->req->hasContextId()) {
713 wf_id = pkt->req->contextId();
714 }
715 insertKernel(wf_id, pkt);
716 newKernelEnds.push_back(wf_id);
717 if (!issueEvent.scheduled()) {
718 schedule(issueEvent, curTick());
719 }
720 return RequestStatus_Issued;
721 }
722 }
723
724 // If number of outstanding requests greater than the max allowed,
725 // return RequestStatus_BufferFull. This logic can be extended to
726 // support proper backpressure.
727 if (m_outstanding_count >= m_max_outstanding_requests) {
728 return RequestStatus_BufferFull;
729 }
730
731 RubyRequestType primary_type = RubyRequestType_NULL;
732 RubyRequestType secondary_type = RubyRequestType_NULL;
733
734 if (pkt->isLLSC()) {
735 //
736 // Alpha LL/SC instructions need to be handled carefully by the cache
737 // coherence protocol to ensure they follow the proper semantics. In
738 // particular, by identifying the operations as atomic, the protocol
739 // should understand that migratory sharing optimizations should not
740 // be performed (i.e. a load between the LL and SC should not steal
741 // away exclusive permission).
742 //
743 if (pkt->isWrite()) {
744 primary_type = RubyRequestType_Store_Conditional;
745 } else {
746 assert(pkt->isRead());
747 primary_type = RubyRequestType_Load_Linked;
748 }
749 secondary_type = RubyRequestType_ATOMIC;
750 } else if (pkt->req->isLockedRMW()) {
751 //
752 // x86 locked instructions are translated to store cache coherence
753 // requests because these requests should always be treated as read
754 // exclusive operations and should leverage any migratory sharing
755 // optimization built into the protocol.
756 //
757 if (pkt->isWrite()) {
758 primary_type = RubyRequestType_Locked_RMW_Write;
759 } else {
760 assert(pkt->isRead());
761 primary_type = RubyRequestType_Locked_RMW_Read;
762 }
763 secondary_type = RubyRequestType_ST;
764 } else if (pkt->isAtomicOp()) {
765 //
766 // GPU Atomic Operation
767 //
768 primary_type = RubyRequestType_ATOMIC;
769 secondary_type = RubyRequestType_ATOMIC;
770 } else {
771 if (pkt->isRead()) {
772 if (pkt->req->isInstFetch()) {
773 primary_type = secondary_type = RubyRequestType_IFETCH;
774 } else {
775#if THE_ISA == X86_ISA
776 uint32_t flags = pkt->req->getFlags();
777 bool storeCheck = flags &
778 (TheISA::StoreCheck << TheISA::FlagShift);
779#else
780 bool storeCheck = false;
781#endif // X86_ISA
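                // Editorial note (assumption): StoreCheck appears to mark
                // x86 loads that must also be checked for store permission,
                // which is why such reads are issued as RMW_Read with an ST
                // secondary type rather than as plain loads.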
782 if (storeCheck) {
783 primary_type = RubyRequestType_RMW_Read;
784 secondary_type = RubyRequestType_ST;
785 } else {
786 primary_type = secondary_type = RubyRequestType_LD;
787 }
788 }
789 } else if (pkt->isWrite()) {
790 //
791 // Note: M5 packets do not differentiate ST from RMW_Write
792 //
793 primary_type = secondary_type = RubyRequestType_ST;
794 } else if (pkt->isFlush()) {
795 primary_type = secondary_type = RubyRequestType_FLUSH;
796 } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
797 if (assumingRfOCoherence) {
798                // If we reached here, this request must be a memFence
799                // and the protocol implements RfO, so the coalescer can
800                // assume sequential consistency and schedule the
801                // callback immediately.
802 // Currently the code implements fence callbacks
803 // by reusing the mechanism for kernel completions.
804 // This should be fixed.
805 int wf_id = 0;
806 if (pkt->req->hasContextId()) {
807 wf_id = pkt->req->contextId();
808 }
809 insertKernel(wf_id, pkt);
810 newKernelEnds.push_back(wf_id);
811 if (!issueEvent.scheduled()) {
812 schedule(issueEvent, curTick());
813 }
814 return RequestStatus_Issued;
815 } else {
816 // If not RfO, return issued here and let the child coalescer
817 // take care of it.
818 return RequestStatus_Issued;
819 }
820 } else {
821 panic("Unsupported ruby packet type\n");
822 }
823 }
824
825 // Check if there is any pending request to this cache line from
826 // previous cycles.
827 // If there is a pending request, return aliased. Since coalescing
828 // across time is not permitted, aliased requests are not coalesced.
829    // If a request for this address has already been issued, we must block.
830 RequestStatus status = getRequestStatus(pkt, primary_type);
831 if (status != RequestStatus_Ready)
832 return status;
833
834 Addr line_addr = makeLineAddress(pkt->getAddr());
835
836 // Check if this request can be coalesced with previous
837 // requests from this cycle.
838 if (!reqCoalescer.count(line_addr)) {
839 // This is the first access to this cache line.
840 // A new request to the memory subsystem has to be
841 // made in the next cycle for this cache line, so
842 // add this line addr to the "newRequests" queue
843 newRequests.push_back(line_addr);
844
845 // There was a request to this cache line in this cycle,
846 // let us see if we can coalesce this request with the previous
847 // requests from this cycle
848 } else if (primary_type !=
849 reqCoalescer[line_addr][0].primaryType) {
850 // can't coalesce loads, stores and atomics!
851 return RequestStatus_Aliased;
852 } else if (pkt->req->isLockedRMW() ||
853 reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
854 // can't coalesce locked accesses, but can coalesce atomics!
855 return RequestStatus_Aliased;
856 } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
857 pkt->req->contextId() !=
858 reqCoalescer[line_addr][0].pkt->req->contextId()) {
859 // can't coalesce releases from different wavefronts
860 return RequestStatus_Aliased;
861 }
862
863 // in addition to the packet, we need to save both request types
864 reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
865 if (!issueEvent.scheduled())
866 schedule(issueEvent, curTick());
867 // TODO: issue hardware prefetches here
868 return RequestStatus_Issued;
869}
870
871void
872GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
873{
874
875 int proc_id = -1;
876 if (pkt != NULL && pkt->req->hasContextId()) {
877 proc_id = pkt->req->contextId();
878 }
879
880 // If valid, copy the pc to the ruby request
881 Addr pc = 0;
882 if (pkt->req->hasPC()) {
883 pc = pkt->req->getPC();
884 }
885
886    // At the moment, setting scopes only counts for GPU
887    // spill space accesses, i.e., those for which
888    // pkt->req->isStack() is true. This scope is REPLACE,
889    // since spill space does not need to be flushed at the
890    // end of a kernel. Private and local segments, in
891    // contrast, may need to be visible at the end of the
892    // kernel.
893 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
894 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
895
896 Addr line_addr = makeLineAddress(pkt->getAddr());
897
898    // Create a write mask that records written bytes
899    // and atomic operations. This enables partial writes
900    // and partial reads of those writes.
901 DataBlock dataBlock;
902 dataBlock.clear();
903 uint32_t blockSize = RubySystem::getBlockSizeBytes();
904 std::vector<bool> accessMask(blockSize,false);
905 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
906 uint32_t tableSize = reqCoalescer[line_addr].size();
907 for (int i = 0; i < tableSize; i++) {
908 PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
909 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
910 uint32_t tmpSize = tmpPkt->getSize();
911 if (tmpPkt->isAtomicOp()) {
912 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
913 tmpPkt->getAtomicOp());
914 atomicOps.push_back(tmpAtomicOp);
915 } else if (tmpPkt->isWrite()) {
916 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
917 tmpOffset, tmpSize);
918 }
919 for (int j = 0; j < tmpSize; j++) {
920 accessMask[tmpOffset + j] = true;
921 }
922 }
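    // Worked example (editorial, assuming a 64-byte block): a coalesced
    // 4-byte store to 0x1008 on line 0x1000 has tmpOffset = 8 and
    // tmpSize = 4, so bytes 8..11 of dataBlock are filled from the packet
    // and accessMask[8..11] are set; a coalesced atomic at that address
    // would instead be recorded as the pair (8, AtomicOpFunctor*) in
    // atomicOps, with the same accessMask bits set.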
923 std::shared_ptr<RubyRequest> msg;
924 if (pkt->isAtomicOp()) {
925 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
926 pkt->getPtr<uint8_t>(),
927 pkt->getSize(), pc, secondary_type,
928 RubyAccessMode_Supervisor, pkt,
929 PrefetchBit_No, proc_id, 100,
930 blockSize, accessMask,
931 dataBlock, atomicOps,
932 accessScope, accessSegment);
933 } else {
934 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
935 pkt->getPtr<uint8_t>(),
936 pkt->getSize(), pc, secondary_type,
937 RubyAccessMode_Supervisor, pkt,
938 PrefetchBit_No, proc_id, 100,
939 blockSize, accessMask,
940 dataBlock,
941 accessScope, accessSegment);
942 }
943 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
944 curTick(), m_version, "Coal", "Begin", "", "",
945 printAddress(msg->getPhysicalAddress()),
946 RubyRequestType_to_string(secondary_type));
947
948 fatal_if(secondary_type == RubyRequestType_IFETCH,
949 "there should not be any I-Fetch requests in the GPU Coalescer");
950
951    Tick latency = cyclesToTicks(
952        m_controller->mandatoryQueueLatency(secondary_type));
953    assert(latency > 0);
954
955    assert(m_mandatory_q_ptr);
956    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
957}
958
959template <class KEY, class VALUE>
960std::ostream &
961operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
962{
963 out << "[";
964 for (auto i = map.begin(); i != map.end(); ++i)
965 out << " " << i->first << "=" << i->second;
966 out << " ]";
967
968 return out;
969}
970
971void
972GPUCoalescer::print(ostream& out) const
973{
974 out << "[GPUCoalescer: " << m_version
975 << ", outstanding requests: " << m_outstanding_count
976 << ", read request table: " << m_readRequestTable
977 << ", write request table: " << m_writeRequestTable
978 << "]";
979}
980
981// This can be called from setState whenever coherence permissions are
982// upgraded; when invoked, coherence violations will be checked for the
983// given block.
984void
985GPUCoalescer::checkCoherence(Addr addr)
986{
987#ifdef CHECK_COHERENCE
988 m_ruby_system->checkGlobalCoherenceInvariant(addr);
989#endif
990}
991
992void
993GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
994 DPRINTF(RubyStats, "Recorded statistic: %s\n",
995 SequencerRequestType_to_string(requestType));
996}
997
998
999void
1000GPUCoalescer::completeIssue()
1001{
1002 // newRequests has the cacheline addresses of all the
1003 // requests which need to be issued to the memory subsystem
1004 // in this cycle
1005 int len = newRequests.size();
1006 DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1007 for (int i = 0; i < len; ++i) {
1008        // Get the requests from the reqCoalescer table. Get only the
1009        // first request for each cacheline; the remaining requests
1010        // can be coalesced with the first request. So, only
1011        // one request is issued per cacheline.
1012 RequestDesc info = reqCoalescer[newRequests[i]][0];
1013 PacketPtr pkt = info.pkt;
1014 DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1015 i, pkt->req->getPaddr());
1016 // Insert this request to the read/writeRequestTables. These tables
1017 // are used to track aliased requests in makeRequest subroutine
1018 bool found = insertRequest(pkt, info.primaryType);
1019
1020 if (found) {
1021 panic("GPUCoalescer::makeRequest should never be called if the "
1022 "request is already outstanding\n");
1023 }
1024
1025 // Issue request to ruby subsystem
1026 issueRequest(pkt, info.secondaryType);
1027 }
1028 newRequests.clear();
1029
1030    // Have any Kernel End releases been issued this cycle?
1031 len = newKernelEnds.size();
1032 for (int i = 0; i < len; i++) {
1033 kernelCallback(newKernelEnds[i]);
1034 }
1035 newKernelEnds.clear();
1036}
1037
1038void
1039GPUCoalescer::evictionCallback(Addr address)
1040{
1041 ruby_eviction_callback(address);
1042}
1043
1044void
1045GPUCoalescer::kernelCallback(int wavefront_id)
1046{
1047 assert(kernelEndList.count(wavefront_id));
1048
1049 ruby_hit_callback(kernelEndList[wavefront_id]);
1050
1051 kernelEndList.erase(wavefront_id);
1052}
1053
1054void
1055GPUCoalescer::atomicCallback(Addr address,
1056 MachineType mach,
1057 const DataBlock& data)
1058{
1059 assert(address == makeLineAddress(address));
1060
1061 DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1062 assert(m_writeRequestTable.count(makeLineAddress(address)));
1063
1064 RequestTable::iterator i = m_writeRequestTable.find(address);
1065 assert(i != m_writeRequestTable.end());
1066 GPUCoalescerRequest* srequest = i->second;
1067
1068 m_writeRequestTable.erase(i);
1069 markRemoved();
1070
1071 assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1072 (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1073 (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
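    // Editorial note: only requests whose packets carry a data pointer and
    // whose type is not ATOMIC_NO_RETURN get the pre-operation memory value
    // copied back in the loop below; the others fall through to the
    // "Data not transferred" debug message instead.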
1074
1075
1076 // Atomics don't write to cache, so there is no MRU update...
1077
1078 recordMissLatency(srequest, mach,
1079 srequest->issue_time, Cycles(0), Cycles(0), true, false);
1080
1081 PacketPtr pkt = srequest->pkt;
1082 Addr request_address = pkt->getAddr();
1083 Addr request_line_address = makeLineAddress(pkt->getAddr());
1084
1085 int len = reqCoalescer[request_line_address].size();
1086 std::vector<PacketPtr> mylist;
1087 for (int i = 0; i < len; ++i) {
1088 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1089 assert(srequest->m_type ==
1090 reqCoalescer[request_line_address][i].primaryType);
1091 request_address = (pkt->getAddr());
1092 request_line_address = makeLineAddress(request_address);
1093 if (pkt->getPtr<uint8_t>() &&
1094 srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1095 /* atomics are done in memory, and return the data *before* the atomic op... */
1096 pkt->setData(
1097 data.getData(getOffset(request_address), pkt->getSize()));
1098 } else {
1099 DPRINTF(MemoryAccess,
1100                    "WARNING. Data not transferred from Ruby to M5 for type " \
1101 "%s\n",
1102 RubyRequestType_to_string(srequest->m_type));
1103 }
1104
1105        // If using the RubyTester, update the RubyTester sender state's
1106        // subBlock with the received data. The tester will later access
1107        // this state.
1108        // Note: RubyPort will access its sender state before the
1109        // RubyTester.
1110 if (m_usingRubyTester) {
1111 RubyPort::SenderState *requestSenderState =
1112 safe_cast<RubyPort::SenderState*>(pkt->senderState);
1113 RubyTester::SenderState* testerSenderState =
1114 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1115 testerSenderState->subBlock.mergeFrom(data);
1116 }
1117
1118 mylist.push_back(pkt);
1119 }
1120 delete srequest;
1121 reqCoalescer.erase(request_line_address);
1122 assert(!reqCoalescer.count(request_line_address));
1123
1124 completeHitCallback(mylist, len);
1125}
1126
1127void
1128GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1129{
1130 if (myMachID == senderMachID) {
1131 CP_TCPLdHits++;
1132 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1133 CP_TCPLdTransfers++;
1134 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1135 CP_TCCLdHits++;
1136 } else {
1137 CP_LdMiss++;
1138 }
1139}
1140
1141void
1142GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1143{
1144 if (myMachID == senderMachID) {
1145 CP_TCPStHits++;
1146 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1147 CP_TCPStTransfers++;
1148 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1149 CP_TCCStHits++;
1150 } else {
1151 CP_StMiss++;
1152 }
1153}
1154
1155void
1156GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1157{
1158 for (int i = 0; i < len; ++i) {
1159 RubyPort::SenderState *ss =
1160 safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1161 MemSlavePort *port = ss->port;
1162 assert(port != NULL);
1163
1164 mylist[i]->senderState = ss->predecessor;
1165 delete ss;
1166 port->hitCallback(mylist[i]);
1167 trySendRetries();
1168 }
1169
1170 testDrainComplete();
1171}
1172
1173PacketPtr
1174GPUCoalescer::mapAddrToPkt(Addr address)
1175{
1176 RequestTable::iterator i = m_readRequestTable.find(address);
1177 assert(i != m_readRequestTable.end());
1178 GPUCoalescerRequest* request = i->second;
1179 return request->pkt;
1180}
1181
1182void
1183GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1184 MachineType mach,
1185 Cycles initialRequestTime,
1186 Cycles forwardRequestTime,
1187 Cycles firstResponseTime,
1188 bool success, bool isRegion)
1189{
1190 RubyRequestType type = srequest->m_type;
1191 Cycles issued_time = srequest->issue_time;
1192 Cycles completion_time = curCycle();
1193 assert(completion_time >= issued_time);
1194 Cycles total_lat = completion_time - issued_time;
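    // Editorial example: a request issued at cycle 100 that completes at
    // cycle 140 has total_lat = 40; it is always sampled into m_latencyHist
    // and, being non-zero, also into the miss-latency histograms further
    // below.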
1195
1196 // cache stats (valid for RfO protocol only)
1197 if (mach == MachineType_TCP) {
1198 if (type == RubyRequestType_LD) {
1199 GPU_TCPLdHits++;
1200 } else {
1201 GPU_TCPStHits++;
1202 }
1203 } else if (mach == MachineType_L1Cache_wCC) {
1204 if (type == RubyRequestType_LD) {
1205 GPU_TCPLdTransfers++;
1206 } else {
1207 GPU_TCPStTransfers++;
1208 }
1209 } else if (mach == MachineType_TCC) {
1210 if (type == RubyRequestType_LD) {
1211 GPU_TCCLdHits++;
1212 } else {
1213 GPU_TCCStHits++;
1214 }
1215 } else {
1216 if (type == RubyRequestType_LD) {
1217 GPU_LdMiss++;
1218 } else {
1219 GPU_StMiss++;
1220 }
1221 }
1222
1223 // Profile all access latency, even zero latency accesses
1224 m_latencyHist.sample(total_lat);
1225 m_typeLatencyHist[type]->sample(total_lat);
1226
1227 // Profile the miss latency for all non-zero demand misses
1228 if (total_lat != Cycles(0)) {
1229 m_missLatencyHist.sample(total_lat);
1230 m_missTypeLatencyHist[type]->sample(total_lat);
1231
1232 if (mach != MachineType_NUM) {
1233 m_missMachLatencyHist[mach]->sample(total_lat);
1234 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1235
1236 if ((issued_time <= initialRequestTime) &&
1237 (initialRequestTime <= forwardRequestTime) &&
1238 (forwardRequestTime <= firstResponseTime) &&
1239 (firstResponseTime <= completion_time)) {
1240
1241 m_IssueToInitialDelayHist[mach]->sample(
1242 initialRequestTime - issued_time);
1243 m_InitialToForwardDelayHist[mach]->sample(
1244 forwardRequestTime - initialRequestTime);
1245 m_ForwardToFirstResponseDelayHist[mach]->sample(
1246 firstResponseTime - forwardRequestTime);
1247 m_FirstResponseToCompletionDelayHist[mach]->sample(
1248 completion_time - firstResponseTime);
1249 }
1250 }
1251
1252 }
1253
1254 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1255 curTick(), m_version, "Coal",
1256 success ? "Done" : "SC_Failed", "", "",
1257 printAddress(srequest->pkt->getAddr()), total_lat);
1258}
1259
1260void
1261GPUCoalescer::regStats()
1262{
1263 RubyPort::regStats();
1264
1265 // These statistical variables are not for display.
1266 // The profiler will collate these across different
1267 // coalescers and display those collated statistics.
1268 m_outstandReqHist.init(10);
1269 m_latencyHist.init(10);
1270 m_missLatencyHist.init(10);
1271
1272 for (int i = 0; i < RubyRequestType_NUM; i++) {
1273 m_typeLatencyHist.push_back(new Stats::Histogram());
1274 m_typeLatencyHist[i]->init(10);
1275
1276 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1277 m_missTypeLatencyHist[i]->init(10);
1278 }
1279
1280 for (int i = 0; i < MachineType_NUM; i++) {
1281 m_missMachLatencyHist.push_back(new Stats::Histogram());
1282 m_missMachLatencyHist[i]->init(10);
1283
1284 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1285 m_IssueToInitialDelayHist[i]->init(10);
1286
1287 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1288 m_InitialToForwardDelayHist[i]->init(10);
1289
1290 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1291 m_ForwardToFirstResponseDelayHist[i]->init(10);
1292
1293 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1294 m_FirstResponseToCompletionDelayHist[i]->init(10);
1295 }
1296
1297 for (int i = 0; i < RubyRequestType_NUM; i++) {
1298 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1299
1300 for (int j = 0; j < MachineType_NUM; j++) {
1301 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1302 m_missTypeMachLatencyHist[i][j]->init(10);
1303 }
1304 }
1305
1306 // GPU cache stats
1307 GPU_TCPLdHits
1308 .name(name() + ".gpu_tcp_ld_hits")
1309 .desc("loads that hit in the TCP")
1310 ;
1311 GPU_TCPLdTransfers
1312 .name(name() + ".gpu_tcp_ld_transfers")
1313 .desc("TCP to TCP load transfers")
1314 ;
1315 GPU_TCCLdHits
1316 .name(name() + ".gpu_tcc_ld_hits")
1317 .desc("loads that hit in the TCC")
1318 ;
1319 GPU_LdMiss
1320 .name(name() + ".gpu_ld_misses")
1321 .desc("loads that miss in the GPU")
1322 ;
1323
1324 GPU_TCPStHits
1325 .name(name() + ".gpu_tcp_st_hits")
1326 .desc("stores that hit in the TCP")
1327 ;
1328 GPU_TCPStTransfers
1329 .name(name() + ".gpu_tcp_st_transfers")
1330 .desc("TCP to TCP store transfers")
1331 ;
1332 GPU_TCCStHits
1333 .name(name() + ".gpu_tcc_st_hits")
1334 .desc("stores that hit in the TCC")
1335 ;
1336 GPU_StMiss
1337 .name(name() + ".gpu_st_misses")
1338 .desc("stores that miss in the GPU")
1339 ;
1340
1341 // CP cache stats
1342 CP_TCPLdHits
1343 .name(name() + ".cp_tcp_ld_hits")
1344 .desc("loads that hit in the TCP")
1345 ;
1346 CP_TCPLdTransfers
1347 .name(name() + ".cp_tcp_ld_transfers")
1348 .desc("TCP to TCP load transfers")
1349 ;
1350 CP_TCCLdHits
1351 .name(name() + ".cp_tcc_ld_hits")
1352 .desc("loads that hit in the TCC")
1353 ;
1354 CP_LdMiss
1355 .name(name() + ".cp_ld_misses")
1356 .desc("loads that miss in the GPU")
1357 ;
1358
1359 CP_TCPStHits
1360 .name(name() + ".cp_tcp_st_hits")
1361 .desc("stores that hit in the TCP")
1362 ;
1363 CP_TCPStTransfers
1364 .name(name() + ".cp_tcp_st_transfers")
1365 .desc("TCP to TCP store transfers")
1366 ;
1367 CP_TCCStHits
1368 .name(name() + ".cp_tcc_st_hits")
1369 .desc("stores that hit in the TCC")
1370 ;
1371 CP_StMiss
1372 .name(name() + ".cp_st_misses")
1373 .desc("stores that miss in the GPU")
1374 ;
1375}