GPUCoalescer.cc (12133:ca42be3276af) GPUCoalescer.cc (12334:e0ab29a34764)
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Sooraj Puthoor
34 */
35
36#include "base/logging.hh"
37#include "base/str.hh"
38#include "config/the_isa.hh"
39
40#if THE_ISA == X86_ISA
41#include "arch/x86/insts/microldstop.hh"
42
43#endif // X86_ISA
44#include "mem/ruby/system/GPUCoalescer.hh"
45
46#include "cpu/testers/rubytest/RubyTester.hh"
47#include "debug/GPUCoalescer.hh"
48#include "debug/MemoryAccess.hh"
49#include "debug/ProtocolTrace.hh"
50#include "debug/RubyPort.hh"
51#include "debug/RubyStats.hh"
52#include "gpu-compute/shader.hh"
53#include "mem/packet.hh"
54#include "mem/ruby/common/DataBlock.hh"
55#include "mem/ruby/common/SubBlock.hh"
56#include "mem/ruby/network/MessageBuffer.hh"
57#include "mem/ruby/profiler/Profiler.hh"
58#include "mem/ruby/slicc_interface/AbstractController.hh"
59#include "mem/ruby/slicc_interface/RubyRequest.hh"
60#include "mem/ruby/structures/CacheMemory.hh"
61#include "mem/ruby/system/RubySystem.hh"
62#include "params/RubyGPUCoalescer.hh"
63
64using namespace std;
65
66GPUCoalescer *
67RubyGPUCoalescerParams::create()
68{
69 return new GPUCoalescer(this);
70}
71
72HSAScope
73reqScopeToHSAScope(Request* req)
74{
75 HSAScope accessScope = HSAScope_UNSPECIFIED;
76 if (req->isScoped()) {
77 if (req->isWavefrontScope()) {
78 accessScope = HSAScope_WAVEFRONT;
79 } else if (req->isWorkgroupScope()) {
80 accessScope = HSAScope_WORKGROUP;
81 } else if (req->isDeviceScope()) {
82 accessScope = HSAScope_DEVICE;
83 } else if (req->isSystemScope()) {
84 accessScope = HSAScope_SYSTEM;
85 } else {
86 fatal("Bad scope type");
87 }
88 }
89 return accessScope;
90}
91
92HSASegment
93reqSegmentToHSASegment(Request* req)
94{
95 HSASegment accessSegment = HSASegment_GLOBAL;
96
97 if (req->isGlobalSegment()) {
98 accessSegment = HSASegment_GLOBAL;
99 } else if (req->isGroupSegment()) {
100 accessSegment = HSASegment_GROUP;
101 } else if (req->isPrivateSegment()) {
102 accessSegment = HSASegment_PRIVATE;
103 } else if (req->isKernargSegment()) {
104 accessSegment = HSASegment_KERNARG;
105 } else if (req->isReadonlySegment()) {
106 accessSegment = HSASegment_READONLY;
107 } else if (req->isSpillSegment()) {
108 accessSegment = HSASegment_SPILL;
109 } else if (req->isArgSegment()) {
110 accessSegment = HSASegment_ARG;
111 } else {
112 fatal("Bad segment type");
113 }
114
115 return accessSegment;
116}
117
118GPUCoalescer::GPUCoalescer(const Params *p)
119 : RubyPort(p),
120 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
121 false, Event::Progress_Event_Pri),
122 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
123{
124 m_store_waiting_on_load_cycles = 0;
125 m_store_waiting_on_store_cycles = 0;
126 m_load_waiting_on_store_cycles = 0;
127 m_load_waiting_on_load_cycles = 0;
128
129 m_outstanding_count = 0;
130
131 m_max_outstanding_requests = 0;
132 m_deadlock_threshold = 0;
133 m_instCache_ptr = nullptr;
134 m_dataCache_ptr = nullptr;
135
136 m_instCache_ptr = p->icache;
137 m_dataCache_ptr = p->dcache;
138 m_max_outstanding_requests = p->max_outstanding_requests;
139 m_deadlock_threshold = p->deadlock_threshold;
140
141 assert(m_max_outstanding_requests > 0);
142 assert(m_deadlock_threshold > 0);
143 assert(m_instCache_ptr);
144 assert(m_dataCache_ptr);
145
146 m_data_cache_hit_latency = p->dcache_hit_latency;
147
148 m_runningGarnetStandalone = p->garnet_standalone;
149 assumingRfOCoherence = p->assume_rfo;
150}
151
152GPUCoalescer::~GPUCoalescer()
153{
154}
155
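// Deadlock check, scheduled via deadlockCheckEvent: walk both request
// tables and panic if any outstanding request has been pending for at
// least m_deadlock_threshold cycles; otherwise keep rescheduling the
// check while requests remain outstanding.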
156void
157GPUCoalescer::wakeup()
158{
159 // Check for deadlock of any of the requests
160 Cycles current_time = curCycle();
161
162 // Check across all outstanding requests
163 int total_outstanding = 0;
164
165 RequestTable::iterator read = m_readRequestTable.begin();
166 RequestTable::iterator read_end = m_readRequestTable.end();
167 for (; read != read_end; ++read) {
168 GPUCoalescerRequest* request = read->second;
169 if (current_time - request->issue_time < m_deadlock_threshold)
170 continue;
171
172 panic("Possible Deadlock detected. Aborting!\n"
173 "version: %d request.paddr: 0x%x m_readRequestTable: %d "
174 "current time: %u issue_time: %d difference: %d\n", m_version,
175 request->pkt->getAddr(), m_readRequestTable.size(),
176 current_time * clockPeriod(), request->issue_time * clockPeriod(),
177 (current_time - request->issue_time)*clockPeriod());
178 }
179
180 RequestTable::iterator write = m_writeRequestTable.begin();
181 RequestTable::iterator write_end = m_writeRequestTable.end();
182 for (; write != write_end; ++write) {
183 GPUCoalescerRequest* request = write->second;
184 if (current_time - request->issue_time < m_deadlock_threshold)
185 continue;
186
187 panic("Possible Deadlock detected. Aborting!\n"
188 "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
189 "current time: %u issue_time: %d difference: %d\n", m_version,
190 request->pkt->getAddr(), m_writeRequestTable.size(),
191 current_time * clockPeriod(), request->issue_time * clockPeriod(),
192 (current_time - request->issue_time) * clockPeriod());
193 }
194
195 total_outstanding += m_writeRequestTable.size();
196 total_outstanding += m_readRequestTable.size();
197
198 assert(m_outstanding_count == total_outstanding);
199
200 if (m_outstanding_count > 0) {
201 // If there are still outstanding requests, keep checking
202 schedule(deadlockCheckEvent,
203 m_deadlock_threshold * clockPeriod() +
204 curTick());
205 }
206}
207
208void
209GPUCoalescer::resetStats()
210{
211 m_latencyHist.reset();
212 m_missLatencyHist.reset();
213 for (int i = 0; i < RubyRequestType_NUM; i++) {
214 m_typeLatencyHist[i]->reset();
215 m_missTypeLatencyHist[i]->reset();
216 for (int j = 0; j < MachineType_NUM; j++) {
217 m_missTypeMachLatencyHist[i][j]->reset();
218 }
219 }
220
221 for (int i = 0; i < MachineType_NUM; i++) {
222 m_missMachLatencyHist[i]->reset();
223
224 m_IssueToInitialDelayHist[i]->reset();
225 m_InitialToForwardDelayHist[i]->reset();
226 m_ForwardToFirstResponseDelayHist[i]->reset();
227 m_FirstResponseToCompletionDelayHist[i]->reset();
228 }
229}
230
231void
232GPUCoalescer::printProgress(ostream& out) const
233{
234}
235
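// Classify the request against requests outstanding from earlier cycles:
// report BufferFull if the mandatory queue has no slot, Aliased if the
// controller has blocked this line (Locked_RMW_Write is exempt) or if
// another read or write to the same line is still in flight, and Ready
// otherwise.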
236RequestStatus
237GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
238{
239 Addr line_addr = makeLineAddress(pkt->getAddr());
240
241 if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
242 return RequestStatus_BufferFull;
243 }
244
245 if (m_controller->isBlocked(line_addr) &&
246 request_type != RubyRequestType_Locked_RMW_Write) {
247 return RequestStatus_Aliased;
248 }
249
250 if ((request_type == RubyRequestType_ST) ||
251 (request_type == RubyRequestType_ATOMIC) ||
252 (request_type == RubyRequestType_ATOMIC_RETURN) ||
253 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
254 (request_type == RubyRequestType_RMW_Read) ||
255 (request_type == RubyRequestType_RMW_Write) ||
256 (request_type == RubyRequestType_Load_Linked) ||
257 (request_type == RubyRequestType_Store_Conditional) ||
258 (request_type == RubyRequestType_Locked_RMW_Read) ||
259 (request_type == RubyRequestType_Locked_RMW_Write) ||
260 (request_type == RubyRequestType_FLUSH)) {
261
262 // Check if there is any outstanding read request for the same
263 // cache line.
264 if (m_readRequestTable.count(line_addr) > 0) {
265 m_store_waiting_on_load_cycles++;
266 return RequestStatus_Aliased;
267 }
268
269 if (m_writeRequestTable.count(line_addr) > 0) {
270 // There is an outstanding write request for the cache line
271 m_store_waiting_on_store_cycles++;
272 return RequestStatus_Aliased;
273 }
274 } else {
275 // Check if there is any outstanding write request for the same
276 // cache line.
277 if (m_writeRequestTable.count(line_addr) > 0) {
278 m_load_waiting_on_store_cycles++;
279 return RequestStatus_Aliased;
280 }
281
282 if (m_readRequestTable.count(line_addr) > 0) {
283 // There is an outstanding read request for the cache line
284 m_load_waiting_on_load_cycles++;
285 return RequestStatus_Aliased;
286 }
287 }
288
289 return RequestStatus_Ready;
290
291}
292
293
294
295// sets the kernelEndList
296void
297GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
298{
299 // It is not clear whether this can ever happen, but be careful
300 // here so that it does not turn into a simulator hang in the
301 // future.
302 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
303 assert(kernelEndList.count(wavefront_id) == 0);
304
305 kernelEndList[wavefront_id] = pkt;
306 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
307 kernelEndList.size());
308}
309
310
311// Insert the request into the correct request table. Return true if
312// the entry was already present.
313bool
314GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
315{
316 assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
317 pkt->req->isLockedRMW() ||
318 !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
319
320 int total_outstanding M5_VAR_USED =
321 m_writeRequestTable.size() + m_readRequestTable.size();
322
323 assert(m_outstanding_count == total_outstanding);
324
325 // See if we should schedule a deadlock check
326 if (!deadlockCheckEvent.scheduled()) {
327 schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
328 }
329
330 Addr line_addr = makeLineAddress(pkt->getAddr());
331 if ((request_type == RubyRequestType_ST) ||
332 (request_type == RubyRequestType_ATOMIC) ||
333 (request_type == RubyRequestType_ATOMIC_RETURN) ||
334 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
335 (request_type == RubyRequestType_RMW_Read) ||
336 (request_type == RubyRequestType_RMW_Write) ||
337 (request_type == RubyRequestType_Load_Linked) ||
338 (request_type == RubyRequestType_Store_Conditional) ||
339 (request_type == RubyRequestType_Locked_RMW_Read) ||
340 (request_type == RubyRequestType_Locked_RMW_Write) ||
341 (request_type == RubyRequestType_FLUSH)) {
342
343 pair<RequestTable::iterator, bool> r =
344 m_writeRequestTable.insert(RequestTable::value_type(line_addr,
345 (GPUCoalescerRequest*) NULL));
346 if (r.second) {
347 RequestTable::iterator i = r.first;
348 i->second = new GPUCoalescerRequest(pkt, request_type,
349 curCycle());
350 DPRINTF(GPUCoalescer,
351 "Inserting write request for paddr %#x for type %d\n",
352 pkt->req->getPaddr(), i->second->m_type);
353 m_outstanding_count++;
354 } else {
355 return true;
356 }
357 } else {
358 pair<RequestTable::iterator, bool> r =
359 m_readRequestTable.insert(RequestTable::value_type(line_addr,
360 (GPUCoalescerRequest*) NULL));
361
362 if (r.second) {
363 RequestTable::iterator i = r.first;
364 i->second = new GPUCoalescerRequest(pkt, request_type,
365 curCycle());
366 DPRINTF(GPUCoalescer,
367 "Inserting read request for paddr %#x for type %d\n",
368 pkt->req->getPaddr(), i->second->m_type);
369 m_outstanding_count++;
370 } else {
371 return true;
372 }
373 }
374
375 m_outstandReqHist.sample(m_outstanding_count);
376
377 total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
378 assert(m_outstanding_count == total_outstanding);
379
380 return false;
381}
382
383void
384GPUCoalescer::markRemoved()
385{
386 m_outstanding_count--;
387 assert(m_outstanding_count ==
388 m_writeRequestTable.size() + m_readRequestTable.size());
389}
390
391void
392GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
393{
394 assert(m_outstanding_count ==
395 m_writeRequestTable.size() + m_readRequestTable.size());
396
397 Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
398 if ((srequest->m_type == RubyRequestType_ST) ||
399 (srequest->m_type == RubyRequestType_RMW_Read) ||
400 (srequest->m_type == RubyRequestType_RMW_Write) ||
401 (srequest->m_type == RubyRequestType_Load_Linked) ||
402 (srequest->m_type == RubyRequestType_Store_Conditional) ||
403 (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
404 (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
405 m_writeRequestTable.erase(line_addr);
406 } else {
407 m_readRequestTable.erase(line_addr);
408 }
409
410 markRemoved();
411}
412
413bool
414GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
415{
416 //
417 // The success flag indicates whether the LLSC operation was successful.
418 // LL ops will always succeed, but SC may fail if the cache line is no
419 // longer locked.
420 //
421 bool success = true;
422 if (request->m_type == RubyRequestType_Store_Conditional) {
423 if (!m_dataCache_ptr->isLocked(address, m_version)) {
424 //
425 // For failed SC requests, indicate the failure to the cpu by
426 // setting the extra data to zero.
427 //
428 request->pkt->req->setExtraData(0);
429 success = false;
430 } else {
431 //
432 // For successful SC requests, indicate the success to the cpu by
433 // setting the extra data to one.
434 //
435 request->pkt->req->setExtraData(1);
436 }
437 //
438 // Independent of success, all SC operations must clear the lock
439 //
440 m_dataCache_ptr->clearLocked(address);
441 } else if (request->m_type == RubyRequestType_Load_Linked) {
442 //
443 // Note: To fully follow Alpha LLSC semantics, should the LL clear any
444 // previously locked cache lines?
445 //
446 m_dataCache_ptr->setLocked(address, m_version);
447 } else if ((m_dataCache_ptr->isTagPresent(address)) &&
448 (m_dataCache_ptr->isLocked(address, m_version))) {
449 //
450 // Normal writes should clear the locked address
451 //
452 m_dataCache_ptr->clearLocked(address);
453 }
454 return success;
455}
456
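// The shorter writeCallback() overloads below simply forward to the full
// version, filling in MachineType_NULL, zero Cycles timestamps, and
// isRegion = false as defaults.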
457void
458GPUCoalescer::writeCallback(Addr address, DataBlock& data)
459{
460 writeCallback(address, MachineType_NULL, data);
461}
462
463void
464GPUCoalescer::writeCallback(Addr address,
465 MachineType mach,
466 DataBlock& data)
467{
468 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
469}
470
471void
472GPUCoalescer::writeCallback(Addr address,
473 MachineType mach,
474 DataBlock& data,
475 Cycles initialRequestTime,
476 Cycles forwardRequestTime,
477 Cycles firstResponseTime)
478{
479 writeCallback(address, mach, data,
480 initialRequestTime, forwardRequestTime, firstResponseTime,
481 false);
482}
483
484void
485GPUCoalescer::writeCallback(Addr address,
486 MachineType mach,
487 DataBlock& data,
488 Cycles initialRequestTime,
489 Cycles forwardRequestTime,
490 Cycles firstResponseTime,
491 bool isRegion)
492{
493 assert(address == makeLineAddress(address));
494
495 DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
496 assert(m_writeRequestTable.count(makeLineAddress(address)));
497
498 RequestTable::iterator i = m_writeRequestTable.find(address);
499 assert(i != m_writeRequestTable.end());
500 GPUCoalescerRequest* request = i->second;
501
502 m_writeRequestTable.erase(i);
503 markRemoved();
504
505 assert((request->m_type == RubyRequestType_ST) ||
506 (request->m_type == RubyRequestType_ATOMIC) ||
507 (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
508 (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
509 (request->m_type == RubyRequestType_RMW_Read) ||
510 (request->m_type == RubyRequestType_RMW_Write) ||
511 (request->m_type == RubyRequestType_Load_Linked) ||
512 (request->m_type == RubyRequestType_Store_Conditional) ||
513 (request->m_type == RubyRequestType_Locked_RMW_Read) ||
514 (request->m_type == RubyRequestType_Locked_RMW_Write) ||
515 (request->m_type == RubyRequestType_FLUSH));
516
517
518 //
519 // For Alpha, properly handle LL, SC, and write requests with respect to
520 // locked cache blocks.
521 //
522 // Not valid for the Garnet_standalone protocol
523 //
524 bool success = true;
525 if (!m_runningGarnetStandalone)
526 success = handleLlsc(address, request);
527
528 if (request->m_type == RubyRequestType_Locked_RMW_Read) {
529 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
530 } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
531 m_controller->unblock(address);
532 }
533
534 hitCallback(request, mach, data, success,
535 request->issue_time, forwardRequestTime, firstResponseTime,
536 isRegion);
537}
538
539void
540GPUCoalescer::readCallback(Addr address, DataBlock& data)
541{
542 readCallback(address, MachineType_NULL, data);
543}
544
545void
546GPUCoalescer::readCallback(Addr address,
547 MachineType mach,
548 DataBlock& data)
549{
550 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
551}
552
553void
554GPUCoalescer::readCallback(Addr address,
555 MachineType mach,
556 DataBlock& data,
557 Cycles initialRequestTime,
558 Cycles forwardRequestTime,
559 Cycles firstResponseTime)
560{
561
562 readCallback(address, mach, data,
563 initialRequestTime, forwardRequestTime, firstResponseTime,
564 false);
565}
566
567void
568GPUCoalescer::readCallback(Addr address,
569 MachineType mach,
570 DataBlock& data,
571 Cycles initialRequestTime,
572 Cycles forwardRequestTime,
573 Cycles firstResponseTime,
574 bool isRegion)
575{
576 assert(address == makeLineAddress(address));
577 assert(m_readRequestTable.count(makeLineAddress(address)));
578
579 DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
580 RequestTable::iterator i = m_readRequestTable.find(address);
581 assert(i != m_readRequestTable.end());
582 GPUCoalescerRequest* request = i->second;
583
584 m_readRequestTable.erase(i);
585 markRemoved();
586
587 assert((request->m_type == RubyRequestType_LD) ||
588 (request->m_type == RubyRequestType_IFETCH));
589
590 hitCallback(request, mach, data, true,
591 request->issue_time, forwardRequestTime, firstResponseTime,
592 isRegion);
593}
594
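// Common completion path for reads and writes: touch the MRU state of the
// relevant cache, record latency statistics, and then service every packet
// coalesced on this line, copying response data into load/atomic packets
// and store data into the response DataBlock, before handing all packets
// back through completeHitCallback().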
595void
596GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
597 MachineType mach,
598 DataBlock& data,
599 bool success,
600 Cycles initialRequestTime,
601 Cycles forwardRequestTime,
602 Cycles firstResponseTime,
603 bool isRegion)
604{
605 PacketPtr pkt = srequest->pkt;
606 Addr request_address = pkt->getAddr();
607 Addr request_line_address = makeLineAddress(request_address);
608
609 RubyRequestType type = srequest->m_type;
610
611 // Set this cache entry to the most recently used
612 if (type == RubyRequestType_IFETCH) {
613 if (m_instCache_ptr->isTagPresent(request_line_address))
614 m_instCache_ptr->setMRU(request_line_address);
615 } else {
616 if (m_dataCache_ptr->isTagPresent(request_line_address))
617 m_dataCache_ptr->setMRU(request_line_address);
618 }
619
620 recordMissLatency(srequest, mach,
621 initialRequestTime,
622 forwardRequestTime,
623 firstResponseTime,
624 success, isRegion);
625 // update the data
626 //
627 // NOTE: this must be done for each request in the coalescer
628 int len = reqCoalescer[request_line_address].size();
629 std::vector<PacketPtr> mylist;
630 for (int i = 0; i < len; ++i) {
631 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
632 assert(type == reqCoalescer[request_line_address][i].primaryType);
633 request_address = pkt->getAddr();
634 request_line_address = makeLineAddress(pkt->getAddr());
635 if (pkt->getPtr<uint8_t>()) {
636 if ((type == RubyRequestType_LD) ||
637 (type == RubyRequestType_ATOMIC) ||
638 (type == RubyRequestType_ATOMIC_RETURN) ||
639 (type == RubyRequestType_IFETCH) ||
640 (type == RubyRequestType_RMW_Read) ||
641 (type == RubyRequestType_Locked_RMW_Read) ||
642 (type == RubyRequestType_Load_Linked)) {
643 memcpy(pkt->getPtr<uint8_t>(),
644 data.getData(getOffset(request_address),
645 pkt->getSize()),
646 pkt->getSize());
647 } else {
648 data.setData(pkt->getPtr<uint8_t>(),
649 getOffset(request_address), pkt->getSize());
650 }
651 } else {
652 DPRINTF(MemoryAccess,
653 "WARNING. Data not transfered from Ruby to M5 for type " \
654 "%s\n",
655 RubyRequestType_to_string(type));
656 }
657
658 // If using the RubyTester, update the RubyTester sender state's
659 // subBlock with the received data. The tester will later access
660 // this state.
661 // Note: RubyPort will access its sender state before the
662 // RubyTester.
663 if (m_usingRubyTester) {
664 RubyPort::SenderState *requestSenderState =
665 safe_cast<RubyPort::SenderState*>(pkt->senderState);
666 RubyTester::SenderState* testerSenderState =
667 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
668 testerSenderState->subBlock.mergeFrom(data);
669 }
670
671 mylist.push_back(pkt);
672 }
673 delete srequest;
674 reqCoalescer.erase(request_line_address);
675 assert(!reqCoalescer.count(request_line_address));
676
677
678
679 completeHitCallback(mylist, len);
680}
681
682bool
683GPUCoalescer::empty() const
684{
685 return m_writeRequestTable.empty() && m_readRequestTable.empty();
686}
687
688// Analyzes the packet to see if this request can be coalesced.
689// If the request can be coalesced, it is added to the reqCoalescer table
690// and makeRequest returns RequestStatus_Issued.
691// If this is the first request to a cacheline, the request is added to both
692// the newRequests queue and the reqCoalescer table; makeRequest
693// returns RequestStatus_Issued.
694// If there is a pending request to this cacheline and this request
695// can't be coalesced, RequestStatus_Aliased is returned and
696// the packet needs to be reissued.
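// Sketch of the coalescing rules above (hypothetical addresses, assuming a
// 64-byte Ruby line): two loads to 0x1000 and 0x1008 arriving in the same
// cycle map to line 0x1000, so the second load is simply appended to
// reqCoalescer[0x1000] and a single request is issued for the line; a store
// to 0x1010 in that same cycle has a different primary type than the
// pending loads and would instead get RequestStatus_Aliased.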
697RequestStatus
698GPUCoalescer::makeRequest(PacketPtr pkt)
699{
700 // Check for GPU Barrier Kernel End or Kernel Begin
701 // Leave these to be handled by the child class
702 // Kernel End/Barrier = isFlush + isRelease
703 // Kernel Begin = isFlush + isAcquire
704 if (pkt->req->isKernel()) {
705 if (pkt->req->isAcquire()) {
706 // This is a Kernel Begin; leave handling to the
707 // virtual xCoalescer::makeRequest.
708 return RequestStatus_Issued;
709 } else if (pkt->req->isRelease()) {
710 // This is a Kernel End; leave handling to the
711 // virtual xCoalescer::makeRequest.
712 // If we are here then we didn't call a virtual
713 // version of this function, so we will also
714 // schedule the callback.
715 int wf_id = 0;
716 if (pkt->req->hasContextId()) {
717 wf_id = pkt->req->contextId();
718 }
719 insertKernel(wf_id, pkt);
720 newKernelEnds.push_back(wf_id);
721 if (!issueEvent.scheduled()) {
722 schedule(issueEvent, curTick());
723 }
724 return RequestStatus_Issued;
725 }
726 }
727
728 // If number of outstanding requests greater than the max allowed,
729 // return RequestStatus_BufferFull. This logic can be extended to
730 // support proper backpressure.
731 if (m_outstanding_count >= m_max_outstanding_requests) {
732 return RequestStatus_BufferFull;
733 }
734
735 RubyRequestType primary_type = RubyRequestType_NULL;
736 RubyRequestType secondary_type = RubyRequestType_NULL;
737
738 if (pkt->isLLSC()) {
739 //
740 // Alpha LL/SC instructions need to be handled carefully by the cache
741 // coherence protocol to ensure they follow the proper semantics. In
742 // particular, by identifying the operations as atomic, the protocol
743 // should understand that migratory sharing optimizations should not
744 // be performed (i.e. a load between the LL and SC should not steal
745 // away exclusive permission).
746 //
747 if (pkt->isWrite()) {
748 primary_type = RubyRequestType_Store_Conditional;
749 } else {
750 assert(pkt->isRead());
751 primary_type = RubyRequestType_Load_Linked;
752 }
753 secondary_type = RubyRequestType_ATOMIC;
754 } else if (pkt->req->isLockedRMW()) {
755 //
756 // x86 locked instructions are translated to store cache coherence
757 // requests because these requests should always be treated as read
758 // exclusive operations and should leverage any migratory sharing
759 // optimization built into the protocol.
760 //
761 if (pkt->isWrite()) {
762 primary_type = RubyRequestType_Locked_RMW_Write;
763 } else {
764 assert(pkt->isRead());
765 primary_type = RubyRequestType_Locked_RMW_Read;
766 }
767 secondary_type = RubyRequestType_ST;
768 } else if (pkt->isAtomicOp()) {
769 //
770 // GPU Atomic Operation
771 //
772 primary_type = RubyRequestType_ATOMIC;
773 secondary_type = RubyRequestType_ATOMIC;
774 } else {
775 if (pkt->isRead()) {
776 if (pkt->req->isInstFetch()) {
777 primary_type = secondary_type = RubyRequestType_IFETCH;
778 } else {
779#if THE_ISA == X86_ISA
780 uint32_t flags = pkt->req->getFlags();
781 bool storeCheck = flags &
782 (TheISA::StoreCheck << TheISA::FlagShift);
783#else
784 bool storeCheck = false;
785#endif // X86_ISA
786 if (storeCheck) {
787 primary_type = RubyRequestType_RMW_Read;
788 secondary_type = RubyRequestType_ST;
789 } else {
790 primary_type = secondary_type = RubyRequestType_LD;
791 }
792 }
793 } else if (pkt->isWrite()) {
794 //
795 // Note: M5 packets do not differentiate ST from RMW_Write
796 //
797 primary_type = secondary_type = RubyRequestType_ST;
798 } else if (pkt->isFlush()) {
799 primary_type = secondary_type = RubyRequestType_FLUSH;
800 } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
801 if (assumingRfOCoherence) {
802 // If we reached here, this request must be a memFence.
803 // Because the protocol implements RfO, the coalescer can
804 // assume sequential consistency and schedule the callback
805 // immediately.
806 // Currently the code implements fence callbacks
807 // by reusing the mechanism for kernel completions.
808 // This should be fixed.
809 int wf_id = 0;
810 if (pkt->req->hasContextId()) {
811 wf_id = pkt->req->contextId();
812 }
813 insertKernel(wf_id, pkt);
814 newKernelEnds.push_back(wf_id);
815 if (!issueEvent.scheduled()) {
816 schedule(issueEvent, curTick());
817 }
818 return RequestStatus_Issued;
819 } else {
820 // If not RfO, return issued here and let the child coalescer
821 // take care of it.
822 return RequestStatus_Issued;
823 }
824 } else {
825 panic("Unsupported ruby packet type\n");
826 }
827 }
828
829 // Check if there is any pending request to this cache line from
830 // previous cycles.
831 // If there is a pending request, return aliased. Since coalescing
832 // across time is not permitted, aliased requests are not coalesced.
833 // If a request for this address has already been issued, we must block
834 RequestStatus status = getRequestStatus(pkt, primary_type);
835 if (status != RequestStatus_Ready)
836 return status;
837
838 Addr line_addr = makeLineAddress(pkt->getAddr());
839
840 // Check if this request can be coalesced with previous
841 // requests from this cycle.
842 if (!reqCoalescer.count(line_addr)) {
843 // This is the first access to this cache line.
844 // A new request to the memory subsystem has to be
845 // made in the next cycle for this cache line, so
846 // add this line addr to the "newRequests" queue
847 newRequests.push_back(line_addr);
848
849 // There was a request to this cache line in this cycle,
850 // let us see if we can coalesce this request with the previous
851 // requests from this cycle
852 } else if (primary_type !=
853 reqCoalescer[line_addr][0].primaryType) {
854 // can't coalesce loads, stores and atomics!
855 return RequestStatus_Aliased;
856 } else if (pkt->req->isLockedRMW() ||
857 reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
858 // can't coalesce locked accesses, but can coalesce atomics!
859 return RequestStatus_Aliased;
860 } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
861 pkt->req->contextId() !=
862 reqCoalescer[line_addr][0].pkt->req->contextId()) {
863 // can't coalesce releases from different wavefronts
864 return RequestStatus_Aliased;
865 }
866
867 // in addition to the packet, we need to save both request types
868 reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
869 if (!issueEvent.scheduled())
870 schedule(issueEvent, curTick());
871 // TODO: issue hardware prefetches here
872 return RequestStatus_Issued;
873}
874
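// Build and enqueue a single RubyRequest for a coalesced cache line: walk
// every packet recorded in reqCoalescer for the line, mark the bytes it
// touches in the access mask, gather store data into a DataBlock and atomic
// operations into atomicOps, and then enqueue the message on the mandatory
// queue with the data cache hit latency.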
875void
876GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
877{
878
879 int proc_id = -1;
880 if (pkt != NULL && pkt->req->hasContextId()) {
881 proc_id = pkt->req->contextId();
882 }
883
884 // If valid, copy the pc to the ruby request
885 Addr pc = 0;
886 if (pkt->req->hasPC()) {
887 pc = pkt->req->getPC();
888 }
889
890 // At the moment, setting scopes only matters for GPU spill
891 // space accesses, which are identified by pkt->req->isStack().
892 // That scope is REPLACE since spill data does not need to be
893 // flushed at the end of a kernel, whereas private and local
894 // data may need to remain visible at the end of the kernel;
895 // the scope and segment are derived from the request below.
896 //
897 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
898 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
899
900 Addr line_addr = makeLineAddress(pkt->getAddr());
901
902 // Creating WriteMask that records written bytes
903 // and atomic operations. This enables partial writes
904 // and partial reads of those writes
905 DataBlock dataBlock;
906 dataBlock.clear();
907 uint32_t blockSize = RubySystem::getBlockSizeBytes();
908 std::vector<bool> accessMask(blockSize,false);
909 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
910 uint32_t tableSize = reqCoalescer[line_addr].size();
911 for (int i = 0; i < tableSize; i++) {
912 PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
913 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
914 uint32_t tmpSize = tmpPkt->getSize();
915 if (tmpPkt->isAtomicOp()) {
916 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
917 tmpPkt->getAtomicOp());
918 atomicOps.push_back(tmpAtomicOp);
919 } else if (tmpPkt->isWrite()) {
920 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
921 tmpOffset, tmpSize);
922 }
923 for (int j = 0; j < tmpSize; j++) {
924 accessMask[tmpOffset + j] = true;
925 }
926 }
927 std::shared_ptr<RubyRequest> msg;
928 if (pkt->isAtomicOp()) {
929 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
930 pkt->getPtr<uint8_t>(),
931 pkt->getSize(), pc, secondary_type,
932 RubyAccessMode_Supervisor, pkt,
933 PrefetchBit_No, proc_id, 100,
934 blockSize, accessMask,
935 dataBlock, atomicOps,
936 accessScope, accessSegment);
937 } else {
938 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
939 pkt->getPtr<uint8_t>(),
940 pkt->getSize(), pc, secondary_type,
941 RubyAccessMode_Supervisor, pkt,
942 PrefetchBit_No, proc_id, 100,
943 blockSize, accessMask,
944 dataBlock,
945 accessScope, accessSegment);
946 }
947 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
948 curTick(), m_version, "Coal", "Begin", "", "",
949 printAddress(msg->getPhysicalAddress()),
950 RubyRequestType_to_string(secondary_type));
951
952 fatal_if(secondary_type == RubyRequestType_IFETCH,
953 "there should not be any I-Fetch requests in the GPU Coalescer");
954
955 // Send the message to the cache controller
956 fatal_if(m_data_cache_hit_latency == 0,
957 "should not have a latency of zero");
958
959 assert(m_mandatory_q_ptr);
960 m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
961}
962
963template <class KEY, class VALUE>
964std::ostream &
965operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
966{
967 out << "[";
968 for (auto i = map.begin(); i != map.end(); ++i)
969 out << " " << i->first << "=" << i->second;
970 out << " ]";
971
972 return out;
973}
974
975void
976GPUCoalescer::print(ostream& out) const
977{
978 out << "[GPUCoalescer: " << m_version
979 << ", outstanding requests: " << m_outstanding_count
980 << ", read request table: " << m_readRequestTable
981 << ", write request table: " << m_writeRequestTable
982 << "]";
983}
984
985// this can be called from setState whenever coherence permissions are
986// upgraded when invoked, coherence violations will be checked for the
987// given block
988void
989GPUCoalescer::checkCoherence(Addr addr)
990{
991#ifdef CHECK_COHERENCE
992 m_ruby_system->checkGlobalCoherenceInvariant(addr);
993#endif
994}
995
996void
997GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
998 DPRINTF(RubyStats, "Recorded statistic: %s\n",
999 SequencerRequestType_to_string(requestType));
1000}
1001
1002
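// issueEvent handler: for every cache line queued in newRequests this
// cycle, insert the first coalesced packet into the read/write request
// table and issue one RubyRequest for the whole line, then run the kernel
// completion callbacks collected in newKernelEnds.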
1003void
1004GPUCoalescer::completeIssue()
1005{
1006 // newRequests has the cacheline addresses of all the
1007 // requests which need to be issued to the memory subsystem
1008 // in this cycle
1009 int len = newRequests.size();
1010 DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1011 for (int i = 0; i < len; ++i) {
1012 // Get the requests from reqCoalescer table. Get only the
1013 // first request for each cacheline, the remaining requests
1014 // can be coalesced with the first request. So, only
1015 // one request is issued per cacheline.
1016 RequestDesc info = reqCoalescer[newRequests[i]][0];
1017 PacketPtr pkt = info.pkt;
1018 DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1019 i, pkt->req->getPaddr());
1020 // Insert this request to the read/writeRequestTables. These tables
1021 // are used to track aliased requests in makeRequest subroutine
1022 bool found = insertRequest(pkt, info.primaryType);
1023
1024 if (found) {
1025 panic("GPUCoalescer::makeRequest should never be called if the "
1026 "request is already outstanding\n");
1027 }
1028
1029 // Issue request to ruby subsystem
1030 issueRequest(pkt, info.secondaryType);
1031 }
1032 newRequests.clear();
1033
1034 // have Kernel End releases been issued this cycle
1035 len = newKernelEnds.size();
1036 for (int i = 0; i < len; i++) {
1037 kernelCallback(newKernelEnds[i]);
1038 }
1039 newKernelEnds.clear();
1040}
1041
1042void
1043GPUCoalescer::evictionCallback(Addr address)
1044{
1045 ruby_eviction_callback(address);
1046}
1047
1048void
1049GPUCoalescer::kernelCallback(int wavefront_id)
1050{
1051 assert(kernelEndList.count(wavefront_id));
1052
1053 ruby_hit_callback(kernelEndList[wavefront_id]);
1054
1055 kernelEndList.erase(wavefront_id);
1056}
1057
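// Completion path for GPU atomics: analogous to readCallback/writeCallback,
// but the entry lives in the write request table, there is no MRU update,
// and the returned (pre-op) data is copied into every coalesced packet
// unless the request was ATOMIC_NO_RETURN.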
1058void
1059GPUCoalescer::atomicCallback(Addr address,
1060 MachineType mach,
1061 const DataBlock& data)
1062{
1063 assert(address == makeLineAddress(address));
1064
1065 DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1066 assert(m_writeRequestTable.count(makeLineAddress(address)));
1067
1068 RequestTable::iterator i = m_writeRequestTable.find(address);
1069 assert(i != m_writeRequestTable.end());
1070 GPUCoalescerRequest* srequest = i->second;
1071
1072 m_writeRequestTable.erase(i);
1073 markRemoved();
1074
1075 assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1076 (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1077 (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1078
1079
1080 // Atomics don't write to cache, so there is no MRU update...
1081
1082 recordMissLatency(srequest, mach,
1083 srequest->issue_time, Cycles(0), Cycles(0), true, false);
1084
1085 PacketPtr pkt = srequest->pkt;
1086 Addr request_address = pkt->getAddr();
1087 Addr request_line_address = makeLineAddress(pkt->getAddr());
1088
1089 int len = reqCoalescer[request_line_address].size();
1090 std::vector<PacketPtr> mylist;
1091 for (int i = 0; i < len; ++i) {
1092 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1093 assert(srequest->m_type ==
1094 reqCoalescer[request_line_address][i].primaryType);
1095 request_address = (pkt->getAddr());
1096 request_line_address = makeLineAddress(request_address);
1097 if (pkt->getPtr<uint8_t>() &&
1098 srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1099 /* atomics are done in memory, and return the data *before* the atomic op... */
1100 memcpy(pkt->getPtr<uint8_t>(),
1101 data.getData(getOffset(request_address),
1102 pkt->getSize()),
1103 pkt->getSize());
1104 } else {
1105 DPRINTF(MemoryAccess,
1106 "WARNING. Data not transfered from Ruby to M5 for type " \
1107 "%s\n",
1108 RubyRequestType_to_string(srequest->m_type));
1109 }
1110
1111 // If using the RubyTester, update the RubyTester sender state's
1112 // subBlock with the received data. The tester will later access
1113 // this state.
1114 // Note: RubyPort will access its sender state before the
1115 // RubyTester.
1116 if (m_usingRubyTester) {
1117 RubyPort::SenderState *requestSenderState =
1118 safe_cast<RubyPort::SenderState*>(pkt->senderState);
1119 RubyTester::SenderState* testerSenderState =
1120 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1121 testerSenderState->subBlock.mergeFrom(data);
1122 }
1123
1124 mylist.push_back(pkt);
1125 }
1126 delete srequest;
1127 reqCoalescer.erase(request_line_address);
1128 assert(!reqCoalescer.count(request_line_address));
1129
1130 completeHitCallback(mylist, len);
1131}
1132
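// Command-processor statistics helpers: classify a CP access as a TCP hit
// when the sender is this machine, a TCP-to-TCP transfer when it is another
// TCP, a TCC hit when it is a TCC, and a miss otherwise.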
1133void
1134GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1135{
1136 if (myMachID == senderMachID) {
1137 CP_TCPLdHits++;
1138 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1139 CP_TCPLdTransfers++;
1140 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1141 CP_TCCLdHits++;
1142 } else {
1143 CP_LdMiss++;
1144 }
1145}
1146
1147void
1148GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1149{
1150 if (myMachID == senderMachID) {
1151 CP_TCPStHits++;
1152 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1153 CP_TCPStTransfers++;
1154 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1155 CP_TCCStHits++;
1156 } else {
1157 CP_StMiss++;
1158 }
1159}
1160
1161void
1162GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1163{
1164 for (int i = 0; i < len; ++i) {
1165 RubyPort::SenderState *ss =
1166 safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1167 MemSlavePort *port = ss->port;
1168 assert(port != NULL);
1169
1170 mylist[i]->senderState = ss->predecessor;
1171 delete ss;
1172 port->hitCallback(mylist[i]);
1173 trySendRetries();
1174 }
1175
1176 testDrainComplete();
1177}
1178
1179PacketPtr
1180GPUCoalescer::mapAddrToPkt(Addr address)
1181{
1182 RequestTable::iterator i = m_readRequestTable.find(address);
1183 assert(i != m_readRequestTable.end());
1184 GPUCoalescerRequest* request = i->second;
1185 return request->pkt;
1186}
1187
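// Bucket the completed request into the GPU hit/transfer/miss counters
// based on which machine type supplied the data (TCP, L1Cache_wCC, TCC, or
// anything else), sample the latency histograms, and, when the per-hop
// timestamps are monotonic, sample the per-stage delay histograms as well.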
1188void
1189GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1190 MachineType mach,
1191 Cycles initialRequestTime,
1192 Cycles forwardRequestTime,
1193 Cycles firstResponseTime,
1194 bool success, bool isRegion)
1195{
1196 RubyRequestType type = srequest->m_type;
1197 Cycles issued_time = srequest->issue_time;
1198 Cycles completion_time = curCycle();
1199 assert(completion_time >= issued_time);
1200 Cycles total_lat = completion_time - issued_time;
1201
1202 // cache stats (valid for RfO protocol only)
1203 if (mach == MachineType_TCP) {
1204 if (type == RubyRequestType_LD) {
1205 GPU_TCPLdHits++;
1206 } else {
1207 GPU_TCPStHits++;
1208 }
1209 } else if (mach == MachineType_L1Cache_wCC) {
1210 if (type == RubyRequestType_LD) {
1211 GPU_TCPLdTransfers++;
1212 } else {
1213 GPU_TCPStTransfers++;
1214 }
1215 } else if (mach == MachineType_TCC) {
1216 if (type == RubyRequestType_LD) {
1217 GPU_TCCLdHits++;
1218 } else {
1219 GPU_TCCStHits++;
1220 }
1221 } else {
1222 if (type == RubyRequestType_LD) {
1223 GPU_LdMiss++;
1224 } else {
1225 GPU_StMiss++;
1226 }
1227 }
1228
1229 // Profile all access latency, even zero latency accesses
1230 m_latencyHist.sample(total_lat);
1231 m_typeLatencyHist[type]->sample(total_lat);
1232
1233 // Profile the miss latency for all non-zero demand misses
1234 if (total_lat != Cycles(0)) {
1235 m_missLatencyHist.sample(total_lat);
1236 m_missTypeLatencyHist[type]->sample(total_lat);
1237
1238 if (mach != MachineType_NUM) {
1239 m_missMachLatencyHist[mach]->sample(total_lat);
1240 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1241
1242 if ((issued_time <= initialRequestTime) &&
1243 (initialRequestTime <= forwardRequestTime) &&
1244 (forwardRequestTime <= firstResponseTime) &&
1245 (firstResponseTime <= completion_time)) {
1246
1247 m_IssueToInitialDelayHist[mach]->sample(
1248 initialRequestTime - issued_time);
1249 m_InitialToForwardDelayHist[mach]->sample(
1250 forwardRequestTime - initialRequestTime);
1251 m_ForwardToFirstResponseDelayHist[mach]->sample(
1252 firstResponseTime - forwardRequestTime);
1253 m_FirstResponseToCompletionDelayHist[mach]->sample(
1254 completion_time - firstResponseTime);
1255 }
1256 }
1257
1258 }
1259
1260 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1261 curTick(), m_version, "Coal",
1262 success ? "Done" : "SC_Failed", "", "",
1263 printAddress(srequest->pkt->getAddr()), total_lat);
1264}
1265
1266void
1267GPUCoalescer::regStats()
1268{
1269 RubyPort::regStats();
1270
1271 // These statistical variables are not for display.
1272 // The profiler will collate these across different
1273 // coalescers and display those collated statistics.
1274 m_outstandReqHist.init(10);
1275 m_latencyHist.init(10);
1276 m_missLatencyHist.init(10);
1277
1278 for (int i = 0; i < RubyRequestType_NUM; i++) {
1279 m_typeLatencyHist.push_back(new Stats::Histogram());
1280 m_typeLatencyHist[i]->init(10);
1281
1282 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1283 m_missTypeLatencyHist[i]->init(10);
1284 }
1285
1286 for (int i = 0; i < MachineType_NUM; i++) {
1287 m_missMachLatencyHist.push_back(new Stats::Histogram());
1288 m_missMachLatencyHist[i]->init(10);
1289
1290 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1291 m_IssueToInitialDelayHist[i]->init(10);
1292
1293 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1294 m_InitialToForwardDelayHist[i]->init(10);
1295
1296 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1297 m_ForwardToFirstResponseDelayHist[i]->init(10);
1298
1299 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1300 m_FirstResponseToCompletionDelayHist[i]->init(10);
1301 }
1302
1303 for (int i = 0; i < RubyRequestType_NUM; i++) {
1304 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1305
1306 for (int j = 0; j < MachineType_NUM; j++) {
1307 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1308 m_missTypeMachLatencyHist[i][j]->init(10);
1309 }
1310 }
1311
1312 // GPU cache stats
1313 GPU_TCPLdHits
1314 .name(name() + ".gpu_tcp_ld_hits")
1315 .desc("loads that hit in the TCP")
1316 ;
1317 GPU_TCPLdTransfers
1318 .name(name() + ".gpu_tcp_ld_transfers")
1319 .desc("TCP to TCP load transfers")
1320 ;
1321 GPU_TCCLdHits
1322 .name(name() + ".gpu_tcc_ld_hits")
1323 .desc("loads that hit in the TCC")
1324 ;
1325 GPU_LdMiss
1326 .name(name() + ".gpu_ld_misses")
1327 .desc("loads that miss in the GPU")
1328 ;
1329
1330 GPU_TCPStHits
1331 .name(name() + ".gpu_tcp_st_hits")
1332 .desc("stores that hit in the TCP")
1333 ;
1334 GPU_TCPStTransfers
1335 .name(name() + ".gpu_tcp_st_transfers")
1336 .desc("TCP to TCP store transfers")
1337 ;
1338 GPU_TCCStHits
1339 .name(name() + ".gpu_tcc_st_hits")
1340 .desc("stores that hit in the TCC")
1341 ;
1342 GPU_StMiss
1343 .name(name() + ".gpu_st_misses")
1344 .desc("stores that miss in the GPU")
1345 ;
1346
1347 // CP cache stats
1348 CP_TCPLdHits
1349 .name(name() + ".cp_tcp_ld_hits")
1350 .desc("loads that hit in the TCP")
1351 ;
1352 CP_TCPLdTransfers
1353 .name(name() + ".cp_tcp_ld_transfers")
1354 .desc("TCP to TCP load transfers")
1355 ;
1356 CP_TCCLdHits
1357 .name(name() + ".cp_tcc_ld_hits")
1358 .desc("loads that hit in the TCC")
1359 ;
1360 CP_LdMiss
1361 .name(name() + ".cp_ld_misses")
1362 .desc("loads that miss in the GPU")
1363 ;
1364
1365 CP_TCPStHits
1366 .name(name() + ".cp_tcp_st_hits")
1367 .desc("stores that hit in the TCP")
1368 ;
1369 CP_TCPStTransfers
1370 .name(name() + ".cp_tcp_st_transfers")
1371 .desc("TCP to TCP store transfers")
1372 ;
1373 CP_TCCStHits
1374 .name(name() + ".cp_tcc_st_hits")
1375 .desc("stores that hit in the TCC")
1376 ;
1377 CP_StMiss
1378 .name(name() + ".cp_st_misses")
1379 .desc("stores that miss in the GPU")
1380 ;
1381}
461}
462
463void
464GPUCoalescer::writeCallback(Addr address,
465 MachineType mach,
466 DataBlock& data)
467{
468 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
469}
470
471void
472GPUCoalescer::writeCallback(Addr address,
473 MachineType mach,
474 DataBlock& data,
475 Cycles initialRequestTime,
476 Cycles forwardRequestTime,
477 Cycles firstResponseTime)
478{
479 writeCallback(address, mach, data,
480 initialRequestTime, forwardRequestTime, firstResponseTime,
481 false);
482}
483
484void
485GPUCoalescer::writeCallback(Addr address,
486 MachineType mach,
487 DataBlock& data,
488 Cycles initialRequestTime,
489 Cycles forwardRequestTime,
490 Cycles firstResponseTime,
491 bool isRegion)
492{
493 assert(address == makeLineAddress(address));
494
495 DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
496 assert(m_writeRequestTable.count(makeLineAddress(address)));
497
498 RequestTable::iterator i = m_writeRequestTable.find(address);
499 assert(i != m_writeRequestTable.end());
500 GPUCoalescerRequest* request = i->second;
501
502 m_writeRequestTable.erase(i);
503 markRemoved();
504
505 assert((request->m_type == RubyRequestType_ST) ||
506 (request->m_type == RubyRequestType_ATOMIC) ||
507 (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
508 (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
509 (request->m_type == RubyRequestType_RMW_Read) ||
510 (request->m_type == RubyRequestType_RMW_Write) ||
511 (request->m_type == RubyRequestType_Load_Linked) ||
512 (request->m_type == RubyRequestType_Store_Conditional) ||
513 (request->m_type == RubyRequestType_Locked_RMW_Read) ||
514 (request->m_type == RubyRequestType_Locked_RMW_Write) ||
515 (request->m_type == RubyRequestType_FLUSH));
516
517
518 //
519 // For Alpha, properly handle LL, SC, and write requests with respect to
520 // locked cache blocks.
521 //
522 // Not valid for the Garnet_standalone protocol
523 //
524 bool success = true;
525 if (!m_runningGarnetStandalone)
526 success = handleLlsc(address, request);
527
528 if (request->m_type == RubyRequestType_Locked_RMW_Read) {
529 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
530 } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
531 m_controller->unblock(address);
532 }
533
534 hitCallback(request, mach, data, success,
535 request->issue_time, forwardRequestTime, firstResponseTime,
536 isRegion);
537}
538
539void
540GPUCoalescer::readCallback(Addr address, DataBlock& data)
541{
542 readCallback(address, MachineType_NULL, data);
543}
544
545void
546GPUCoalescer::readCallback(Addr address,
547 MachineType mach,
548 DataBlock& data)
549{
550 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
551}
552
553void
554GPUCoalescer::readCallback(Addr address,
555 MachineType mach,
556 DataBlock& data,
557 Cycles initialRequestTime,
558 Cycles forwardRequestTime,
559 Cycles firstResponseTime)
560{
561
562 readCallback(address, mach, data,
563 initialRequestTime, forwardRequestTime, firstResponseTime,
564 false);
565}
566
567void
568GPUCoalescer::readCallback(Addr address,
569 MachineType mach,
570 DataBlock& data,
571 Cycles initialRequestTime,
572 Cycles forwardRequestTime,
573 Cycles firstResponseTime,
574 bool isRegion)
575{
576 assert(address == makeLineAddress(address));
577 assert(m_readRequestTable.count(makeLineAddress(address)));
578
579 DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
580 RequestTable::iterator i = m_readRequestTable.find(address);
581 assert(i != m_readRequestTable.end());
582 GPUCoalescerRequest* request = i->second;
583
584 m_readRequestTable.erase(i);
585 markRemoved();
586
587 assert((request->m_type == RubyRequestType_LD) ||
588 (request->m_type == RubyRequestType_IFETCH));
589
590 hitCallback(request, mach, data, true,
591 request->issue_time, forwardRequestTime, firstResponseTime,
592 isRegion);
593}
594
595void
596GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
597 MachineType mach,
598 DataBlock& data,
599 bool success,
600 Cycles initialRequestTime,
601 Cycles forwardRequestTime,
602 Cycles firstResponseTime,
603 bool isRegion)
604{
605 PacketPtr pkt = srequest->pkt;
606 Addr request_address = pkt->getAddr();
607 Addr request_line_address = makeLineAddress(request_address);
608
609 RubyRequestType type = srequest->m_type;
610
611 // Set this cache entry to the most recently used
612 if (type == RubyRequestType_IFETCH) {
613 if (m_instCache_ptr->isTagPresent(request_line_address))
614 m_instCache_ptr->setMRU(request_line_address);
615 } else {
616 if (m_dataCache_ptr->isTagPresent(request_line_address))
617 m_dataCache_ptr->setMRU(request_line_address);
618 }
619
620 recordMissLatency(srequest, mach,
621 initialRequestTime,
622 forwardRequestTime,
623 firstResponseTime,
624 success, isRegion);
625 // Update the data.
626 //
627 // This must be done for every request coalesced onto this cache line.
628 int len = reqCoalescer[request_line_address].size();
629 std::vector<PacketPtr> mylist;
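// Read-like requests (loads, ifetches, atomics that return data, and
// RMW/locked/linked reads) copy bytes out of the Ruby DataBlock into
// the packet; writes copy the packet's bytes into the DataBlock.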
630 for (int i = 0; i < len; ++i) {
631 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
632 assert(type == reqCoalescer[request_line_address][i].primaryType);
633 request_address = pkt->getAddr();
634 request_line_address = makeLineAddress(pkt->getAddr());
635 if (pkt->getPtr<uint8_t>()) {
636 if ((type == RubyRequestType_LD) ||
637 (type == RubyRequestType_ATOMIC) ||
638 (type == RubyRequestType_ATOMIC_RETURN) ||
639 (type == RubyRequestType_IFETCH) ||
640 (type == RubyRequestType_RMW_Read) ||
641 (type == RubyRequestType_Locked_RMW_Read) ||
642 (type == RubyRequestType_Load_Linked)) {
643 memcpy(pkt->getPtr<uint8_t>(),
644 data.getData(getOffset(request_address),
645 pkt->getSize()),
646 pkt->getSize());
647 } else {
648 data.setData(pkt->getPtr<uint8_t>(),
649 getOffset(request_address), pkt->getSize());
650 }
651 } else {
652 DPRINTF(MemoryAccess,
653 "WARNING. Data not transfered from Ruby to M5 for type " \
654 "%s\n",
655 RubyRequestType_to_string(type));
656 }
657
658 // If using the RubyTester, update the RubyTester sender state's
659 // subBlock with the received data. The tester will later access
660 // this state.
661 // Note: RubyPort will access its sender state before the
662 // RubyTester.
663 if (m_usingRubyTester) {
664 RubyPort::SenderState *requestSenderState =
665 safe_cast<RubyPort::SenderState*>(pkt->senderState);
666 RubyTester::SenderState* testerSenderState =
667 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
668 testerSenderState->subBlock.mergeFrom(data);
669 }
670
671 mylist.push_back(pkt);
672 }
673 delete srequest;
674 reqCoalescer.erase(request_line_address);
675 assert(!reqCoalescer.count(request_line_address));
676
677
678
679 completeHitCallback(mylist, len);
680}
681
682bool
683GPUCoalescer::empty() const
684{
685 return m_writeRequestTable.empty() && m_readRequestTable.empty();
686}
687
688 // Analyzes the packet to see if this request can be coalesced.
689 // If the request can be coalesced, it is added to the reqCoalescer table
690 // and makeRequest returns RequestStatus_Issued.
691 // If this is the first request to a cacheline, the request is added to
692 // both the newRequests queue and the reqCoalescer table; makeRequest
693 // returns RequestStatus_Issued.
694 // If there is a pending request to this cacheline and this request
695 // can't be coalesced, RequestStatus_Aliased is returned and
696 // the packet needs to be reissued.
697RequestStatus
698GPUCoalescer::makeRequest(PacketPtr pkt)
699{
700 // Check for GPU Barrier Kernel End or Kernel Begin
701 // Leave these to be handled by the child class
702 // Kernel End/Barrier = isFlush + isRelease
703 // Kernel Begin = isFlush + isAcquire
704 if (pkt->req->isKernel()) {
705 if (pkt->req->isAcquire()) {
706 // This is a Kernel Begin leave handling to
707 // virtual xCoalescer::makeRequest
708 return RequestStatus_Issued;
709 } else if (pkt->req->isRelease()) {
710 // This is a Kernel End leave handling to
711 // virtual xCoalescer::makeRequest
712 // If we are here then we didn't call
713 // a virtual version of this function
714 // so we will also schedule the callback
715 int wf_id = 0;
716 if (pkt->req->hasContextId()) {
717 wf_id = pkt->req->contextId();
718 }
719 insertKernel(wf_id, pkt);
720 newKernelEnds.push_back(wf_id);
721 if (!issueEvent.scheduled()) {
722 schedule(issueEvent, curTick());
723 }
724 return RequestStatus_Issued;
725 }
726 }
727
728 // If the number of outstanding requests is greater than the max
729 // allowed, return RequestStatus_BufferFull. This logic can be
730 // extended to support proper backpressure.
731 if (m_outstanding_count >= m_max_outstanding_requests) {
732 return RequestStatus_BufferFull;
733 }
734
735 RubyRequestType primary_type = RubyRequestType_NULL;
736 RubyRequestType secondary_type = RubyRequestType_NULL;
737
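// primary_type is what the coalescer tracks in its request tables and
// checks for aliasing; secondary_type is what is actually sent to the
// Ruby protocol when the request is issued.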
738 if (pkt->isLLSC()) {
739 //
740 // Alpha LL/SC instructions need to be handled carefully by the cache
741 // coherence protocol to ensure they follow the proper semantics. In
742 // particular, by identifying the operations as atomic, the protocol
743 // should understand that migratory sharing optimizations should not
744 // be performed (i.e. a load between the LL and SC should not steal
745 // away exclusive permission).
746 //
747 if (pkt->isWrite()) {
748 primary_type = RubyRequestType_Store_Conditional;
749 } else {
750 assert(pkt->isRead());
751 primary_type = RubyRequestType_Load_Linked;
752 }
753 secondary_type = RubyRequestType_ATOMIC;
754 } else if (pkt->req->isLockedRMW()) {
755 //
756 // x86 locked instructions are translated to store cache coherence
757 // requests because these requests should always be treated as read
758 // exclusive operations and should leverage any migratory sharing
759 // optimization built into the protocol.
760 //
761 if (pkt->isWrite()) {
762 primary_type = RubyRequestType_Locked_RMW_Write;
763 } else {
764 assert(pkt->isRead());
765 primary_type = RubyRequestType_Locked_RMW_Read;
766 }
767 secondary_type = RubyRequestType_ST;
768 } else if (pkt->isAtomicOp()) {
769 //
770 // GPU Atomic Operation
771 //
772 primary_type = RubyRequestType_ATOMIC;
773 secondary_type = RubyRequestType_ATOMIC;
774 } else {
775 if (pkt->isRead()) {
776 if (pkt->req->isInstFetch()) {
777 primary_type = secondary_type = RubyRequestType_IFETCH;
778 } else {
779#if THE_ISA == X86_ISA
780 uint32_t flags = pkt->req->getFlags();
781 bool storeCheck = flags &
782 (TheISA::StoreCheck << TheISA::FlagShift);
783#else
784 bool storeCheck = false;
785#endif // X86_ISA
786 if (storeCheck) {
787 primary_type = RubyRequestType_RMW_Read;
788 secondary_type = RubyRequestType_ST;
789 } else {
790 primary_type = secondary_type = RubyRequestType_LD;
791 }
792 }
793 } else if (pkt->isWrite()) {
794 //
795 // Note: M5 packets do not differentiate ST from RMW_Write
796 //
797 primary_type = secondary_type = RubyRequestType_ST;
798 } else if (pkt->isFlush()) {
799 primary_type = secondary_type = RubyRequestType_FLUSH;
800 } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
801 if (assumingRfOCoherence) {
802 // If we reached here, this request must be a memFence.
803 // Since the protocol implements RfO, the coalescer can
804 // assume sequential consistency and schedule the callback
805 // immediately.
806 // Currently the code implements fence callbacks
807 // by reusing the mechanism for kernel completions.
808 // This should be fixed.
809 int wf_id = 0;
810 if (pkt->req->hasContextId()) {
811 wf_id = pkt->req->contextId();
812 }
813 insertKernel(wf_id, pkt);
814 newKernelEnds.push_back(wf_id);
815 if (!issueEvent.scheduled()) {
816 schedule(issueEvent, curTick());
817 }
818 return RequestStatus_Issued;
819 } else {
820 // If not RfO, return issued here and let the child coalescer
821 // take care of it.
822 return RequestStatus_Issued;
823 }
824 } else {
825 panic("Unsupported ruby packet type\n");
826 }
827 }
828
829 // Check if there is any pending request to this cache line from
830 // previous cycles.
831 // If there is a pending request, return aliased. Since coalescing
832 // across time is not permitted, aliased requests are not coalesced.
833 // If a request for this address has already been issued, we must block
834 RequestStatus status = getRequestStatus(pkt, primary_type);
835 if (status != RequestStatus_Ready)
836 return status;
837
838 Addr line_addr = makeLineAddress(pkt->getAddr());
839
840 // Check if this request can be coalesced with previous
841 // requests from this cycle.
842 if (!reqCoalescer.count(line_addr)) {
843 // This is the first access to this cache line.
844 // A new request to the memory subsystem has to be
845 // made in the next cycle for this cache line, so
846 // add this line addr to the "newRequests" queue
847 newRequests.push_back(line_addr);
848
849 // There was a request to this cache line in this cycle,
850 // let us see if we can coalesce this request with the previous
851 // requests from this cycle
852 } else if (primary_type !=
853 reqCoalescer[line_addr][0].primaryType) {
854 // can't coalesce loads, stores and atomics!
855 return RequestStatus_Aliased;
856 } else if (pkt->req->isLockedRMW() ||
857 reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
858 // can't coalesce locked accesses, but can coalesce atomics!
859 return RequestStatus_Aliased;
860 } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
861 pkt->req->contextId() !=
862 reqCoalescer[line_addr][0].pkt->req->contextId()) {
863 // can't coalesce releases from different wavefronts
864 return RequestStatus_Aliased;
865 }
866
867 // in addition to the packet, we need to save both request types
868 reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
869 if (!issueEvent.scheduled())
870 schedule(issueEvent, curTick());
871 // TODO: issue hardware prefetches here
872 return RequestStatus_Issued;
873}
874
875void
876GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
877{
878
879 int proc_id = -1;
880 if (pkt != NULL && pkt->req->hasContextId()) {
881 proc_id = pkt->req->contextId();
882 }
883
884 // If valid, copy the pc to the ruby request
885 Addr pc = 0;
886 if (pkt->req->hasPC()) {
887 pc = pkt->req->getPC();
888 }
889
890 // At the moment, setting scopes only matters for GPU spill space
891 // accesses, i.e. when pkt->req->isStack() is true.
892 // That scope is REPLACE, since spill data does not need to be
893 // flushed at the end of a kernel.
894 // Private and local data, in contrast, may need to be visible
895 // at the end of the kernel. The segment and scope are derived
896 // below from the request's fields.
897 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
898 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
899
900 Addr line_addr = makeLineAddress(pkt->getAddr());
901
902 // Create a WriteMask that records written bytes
903 // and atomic operations. This enables partial writes
904 // and partial reads of those writes.
905 DataBlock dataBlock;
906 dataBlock.clear();
907 uint32_t blockSize = RubySystem::getBlockSizeBytes();
908 std::vector<bool> accessMask(blockSize,false);
909 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
910 uint32_t tableSize = reqCoalescer[line_addr].size();
911 for (int i = 0; i < tableSize; i++) {
912 PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
913 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
914 uint32_t tmpSize = tmpPkt->getSize();
915 if (tmpPkt->isAtomicOp()) {
916 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
917 tmpPkt->getAtomicOp());
918 atomicOps.push_back(tmpAtomicOp);
919 } else if (tmpPkt->isWrite()) {
920 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
921 tmpOffset, tmpSize);
922 }
923 for (int j = 0; j < tmpSize; j++) {
924 accessMask[tmpOffset + j] = true;
925 }
926 }
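// Build a single RubyRequest for the whole cache line; atomic packets
// additionally carry the (offset, AtomicOpFunctor*) pairs collected
// above.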
927 std::shared_ptr<RubyRequest> msg;
928 if (pkt->isAtomicOp()) {
929 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
930 pkt->getPtr<uint8_t>(),
931 pkt->getSize(), pc, secondary_type,
932 RubyAccessMode_Supervisor, pkt,
933 PrefetchBit_No, proc_id, 100,
934 blockSize, accessMask,
935 dataBlock, atomicOps,
936 accessScope, accessSegment);
937 } else {
938 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
939 pkt->getPtr<uint8_t>(),
940 pkt->getSize(), pc, secondary_type,
941 RubyAccessMode_Supervisor, pkt,
942 PrefetchBit_No, proc_id, 100,
943 blockSize, accessMask,
944 dataBlock,
945 accessScope, accessSegment);
946 }
947 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
948 curTick(), m_version, "Coal", "Begin", "", "",
949 printAddress(msg->getPhysicalAddress()),
950 RubyRequestType_to_string(secondary_type));
951
952 fatal_if(secondary_type == RubyRequestType_IFETCH,
953 "there should not be any I-Fetch requests in the GPU Coalescer");
954
955 // Send the message to the cache controller
956 fatal_if(m_data_cache_hit_latency == 0,
957 "should not have a latency of zero");
958
959 assert(m_mandatory_q_ptr);
960 m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
961}
962
963template <class KEY, class VALUE>
964std::ostream &
965operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
966{
967 out << "[";
968 for (auto i = map.begin(); i != map.end(); ++i)
969 out << " " << i->first << "=" << i->second;
970 out << " ]";
971
972 return out;
973}
974
975void
976GPUCoalescer::print(ostream& out) const
977{
978 out << "[GPUCoalescer: " << m_version
979 << ", outstanding requests: " << m_outstanding_count
980 << ", read request table: " << m_readRequestTable
981 << ", write request table: " << m_writeRequestTable
982 << "]";
983}
984
985 // This can be called from setState whenever coherence permissions are
986 // upgraded. When invoked, coherence violations will be checked for the
987 // given block.
988void
989GPUCoalescer::checkCoherence(Addr addr)
990{
991#ifdef CHECK_COHERENCE
992 m_ruby_system->checkGlobalCoherenceInvariant(addr);
993#endif
994}
995
996void
997GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
998 DPRINTF(RubyStats, "Recorded statistic: %s\n",
999 SequencerRequestType_to_string(requestType));
1000}
1001
1002
1003void
1004GPUCoalescer::completeIssue()
1005{
1006 // newRequests has the cacheline addresses of all the
1007 // requests which need to be issued to the memory subsystem
1008 // in this cycle
1009 int len = newRequests.size();
1010 DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1011 for (int i = 0; i < len; ++i) {
1012 // Get the requests from reqCoalescer table. Get only the
1013 // first request for each cacheline, the remaining requests
1014 // can be coalesced with the first request. So, only
1015 // one request is issued per cacheline.
1016 RequestDesc info = reqCoalescer[newRequests[i]][0];
1017 PacketPtr pkt = info.pkt;
1018 DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1019 i, pkt->req->getPaddr());
1020 // Insert this request to the read/writeRequestTables. These tables
1021 // are used to track aliased requests in makeRequest subroutine
1022 bool found = insertRequest(pkt, info.primaryType);
1023
1024 if (found) {
1025 panic("GPUCoalescer::makeRequest should never be called if the "
1026 "request is already outstanding\n");
1027 }
1028
1029 // Issue request to ruby subsystem
1030 issueRequest(pkt, info.secondaryType);
1031 }
1032 newRequests.clear();
1033
1034 // Handle any kernel-end releases issued this cycle
1035 len = newKernelEnds.size();
1036 for (int i = 0; i < len; i++) {
1037 kernelCallback(newKernelEnds[i]);
1038 }
1039 newKernelEnds.clear();
1040}
1041
1042void
1043GPUCoalescer::evictionCallback(Addr address)
1044{
1045 ruby_eviction_callback(address);
1046}
1047
1048void
1049GPUCoalescer::kernelCallback(int wavefront_id)
1050{
1051 assert(kernelEndList.count(wavefront_id));
1052
1053 ruby_hit_callback(kernelEndList[wavefront_id]);
1054
1055 kernelEndList.erase(wavefront_id);
1056}
1057
1058void
1059GPUCoalescer::atomicCallback(Addr address,
1060 MachineType mach,
1061 const DataBlock& data)
1062{
1063 assert(address == makeLineAddress(address));
1064
1065 DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1066 assert(m_writeRequestTable.count(makeLineAddress(address)));
1067
1068 RequestTable::iterator i = m_writeRequestTable.find(address);
1069 assert(i != m_writeRequestTable.end());
1070 GPUCoalescerRequest* srequest = i->second;
1071
1072 m_writeRequestTable.erase(i);
1073 markRemoved();
1074
1075 assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1076 (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1077 (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1078
1079
1080 // Atomics don't write to cache, so there is no MRU update...
1081
1082 recordMissLatency(srequest, mach,
1083 srequest->issue_time, Cycles(0), Cycles(0), true, false);
1084
1085 PacketPtr pkt = srequest->pkt;
1086 Addr request_address = pkt->getAddr();
1087 Addr request_line_address = makeLineAddress(pkt->getAddr());
1088
1089 int len = reqCoalescer[request_line_address].size();
1090 std::vector<PacketPtr> mylist;
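// As in hitCallback, copy the line data back into every coalesced
// packet. The returned data is the value *before* the atomic was
// applied, and no-return atomics skip the copy entirely.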
1091 for (int i = 0; i < len; ++i) {
1092 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1093 assert(srequest->m_type ==
1094 reqCoalescer[request_line_address][i].primaryType);
1095 request_address = (pkt->getAddr());
1096 request_line_address = makeLineAddress(request_address);
1097 if (pkt->getPtr<uint8_t>() &&
1098 srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1099 /* atomics are done in memory, and return the data *before* the atomic op... */
1100 memcpy(pkt->getPtr<uint8_t>(),
1101 data.getData(getOffset(request_address),
1102 pkt->getSize()),
1103 pkt->getSize());
1104 } else {
1105 DPRINTF(MemoryAccess,
1106 "WARNING. Data not transfered from Ruby to M5 for type " \
1107 "%s\n",
1108 RubyRequestType_to_string(srequest->m_type));
1109 }
1110
1111 // If using the RubyTester, update the RubyTester sender state's
1112 // subBlock with the received data. The tester will later access
1113 // this state.
1114 // Note: RubyPort will access its sender state before the
1115 // RubyTester.
1116 if (m_usingRubyTester) {
1117 RubyPort::SenderState *requestSenderState =
1118 safe_cast<RubyPort::SenderState*>(pkt->senderState);
1119 RubyTester::SenderState* testerSenderState =
1120 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1121 testerSenderState->subBlock.mergeFrom(data);
1122 }
1123
1124 mylist.push_back(pkt);
1125 }
1126 delete srequest;
1127 reqCoalescer.erase(request_line_address);
1128 assert(!reqCoalescer.count(request_line_address));
1129
1130 completeHitCallback(mylist, len);
1131}
1132
1133void
1134GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1135{
1136 if (myMachID == senderMachID) {
1137 CP_TCPLdHits++;
1138 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1139 CP_TCPLdTransfers++;
1140 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1141 CP_TCCLdHits++;
1142 } else {
1143 CP_LdMiss++;
1144 }
1145}
1146
1147void
1148GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1149{
1150 if (myMachID == senderMachID) {
1151 CP_TCPStHits++;
1152 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1153 CP_TCPStTransfers++;
1154 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1155 CP_TCCStHits++;
1156 } else {
1157 CP_StMiss++;
1158 }
1159}
1160
1161void
1162GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1163{
1164 for (int i = 0; i < len; ++i) {
1165 RubyPort::SenderState *ss =
1166 safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1167 MemSlavePort *port = ss->port;
1168 assert(port != NULL);
1169
1170 mylist[i]->senderState = ss->predecessor;
1171 delete ss;
1172 port->hitCallback(mylist[i]);
1173 trySendRetries();
1174 }
1175
1176 testDrainComplete();
1177}
1178
1179PacketPtr
1180GPUCoalescer::mapAddrToPkt(Addr address)
1181{
1182 RequestTable::iterator i = m_readRequestTable.find(address);
1183 assert(i != m_readRequestTable.end());
1184 GPUCoalescerRequest* request = i->second;
1185 return request->pkt;
1186}
1187
1188void
1189GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1190 MachineType mach,
1191 Cycles initialRequestTime,
1192 Cycles forwardRequestTime,
1193 Cycles firstResponseTime,
1194 bool success, bool isRegion)
1195{
1196 RubyRequestType type = srequest->m_type;
1197 Cycles issued_time = srequest->issue_time;
1198 Cycles completion_time = curCycle();
1199 assert(completion_time >= issued_time);
1200 Cycles total_lat = completion_time - issued_time;
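// total_lat spans from when the request was inserted into the request
// tables (issue_time) until this callback completes (curCycle()).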
1201
1202 // cache stats (valid for RfO protocol only)
1203 if (mach == MachineType_TCP) {
1204 if (type == RubyRequestType_LD) {
1205 GPU_TCPLdHits++;
1206 } else {
1207 GPU_TCPStHits++;
1208 }
1209 } else if (mach == MachineType_L1Cache_wCC) {
1210 if (type == RubyRequestType_LD) {
1211 GPU_TCPLdTransfers++;
1212 } else {
1213 GPU_TCPStTransfers++;
1214 }
1215 } else if (mach == MachineType_TCC) {
1216 if (type == RubyRequestType_LD) {
1217 GPU_TCCLdHits++;
1218 } else {
1219 GPU_TCCStHits++;
1220 }
1221 } else {
1222 if (type == RubyRequestType_LD) {
1223 GPU_LdMiss++;
1224 } else {
1225 GPU_StMiss++;
1226 }
1227 }
1228
1229 // Profile all access latency, even zero latency accesses
1230 m_latencyHist.sample(total_lat);
1231 m_typeLatencyHist[type]->sample(total_lat);
1232
1233 // Profile the miss latency for all non-zero demand misses
1234 if (total_lat != Cycles(0)) {
1235 m_missLatencyHist.sample(total_lat);
1236 m_missTypeLatencyHist[type]->sample(total_lat);
1237
1238 if (mach != MachineType_NUM) {
1239 m_missMachLatencyHist[mach]->sample(total_lat);
1240 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1241
1242 if ((issued_time <= initialRequestTime) &&
1243 (initialRequestTime <= forwardRequestTime) &&
1244 (forwardRequestTime <= firstResponseTime) &&
1245 (firstResponseTime <= completion_time)) {
1246
1247 m_IssueToInitialDelayHist[mach]->sample(
1248 initialRequestTime - issued_time);
1249 m_InitialToForwardDelayHist[mach]->sample(
1250 forwardRequestTime - initialRequestTime);
1251 m_ForwardToFirstResponseDelayHist[mach]->sample(
1252 firstResponseTime - forwardRequestTime);
1253 m_FirstResponseToCompletionDelayHist[mach]->sample(
1254 completion_time - firstResponseTime);
1255 }
1256 }
1257
1258 }
1259
1260 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1261 curTick(), m_version, "Coal",
1262 success ? "Done" : "SC_Failed", "", "",
1263 printAddress(srequest->pkt->getAddr()), total_lat);
1264}
1265
1266void
1267GPUCoalescer::regStats()
1268{
1269 RubyPort::regStats();
1270
1271 // These statistical variables are not for display.
1272 // The profiler will collate these across different
1273 // coalescers and display those collated statistics.
1274 m_outstandReqHist.init(10);
1275 m_latencyHist.init(10);
1276 m_missLatencyHist.init(10);
1277
1278 for (int i = 0; i < RubyRequestType_NUM; i++) {
1279 m_typeLatencyHist.push_back(new Stats::Histogram());
1280 m_typeLatencyHist[i]->init(10);
1281
1282 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1283 m_missTypeLatencyHist[i]->init(10);
1284 }
1285
1286 for (int i = 0; i < MachineType_NUM; i++) {
1287 m_missMachLatencyHist.push_back(new Stats::Histogram());
1288 m_missMachLatencyHist[i]->init(10);
1289
1290 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1291 m_IssueToInitialDelayHist[i]->init(10);
1292
1293 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1294 m_InitialToForwardDelayHist[i]->init(10);
1295
1296 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1297 m_ForwardToFirstResponseDelayHist[i]->init(10);
1298
1299 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1300 m_FirstResponseToCompletionDelayHist[i]->init(10);
1301 }
1302
1303 for (int i = 0; i < RubyRequestType_NUM; i++) {
1304 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1305
1306 for (int j = 0; j < MachineType_NUM; j++) {
1307 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1308 m_missTypeMachLatencyHist[i][j]->init(10);
1309 }
1310 }
1311
1312 // GPU cache stats
1313 GPU_TCPLdHits
1314 .name(name() + ".gpu_tcp_ld_hits")
1315 .desc("loads that hit in the TCP")
1316 ;
1317 GPU_TCPLdTransfers
1318 .name(name() + ".gpu_tcp_ld_transfers")
1319 .desc("TCP to TCP load transfers")
1320 ;
1321 GPU_TCCLdHits
1322 .name(name() + ".gpu_tcc_ld_hits")
1323 .desc("loads that hit in the TCC")
1324 ;
1325 GPU_LdMiss
1326 .name(name() + ".gpu_ld_misses")
1327 .desc("loads that miss in the GPU")
1328 ;
1329
1330 GPU_TCPStHits
1331 .name(name() + ".gpu_tcp_st_hits")
1332 .desc("stores that hit in the TCP")
1333 ;
1334 GPU_TCPStTransfers
1335 .name(name() + ".gpu_tcp_st_transfers")
1336 .desc("TCP to TCP store transfers")
1337 ;
1338 GPU_TCCStHits
1339 .name(name() + ".gpu_tcc_st_hits")
1340 .desc("stores that hit in the TCC")
1341 ;
1342 GPU_StMiss
1343 .name(name() + ".gpu_st_misses")
1344 .desc("stores that miss in the GPU")
1345 ;
1346
1347 // CP cache stats
1348 CP_TCPLdHits
1349 .name(name() + ".cp_tcp_ld_hits")
1350 .desc("loads that hit in the TCP")
1351 ;
1352 CP_TCPLdTransfers
1353 .name(name() + ".cp_tcp_ld_transfers")
1354 .desc("TCP to TCP load transfers")
1355 ;
1356 CP_TCCLdHits
1357 .name(name() + ".cp_tcc_ld_hits")
1358 .desc("loads that hit in the TCC")
1359 ;
1360 CP_LdMiss
1361 .name(name() + ".cp_ld_misses")
1362 .desc("loads that miss in the GPU")
1363 ;
1364
1365 CP_TCPStHits
1366 .name(name() + ".cp_tcp_st_hits")
1367 .desc("stores that hit in the TCP")
1368 ;
1369 CP_TCPStTransfers
1370 .name(name() + ".cp_tcp_st_transfers")
1371 .desc("TCP to TCP store transfers")
1372 ;
1373 CP_TCCStHits
1374 .name(name() + ".cp_tcc_st_hits")
1375 .desc("stores that hit in the TCC")
1376 ;
1377 CP_StMiss
1378 .name(name() + ".cp_st_misses")
1379 .desc("stores that miss in the GPU")
1380 ;
1381}