GPUCoalescer.cc (12133:ca42be3276af) GPUCoalescer.cc (12334:e0ab29a34764)
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Sooraj Puthoor
34 */
35
36#include "base/logging.hh"
37#include "base/str.hh"
38#include "config/the_isa.hh"
39
40#if THE_ISA == X86_ISA
41#include "arch/x86/insts/microldstop.hh"
42
43#endif // X86_ISA
44#include "mem/ruby/system/GPUCoalescer.hh"
45
46#include "cpu/testers/rubytest/RubyTester.hh"
47#include "debug/GPUCoalescer.hh"
48#include "debug/MemoryAccess.hh"
49#include "debug/ProtocolTrace.hh"
50#include "debug/RubyPort.hh"
51#include "debug/RubyStats.hh"
52#include "gpu-compute/shader.hh"
53#include "mem/packet.hh"
54#include "mem/ruby/common/DataBlock.hh"
55#include "mem/ruby/common/SubBlock.hh"
56#include "mem/ruby/network/MessageBuffer.hh"
57#include "mem/ruby/profiler/Profiler.hh"
58#include "mem/ruby/slicc_interface/AbstractController.hh"
59#include "mem/ruby/slicc_interface/RubyRequest.hh"
60#include "mem/ruby/structures/CacheMemory.hh"
61#include "mem/ruby/system/RubySystem.hh"
62#include "params/RubyGPUCoalescer.hh"
63
64using namespace std;
65
66GPUCoalescer *
67RubyGPUCoalescerParams::create()
68{
69 return new GPUCoalescer(this);
70}
71
72HSAScope
73reqScopeToHSAScope(Request* req)
74{
75 HSAScope accessScope = HSAScope_UNSPECIFIED;
76 if (req->isScoped()) {
77 if (req->isWavefrontScope()) {
78 accessScope = HSAScope_WAVEFRONT;
79 } else if (req->isWorkgroupScope()) {
80 accessScope = HSAScope_WORKGROUP;
81 } else if (req->isDeviceScope()) {
82 accessScope = HSAScope_DEVICE;
83 } else if (req->isSystemScope()) {
84 accessScope = HSAScope_SYSTEM;
85 } else {
86 fatal("Bad scope type");
87 }
88 }
89 return accessScope;
90}
91
92HSASegment
93reqSegmentToHSASegment(Request* req)
94{
95 HSASegment accessSegment = HSASegment_GLOBAL;
96
97 if (req->isGlobalSegment()) {
98 accessSegment = HSASegment_GLOBAL;
99 } else if (req->isGroupSegment()) {
100 accessSegment = HSASegment_GROUP;
101 } else if (req->isPrivateSegment()) {
102 accessSegment = HSASegment_PRIVATE;
103 } else if (req->isKernargSegment()) {
104 accessSegment = HSASegment_KERNARG;
105 } else if (req->isReadonlySegment()) {
106 accessSegment = HSASegment_READONLY;
107 } else if (req->isSpillSegment()) {
108 accessSegment = HSASegment_SPILL;
109 } else if (req->isArgSegment()) {
110 accessSegment = HSASegment_ARG;
111 } else {
112 fatal("Bad segment type");
113 }
114
115 return accessSegment;
116}
117
118GPUCoalescer::GPUCoalescer(const Params *p)
119 : RubyPort(p),
120 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
121 false, Event::Progress_Event_Pri),
122 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
123{
124 m_store_waiting_on_load_cycles = 0;
125 m_store_waiting_on_store_cycles = 0;
126 m_load_waiting_on_store_cycles = 0;
127 m_load_waiting_on_load_cycles = 0;
128
129 m_outstanding_count = 0;
130
131 m_max_outstanding_requests = 0;
132 m_deadlock_threshold = 0;
133 m_instCache_ptr = nullptr;
134 m_dataCache_ptr = nullptr;
135
136 m_instCache_ptr = p->icache;
137 m_dataCache_ptr = p->dcache;
138 m_max_outstanding_requests = p->max_outstanding_requests;
139 m_deadlock_threshold = p->deadlock_threshold;
140
141 assert(m_max_outstanding_requests > 0);
142 assert(m_deadlock_threshold > 0);
143 assert(m_instCache_ptr);
144 assert(m_dataCache_ptr);
145
146 m_data_cache_hit_latency = p->dcache_hit_latency;
147
148 m_runningGarnetStandalone = p->garnet_standalone;
149 assumingRfOCoherence = p->assume_rfo;
150}
151
152GPUCoalescer::~GPUCoalescer()
153{
154}
155
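// Deadlock check, scheduled via deadlockCheckEvent: walk both request
// tables and panic if any outstanding request has been pending for at
// least m_deadlock_threshold cycles; otherwise keep rescheduling the
// check while requests remain outstanding.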
156void
157GPUCoalescer::wakeup()
158{
159 // Check for deadlock of any of the requests
160 Cycles current_time = curCycle();
161
162 // Check across all outstanding requests
163 int total_outstanding = 0;
164
165 RequestTable::iterator read = m_readRequestTable.begin();
166 RequestTable::iterator read_end = m_readRequestTable.end();
167 for (; read != read_end; ++read) {
168 GPUCoalescerRequest* request = read->second;
169 if (current_time - request->issue_time < m_deadlock_threshold)
170 continue;
171
172 panic("Possible Deadlock detected. Aborting!\n"
173 "version: %d request.paddr: 0x%x m_readRequestTable: %d "
174 "current time: %u issue_time: %d difference: %d\n", m_version,
175 request->pkt->getAddr(), m_readRequestTable.size(),
176 current_time * clockPeriod(), request->issue_time * clockPeriod(),
177 (current_time - request->issue_time)*clockPeriod());
178 }
179
180 RequestTable::iterator write = m_writeRequestTable.begin();
181 RequestTable::iterator write_end = m_writeRequestTable.end();
182 for (; write != write_end; ++write) {
183 GPUCoalescerRequest* request = write->second;
184 if (current_time - request->issue_time < m_deadlock_threshold)
185 continue;
186
187 panic("Possible Deadlock detected. Aborting!\n"
188 "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
189 "current time: %u issue_time: %d difference: %d\n", m_version,
190 request->pkt->getAddr(), m_writeRequestTable.size(),
191 current_time * clockPeriod(), request->issue_time * clockPeriod(),
192 (current_time - request->issue_time) * clockPeriod());
193 }
194
195 total_outstanding += m_writeRequestTable.size();
196 total_outstanding += m_readRequestTable.size();
197
198 assert(m_outstanding_count == total_outstanding);
199
200 if (m_outstanding_count > 0) {
201 // If there are still outstanding requests, keep checking
202 schedule(deadlockCheckEvent,
203 m_deadlock_threshold * clockPeriod() +
204 curTick());
205 }
206}
207
208void
209GPUCoalescer::resetStats()
210{
211 m_latencyHist.reset();
212 m_missLatencyHist.reset();
213 for (int i = 0; i < RubyRequestType_NUM; i++) {
214 m_typeLatencyHist[i]->reset();
215 m_missTypeLatencyHist[i]->reset();
216 for (int j = 0; j < MachineType_NUM; j++) {
217 m_missTypeMachLatencyHist[i][j]->reset();
218 }
219 }
220
221 for (int i = 0; i < MachineType_NUM; i++) {
222 m_missMachLatencyHist[i]->reset();
223
224 m_IssueToInitialDelayHist[i]->reset();
225 m_InitialToForwardDelayHist[i]->reset();
226 m_ForwardToFirstResponseDelayHist[i]->reset();
227 m_FirstResponseToCompletionDelayHist[i]->reset();
228 }
229}
230
231void
232GPUCoalescer::printProgress(ostream& out) const
233{
234}
235
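// Classify the request against requests outstanding from earlier cycles:
// report BufferFull if the mandatory queue has no slot, Aliased if the
// controller has blocked this line (Locked_RMW_Write is exempt) or if
// another read or write to the same line is still in flight, and Ready
// otherwise.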
236RequestStatus
237GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
238{
239 Addr line_addr = makeLineAddress(pkt->getAddr());
240
241 if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
242 return RequestStatus_BufferFull;
243 }
244
245 if (m_controller->isBlocked(line_addr) &&
246 request_type != RubyRequestType_Locked_RMW_Write) {
247 return RequestStatus_Aliased;
248 }
249
250 if ((request_type == RubyRequestType_ST) ||
251 (request_type == RubyRequestType_ATOMIC) ||
252 (request_type == RubyRequestType_ATOMIC_RETURN) ||
253 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
254 (request_type == RubyRequestType_RMW_Read) ||
255 (request_type == RubyRequestType_RMW_Write) ||
256 (request_type == RubyRequestType_Load_Linked) ||
257 (request_type == RubyRequestType_Store_Conditional) ||
258 (request_type == RubyRequestType_Locked_RMW_Read) ||
259 (request_type == RubyRequestType_Locked_RMW_Write) ||
260 (request_type == RubyRequestType_FLUSH)) {
261
262 // Check if there is any outstanding read request for the same
263 // cache line.
264 if (m_readRequestTable.count(line_addr) > 0) {
265 m_store_waiting_on_load_cycles++;
266 return RequestStatus_Aliased;
267 }
268
269 if (m_writeRequestTable.count(line_addr) > 0) {
270 // There is an outstanding write request for the cache line
271 m_store_waiting_on_store_cycles++;
272 return RequestStatus_Aliased;
273 }
274 } else {
275 // Check if there is any outstanding write request for the same
276 // cache line.
277 if (m_writeRequestTable.count(line_addr) > 0) {
278 m_load_waiting_on_store_cycles++;
279 return RequestStatus_Aliased;
280 }
281
282 if (m_readRequestTable.count(line_addr) > 0) {
283 // There is an outstanding read request for the cache line
284 m_load_waiting_on_load_cycles++;
285 return RequestStatus_Aliased;
286 }
287 }
288
289 return RequestStatus_Ready;
290
291}
292
293
294
295// sets the kernelEndList
296void
297GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
298{
299 // It is not clear whether this can ever happen, but be careful
300 // here so that it does not turn into a simulator hang in the
301 // future.
302 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
303 assert(kernelEndList.count(wavefront_id) == 0);
304
305 kernelEndList[wavefront_id] = pkt;
306 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
307 kernelEndList.size());
308}
309
310
311// Insert the request into the correct request table. Return true if
312// the entry was already present.
313bool
314GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
315{
316 assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
317 pkt->req->isLockedRMW() ||
318 !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
319
320 int total_outstanding M5_VAR_USED =
321 m_writeRequestTable.size() + m_readRequestTable.size();
322
323 assert(m_outstanding_count == total_outstanding);
324
325 // See if we should schedule a deadlock check
326 if (!deadlockCheckEvent.scheduled()) {
327 schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
328 }
329
330 Addr line_addr = makeLineAddress(pkt->getAddr());
331 if ((request_type == RubyRequestType_ST) ||
332 (request_type == RubyRequestType_ATOMIC) ||
333 (request_type == RubyRequestType_ATOMIC_RETURN) ||
334 (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
335 (request_type == RubyRequestType_RMW_Read) ||
336 (request_type == RubyRequestType_RMW_Write) ||
337 (request_type == RubyRequestType_Load_Linked) ||
338 (request_type == RubyRequestType_Store_Conditional) ||
339 (request_type == RubyRequestType_Locked_RMW_Read) ||
340 (request_type == RubyRequestType_Locked_RMW_Write) ||
341 (request_type == RubyRequestType_FLUSH)) {
342
343 pair<RequestTable::iterator, bool> r =
344 m_writeRequestTable.insert(RequestTable::value_type(line_addr,
345 (GPUCoalescerRequest*) NULL));
346 if (r.second) {
347 RequestTable::iterator i = r.first;
348 i->second = new GPUCoalescerRequest(pkt, request_type,
349 curCycle());
350 DPRINTF(GPUCoalescer,
351 "Inserting write request for paddr %#x for type %d\n",
352 pkt->req->getPaddr(), i->second->m_type);
353 m_outstanding_count++;
354 } else {
355 return true;
356 }
357 } else {
358 pair<RequestTable::iterator, bool> r =
359 m_readRequestTable.insert(RequestTable::value_type(line_addr,
360 (GPUCoalescerRequest*) NULL));
361
362 if (r.second) {
363 RequestTable::iterator i = r.first;
364 i->second = new GPUCoalescerRequest(pkt, request_type,
365 curCycle());
366 DPRINTF(GPUCoalescer,
367 "Inserting read request for paddr %#x for type %d\n",
368 pkt->req->getPaddr(), i->second->m_type);
369 m_outstanding_count++;
370 } else {
371 return true;
372 }
373 }
374
375 m_outstandReqHist.sample(m_outstanding_count);
376
377 total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
378 assert(m_outstanding_count == total_outstanding);
379
380 return false;
381}
382
383void
384GPUCoalescer::markRemoved()
385{
386 m_outstanding_count--;
387 assert(m_outstanding_count ==
388 m_writeRequestTable.size() + m_readRequestTable.size());
389}
390
391void
392GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
393{
394 assert(m_outstanding_count ==
395 m_writeRequestTable.size() + m_readRequestTable.size());
396
397 Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
398 if ((srequest->m_type == RubyRequestType_ST) ||
399 (srequest->m_type == RubyRequestType_RMW_Read) ||
400 (srequest->m_type == RubyRequestType_RMW_Write) ||
401 (srequest->m_type == RubyRequestType_Load_Linked) ||
402 (srequest->m_type == RubyRequestType_Store_Conditional) ||
403 (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
404 (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
405 m_writeRequestTable.erase(line_addr);
406 } else {
407 m_readRequestTable.erase(line_addr);
408 }
409
410 markRemoved();
411}
412
413bool
414GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
415{
416 //
417 // The success flag indicates whether the LLSC operation was successful.
418 // LL ops will always succeed, but SC may fail if the cache line is no
419 // longer locked.
420 //
421 bool success = true;
422 if (request->m_type == RubyRequestType_Store_Conditional) {
423 if (!m_dataCache_ptr->isLocked(address, m_version)) {
424 //
425 // For failed SC requests, indicate the failure to the cpu by
426 // setting the extra data to zero.
427 //
428 request->pkt->req->setExtraData(0);
429 success = false;
430 } else {
431 //
432 // For successful SC requests, indicate the success to the cpu by
433 // setting the extra data to one.
434 //
435 request->pkt->req->setExtraData(1);
436 }
437 //
438 // Independent of success, all SC operations must clear the lock
439 //
440 m_dataCache_ptr->clearLocked(address);
441 } else if (request->m_type == RubyRequestType_Load_Linked) {
442 //
443 // Note: To fully follow Alpha LLSC semantics, should the LL clear any
444 // previously locked cache lines?
445 //
446 m_dataCache_ptr->setLocked(address, m_version);
447 } else if ((m_dataCache_ptr->isTagPresent(address)) &&
448 (m_dataCache_ptr->isLocked(address, m_version))) {
449 //
450 // Normal writes should clear the locked address
451 //
452 m_dataCache_ptr->clearLocked(address);
453 }
454 return success;
455}
456
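// The shorter writeCallback() overloads below simply forward to the full
// version, filling in MachineType_NULL, zero Cycles timestamps, and
// isRegion = false as defaults.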
457void
458GPUCoalescer::writeCallback(Addr address, DataBlock& data)
459{
460 writeCallback(address, MachineType_NULL, data);
461}
462
463void
464GPUCoalescer::writeCallback(Addr address,
465 MachineType mach,
466 DataBlock& data)
467{
468 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
469}
470
471void
472GPUCoalescer::writeCallback(Addr address,
473 MachineType mach,
474 DataBlock& data,
475 Cycles initialRequestTime,
476 Cycles forwardRequestTime,
477 Cycles firstResponseTime)
478{
479 writeCallback(address, mach, data,
480 initialRequestTime, forwardRequestTime, firstResponseTime,
481 false);
482}
483
484void
485GPUCoalescer::writeCallback(Addr address,
486 MachineType mach,
487 DataBlock& data,
488 Cycles initialRequestTime,
489 Cycles forwardRequestTime,
490 Cycles firstResponseTime,
491 bool isRegion)
492{
493 assert(address == makeLineAddress(address));
494
495 DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
496 assert(m_writeRequestTable.count(makeLineAddress(address)));
497
498 RequestTable::iterator i = m_writeRequestTable.find(address);
499 assert(i != m_writeRequestTable.end());
500 GPUCoalescerRequest* request = i->second;
501
502 m_writeRequestTable.erase(i);
503 markRemoved();
504
505 assert((request->m_type == RubyRequestType_ST) ||
506 (request->m_type == RubyRequestType_ATOMIC) ||
507 (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
508 (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
509 (request->m_type == RubyRequestType_RMW_Read) ||
510 (request->m_type == RubyRequestType_RMW_Write) ||
511 (request->m_type == RubyRequestType_Load_Linked) ||
512 (request->m_type == RubyRequestType_Store_Conditional) ||
513 (request->m_type == RubyRequestType_Locked_RMW_Read) ||
514 (request->m_type == RubyRequestType_Locked_RMW_Write) ||
515 (request->m_type == RubyRequestType_FLUSH));
516
517
518 //
519 // For Alpha, properly handle LL, SC, and write requests with respect to
520 // locked cache blocks.
521 //
522 // Not valid for the Garnet_standalone protocol
523 //
524 bool success = true;
525 if (!m_runningGarnetStandalone)
526 success = handleLlsc(address, request);
527
528 if (request->m_type == RubyRequestType_Locked_RMW_Read) {
529 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
530 } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
531 m_controller->unblock(address);
532 }
533
534 hitCallback(request, mach, data, success,
535 request->issue_time, forwardRequestTime, firstResponseTime,
536 isRegion);
537}
538
539void
540GPUCoalescer::readCallback(Addr address, DataBlock& data)
541{
542 readCallback(address, MachineType_NULL, data);
543}
544
545void
546GPUCoalescer::readCallback(Addr address,
547 MachineType mach,
548 DataBlock& data)
549{
550 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
551}
552
553void
554GPUCoalescer::readCallback(Addr address,
555 MachineType mach,
556 DataBlock& data,
557 Cycles initialRequestTime,
558 Cycles forwardRequestTime,
559 Cycles firstResponseTime)
560{
561
562 readCallback(address, mach, data,
563 initialRequestTime, forwardRequestTime, firstResponseTime,
564 false);
565}
566
567void
568GPUCoalescer::readCallback(Addr address,
569 MachineType mach,
570 DataBlock& data,
571 Cycles initialRequestTime,
572 Cycles forwardRequestTime,
573 Cycles firstResponseTime,
574 bool isRegion)
575{
576 assert(address == makeLineAddress(address));
577 assert(m_readRequestTable.count(makeLineAddress(address)));
578
579 DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
580 RequestTable::iterator i = m_readRequestTable.find(address);
581 assert(i != m_readRequestTable.end());
582 GPUCoalescerRequest* request = i->second;
583
584 m_readRequestTable.erase(i);
585 markRemoved();
586
587 assert((request->m_type == RubyRequestType_LD) ||
588 (request->m_type == RubyRequestType_IFETCH));
589
590 hitCallback(request, mach, data, true,
591 request->issue_time, forwardRequestTime, firstResponseTime,
592 isRegion);
593}
594
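// Common completion path for reads and writes: touch the MRU state of the
// relevant cache, record latency statistics, and then service every packet
// coalesced on this line, copying response data into load/atomic packets
// and store data into the response DataBlock, before handing all packets
// back through completeHitCallback().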
595void
596GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
597 MachineType mach,
598 DataBlock& data,
599 bool success,
600 Cycles initialRequestTime,
601 Cycles forwardRequestTime,
602 Cycles firstResponseTime,
603 bool isRegion)
604{
605 PacketPtr pkt = srequest->pkt;
606 Addr request_address = pkt->getAddr();
607 Addr request_line_address = makeLineAddress(request_address);
608
609 RubyRequestType type = srequest->m_type;
610
611 // Set this cache entry to the most recently used
612 if (type == RubyRequestType_IFETCH) {
613 if (m_instCache_ptr->isTagPresent(request_line_address))
614 m_instCache_ptr->setMRU(request_line_address);
615 } else {
616 if (m_dataCache_ptr->isTagPresent(request_line_address))
617 m_dataCache_ptr->setMRU(request_line_address);
618 }
619
620 recordMissLatency(srequest, mach,
621 initialRequestTime,
622 forwardRequestTime,
623 firstResponseTime,
624 success, isRegion);
625 // update the data
626 //
627 // NOTE: this must be done for each request in the coalescer
628 int len = reqCoalescer[request_line_address].size();
629 std::vector<PacketPtr> mylist;
630 for (int i = 0; i < len; ++i) {
631 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
632 assert(type == reqCoalescer[request_line_address][i].primaryType);
633 request_address = pkt->getAddr();
634 request_line_address = makeLineAddress(pkt->getAddr());
635 if (pkt->getPtr<uint8_t>()) {
636 if ((type == RubyRequestType_LD) ||
637 (type == RubyRequestType_ATOMIC) ||
638 (type == RubyRequestType_ATOMIC_RETURN) ||
639 (type == RubyRequestType_IFETCH) ||
640 (type == RubyRequestType_RMW_Read) ||
641 (type == RubyRequestType_Locked_RMW_Read) ||
642 (type == RubyRequestType_Load_Linked)) {
643 memcpy(pkt->getPtr<uint8_t>(),
644 data.getData(getOffset(request_address),
645 pkt->getSize()),
646 pkt->getSize());
647 } else {
648 data.setData(pkt->getPtr<uint8_t>(),
649 getOffset(request_address), pkt->getSize());
650 }
651 } else {
652 DPRINTF(MemoryAccess,
653 "WARNING. Data not transfered from Ruby to M5 for type " \
654 "%s\n",
655 RubyRequestType_to_string(type));
656 }
657
658 // If using the RubyTester, update the RubyTester sender state's
659 // subBlock with the received data. The tester will later access
660 // this state.
661 // Note: RubyPort will access its sender state before the
662 // RubyTester.
663 if (m_usingRubyTester) {
664 RubyPort::SenderState *requestSenderState =
665 safe_cast<RubyPort::SenderState*>(pkt->senderState);
666 RubyTester::SenderState* testerSenderState =
667 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
668 testerSenderState->subBlock.mergeFrom(data);
669 }
670
671 mylist.push_back(pkt);
672 }
673 delete srequest;
674 reqCoalescer.erase(request_line_address);
675 assert(!reqCoalescer.count(request_line_address));
676
677
678
679 completeHitCallback(mylist, len);
680}
681
682bool
683GPUCoalescer::empty() const
684{
685 return m_writeRequestTable.empty() && m_readRequestTable.empty();
686}
687
688// Analyzes the packet to see if this request can be coalesced.
689// If the request can be coalesced, it is added to the reqCoalescer table
690// and makeRequest returns RequestStatus_Issued.
691// If this is the first request to a cacheline, the request is added to both
692// the newRequests queue and the reqCoalescer table; makeRequest
693// returns RequestStatus_Issued.
694// If there is a pending request to this cacheline and this request
695// can't be coalesced, RequestStatus_Aliased is returned and
696// the packet needs to be reissued.
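// Sketch of the coalescing rules above (hypothetical addresses, assuming a
// 64-byte Ruby line): two loads to 0x1000 and 0x1008 arriving in the same
// cycle map to line 0x1000, so the second load is simply appended to
// reqCoalescer[0x1000] and a single request is issued for the line; a store
// to 0x1010 in that same cycle has a different primary type than the
// pending loads and would instead get RequestStatus_Aliased.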
697RequestStatus
698GPUCoalescer::makeRequest(PacketPtr pkt)
699{
700 // Check for GPU Barrier Kernel End or Kernel Begin
701 // Leave these to be handled by the child class
702 // Kernel End/Barrier = isFlush + isRelease
703 // Kernel Begin = isFlush + isAcquire
704 if (pkt->req->isKernel()) {
705 if (pkt->req->isAcquire()) {
706 // This is a Kernel Begin; leave handling to the
707 // virtual xCoalescer::makeRequest.
708 return RequestStatus_Issued;
709 } else if (pkt->req->isRelease()) {
710 // This is a Kernel End; leave handling to the
711 // virtual xCoalescer::makeRequest.
712 // If we are here then we didn't call a virtual
713 // version of this function, so we will also
714 // schedule the callback.
715 int wf_id = 0;
716 if (pkt->req->hasContextId()) {
717 wf_id = pkt->req->contextId();
718 }
719 insertKernel(wf_id, pkt);
720 newKernelEnds.push_back(wf_id);
721 if (!issueEvent.scheduled()) {
722 schedule(issueEvent, curTick());
723 }
724 return RequestStatus_Issued;
725 }
726 }
727
728 // If number of outstanding requests greater than the max allowed,
729 // return RequestStatus_BufferFull. This logic can be extended to
730 // support proper backpressure.
731 if (m_outstanding_count >= m_max_outstanding_requests) {
732 return RequestStatus_BufferFull;
733 }
734
735 RubyRequestType primary_type = RubyRequestType_NULL;
736 RubyRequestType secondary_type = RubyRequestType_NULL;
737
738 if (pkt->isLLSC()) {
739 //
740 // Alpha LL/SC instructions need to be handled carefully by the cache
741 // coherence protocol to ensure they follow the proper semantics. In
742 // particular, by identifying the operations as atomic, the protocol
743 // should understand that migratory sharing optimizations should not
744 // be performed (i.e. a load between the LL and SC should not steal
745 // away exclusive permission).
746 //
747 if (pkt->isWrite()) {
748 primary_type = RubyRequestType_Store_Conditional;
749 } else {
750 assert(pkt->isRead());
751 primary_type = RubyRequestType_Load_Linked;
752 }
753 secondary_type = RubyRequestType_ATOMIC;
754 } else if (pkt->req->isLockedRMW()) {
755 //
756 // x86 locked instructions are translated to store cache coherence
757 // requests because these requests should always be treated as read
758 // exclusive operations and should leverage any migratory sharing
759 // optimization built into the protocol.
760 //
761 if (pkt->isWrite()) {
762 primary_type = RubyRequestType_Locked_RMW_Write;
763 } else {
764 assert(pkt->isRead());
765 primary_type = RubyRequestType_Locked_RMW_Read;
766 }
767 secondary_type = RubyRequestType_ST;
768 } else if (pkt->isAtomicOp()) {
769 //
770 // GPU Atomic Operation
771 //
772 primary_type = RubyRequestType_ATOMIC;
773 secondary_type = RubyRequestType_ATOMIC;
774 } else {
775 if (pkt->isRead()) {
776 if (pkt->req->isInstFetch()) {
777 primary_type = secondary_type = RubyRequestType_IFETCH;
778 } else {
779#if THE_ISA == X86_ISA
780 uint32_t flags = pkt->req->getFlags();
781 bool storeCheck = flags &
782 (TheISA::StoreCheck << TheISA::FlagShift);
783#else
784 bool storeCheck = false;
785#endif // X86_ISA
786 if (storeCheck) {
787 primary_type = RubyRequestType_RMW_Read;
788 secondary_type = RubyRequestType_ST;
789 } else {
790 primary_type = secondary_type = RubyRequestType_LD;
791 }
792 }
793 } else if (pkt->isWrite()) {
794 //
795 // Note: M5 packets do not differentiate ST from RMW_Write
796 //
797 primary_type = secondary_type = RubyRequestType_ST;
798 } else if (pkt->isFlush()) {
799 primary_type = secondary_type = RubyRequestType_FLUSH;
800 } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
801 if (assumingRfOCoherence) {
802 // If we reached here, this request must be a memFence.
803 // Because the protocol implements RfO, the coalescer can
804 // assume sequential consistency and schedule the callback
805 // immediately.
806 // Currently the code implements fence callbacks
807 // by reusing the mechanism for kernel completions.
808 // This should be fixed.
809 int wf_id = 0;
810 if (pkt->req->hasContextId()) {
811 wf_id = pkt->req->contextId();
812 }
813 insertKernel(wf_id, pkt);
814 newKernelEnds.push_back(wf_id);
815 if (!issueEvent.scheduled()) {
816 schedule(issueEvent, curTick());
817 }
818 return RequestStatus_Issued;
819 } else {
820 // If not RfO, return issued here and let the child coalescer
821 // take care of it.
822 return RequestStatus_Issued;
823 }
824 } else {
825 panic("Unsupported ruby packet type\n");
826 }
827 }
828
829 // Check if there is any pending request to this cache line from
830 // previous cycles.
831 // If there is a pending request, return aliased. Since coalescing
832 // across time is not permitted, aliased requests are not coalesced.
833 // If a request for this address has already been issued, we must block
834 RequestStatus status = getRequestStatus(pkt, primary_type);
835 if (status != RequestStatus_Ready)
836 return status;
837
838 Addr line_addr = makeLineAddress(pkt->getAddr());
839
840 // Check if this request can be coalesced with previous
841 // requests from this cycle.
842 if (!reqCoalescer.count(line_addr)) {
843 // This is the first access to this cache line.
844 // A new request to the memory subsystem has to be
845 // made in the next cycle for this cache line, so
846 // add this line addr to the "newRequests" queue
847 newRequests.push_back(line_addr);
848
849 // There was a request to this cache line in this cycle,
850 // let us see if we can coalesce this request with the previous
851 // requests from this cycle
852 } else if (primary_type !=
853 reqCoalescer[line_addr][0].primaryType) {
854 // can't coalesce loads, stores and atomics!
855 return RequestStatus_Aliased;
856 } else if (pkt->req->isLockedRMW() ||
857 reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
858 // can't coalesce locked accesses, but can coalesce atomics!
859 return RequestStatus_Aliased;
860 } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
861 pkt->req->contextId() !=
862 reqCoalescer[line_addr][0].pkt->req->contextId()) {
863 // can't coalesce releases from different wavefronts
864 return RequestStatus_Aliased;
865 }
866
867 // in addition to the packet, we need to save both request types
868 reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
869 if (!issueEvent.scheduled())
870 schedule(issueEvent, curTick());
871 // TODO: issue hardware prefetches here
872 return RequestStatus_Issued;
873}
874
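// Build and enqueue a single RubyRequest for a coalesced cache line: walk
// every packet recorded in reqCoalescer for the line, mark the bytes it
// touches in the access mask, gather store data into a DataBlock and atomic
// operations into atomicOps, and then enqueue the message on the mandatory
// queue with the data cache hit latency.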
875void
876GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
877{
878
879 int proc_id = -1;
880 if (pkt != NULL && pkt->req->hasContextId()) {
881 proc_id = pkt->req->contextId();
882 }
883
884 // If valid, copy the pc to the ruby request
885 Addr pc = 0;
886 if (pkt->req->hasPC()) {
887 pc = pkt->req->getPC();
888 }
889
890 // At the moment, setting scopes only matters for GPU spill
891 // space accesses, which are identified by pkt->req->isStack().
892 // That scope is REPLACE since spill data does not need to be
893 // flushed at the end of a kernel, whereas private and local
894 // data may need to remain visible at the end of the kernel;
895 // the scope and segment are derived from the request below.
896 //
897 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
898 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
899
900 Addr line_addr = makeLineAddress(pkt->getAddr());
901
902 // Creating WriteMask that records written bytes
903 // and atomic operations. This enables partial writes
904 // and partial reads of those writes
905 DataBlock dataBlock;
906 dataBlock.clear();
907 uint32_t blockSize = RubySystem::getBlockSizeBytes();
908 std::vector<bool> accessMask(blockSize,false);
909 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
910 uint32_t tableSize = reqCoalescer[line_addr].size();
911 for (int i = 0; i < tableSize; i++) {
912 PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
913 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
914 uint32_t tmpSize = tmpPkt->getSize();
915 if (tmpPkt->isAtomicOp()) {
916 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
917 tmpPkt->getAtomicOp());
918 atomicOps.push_back(tmpAtomicOp);
919 } else if (tmpPkt->isWrite()) {
920 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
921 tmpOffset, tmpSize);
922 }
923 for (int j = 0; j < tmpSize; j++) {
924 accessMask[tmpOffset + j] = true;
925 }
926 }
927 std::shared_ptr<RubyRequest> msg;
928 if (pkt->isAtomicOp()) {
929 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
930 pkt->getPtr<uint8_t>(),
931 pkt->getSize(), pc, secondary_type,
932 RubyAccessMode_Supervisor, pkt,
933 PrefetchBit_No, proc_id, 100,
934 blockSize, accessMask,
935 dataBlock, atomicOps,
936 accessScope, accessSegment);
937 } else {
938 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
939 pkt->getPtr<uint8_t>(),
940 pkt->getSize(), pc, secondary_type,
941 RubyAccessMode_Supervisor, pkt,
942 PrefetchBit_No, proc_id, 100,
943 blockSize, accessMask,
944 dataBlock,
945 accessScope, accessSegment);
946 }
947 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
948 curTick(), m_version, "Coal", "Begin", "", "",
949 printAddress(msg->getPhysicalAddress()),
950 RubyRequestType_to_string(secondary_type));
951
952 fatal_if(secondary_type == RubyRequestType_IFETCH,
953 "there should not be any I-Fetch requests in the GPU Coalescer");
954
955 // Send the message to the cache controller
956 fatal_if(m_data_cache_hit_latency == 0,
957 "should not have a latency of zero");
958
959 assert(m_mandatory_q_ptr);
960 m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
961}
962
963template <class KEY, class VALUE>
964std::ostream &
965operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
966{
967 out << "[";
968 for (auto i = map.begin(); i != map.end(); ++i)
969 out << " " << i->first << "=" << i->second;
970 out << " ]";
971
972 return out;
973}
974
975void
976GPUCoalescer::print(ostream& out) const
977{
978 out << "[GPUCoalescer: " << m_version
979 << ", outstanding requests: " << m_outstanding_count
980 << ", read request table: " << m_readRequestTable
981 << ", write request table: " << m_writeRequestTable
982 << "]";
983}
984
985// this can be called from setState whenever coherence permissions are
986// upgraded when invoked, coherence violations will be checked for the
987// given block
988void
989GPUCoalescer::checkCoherence(Addr addr)
990{
991#ifdef CHECK_COHERENCE
992 m_ruby_system->checkGlobalCoherenceInvariant(addr);
993#endif
994}
995
996void
997GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
998 DPRINTF(RubyStats, "Recorded statistic: %s\n",
999 SequencerRequestType_to_string(requestType));
1000}
1001
1002
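// issueEvent handler: for every cache line queued in newRequests this
// cycle, insert the first coalesced packet into the read/write request
// table and issue one RubyRequest for the whole line, then run the kernel
// completion callbacks collected in newKernelEnds.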
1003void
1004GPUCoalescer::completeIssue()
1005{
1006 // newRequests has the cacheline addresses of all the
1007 // requests which need to be issued to the memory subsystem
1008 // in this cycle
1009 int len = newRequests.size();
1010 DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1011 for (int i = 0; i < len; ++i) {
1012 // Get the requests from reqCoalescer table. Get only the
1013 // first request for each cacheline, the remaining requests
1014 // can be coalesced with the first request. So, only
1015 // one request is issued per cacheline.
1016 RequestDesc info = reqCoalescer[newRequests[i]][0];
1017 PacketPtr pkt = info.pkt;
1018 DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1019 i, pkt->req->getPaddr());
1020 // Insert this request to the read/writeRequestTables. These tables
1021 // are used to track aliased requests in makeRequest subroutine
1022 bool found = insertRequest(pkt, info.primaryType);
1023
1024 if (found) {
1025 panic("GPUCoalescer::makeRequest should never be called if the "
1026 "request is already outstanding\n");
1027 }
1028
1029 // Issue request to ruby subsystem
1030 issueRequest(pkt, info.secondaryType);
1031 }
1032 newRequests.clear();
1033
1034 // have Kernel End releases been issued this cycle
1035 len = newKernelEnds.size();
1036 for (int i = 0; i < len; i++) {
1037 kernelCallback(newKernelEnds[i]);
1038 }
1039 newKernelEnds.clear();
1040}
1041
1042void
1043GPUCoalescer::evictionCallback(Addr address)
1044{
1045 ruby_eviction_callback(address);
1046}
1047
1048void
1049GPUCoalescer::kernelCallback(int wavefront_id)
1050{
1051 assert(kernelEndList.count(wavefront_id));
1052
1053 ruby_hit_callback(kernelEndList[wavefront_id]);
1054
1055 kernelEndList.erase(wavefront_id);
1056}
1057
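// Completion path for GPU atomics: analogous to readCallback/writeCallback,
// but the entry lives in the write request table, there is no MRU update,
// and the returned (pre-op) data is copied into every coalesced packet
// unless the request was ATOMIC_NO_RETURN.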
1058void
1059GPUCoalescer::atomicCallback(Addr address,
1060 MachineType mach,
1061 const DataBlock& data)
1062{
1063 assert(address == makeLineAddress(address));
1064
1065 DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1066 assert(m_writeRequestTable.count(makeLineAddress(address)));
1067
1068 RequestTable::iterator i = m_writeRequestTable.find(address);
1069 assert(i != m_writeRequestTable.end());
1070 GPUCoalescerRequest* srequest = i->second;
1071
1072 m_writeRequestTable.erase(i);
1073 markRemoved();
1074
1075 assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1076 (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1077 (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1078
1079
1080 // Atomics don't write to cache, so there is no MRU update...
1081
1082 recordMissLatency(srequest, mach,
1083 srequest->issue_time, Cycles(0), Cycles(0), true, false);
1084
1085 PacketPtr pkt = srequest->pkt;
1086 Addr request_address = pkt->getAddr();
1087 Addr request_line_address = makeLineAddress(pkt->getAddr());
1088
1089 int len = reqCoalescer[request_line_address].size();
1090 std::vector<PacketPtr> mylist;
1091 for (int i = 0; i < len; ++i) {
1092 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1093 assert(srequest->m_type ==
1094 reqCoalescer[request_line_address][i].primaryType);
1095 request_address = (pkt->getAddr());
1096 request_line_address = makeLineAddress(request_address);
1097 if (pkt->getPtr<uint8_t>() &&
1098 srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1099 /* atomics are done in memory, and return the data *before* the atomic op... */
1100 memcpy(pkt->getPtr<uint8_t>(),
1101 data.getData(getOffset(request_address),
1102 pkt->getSize()),
1103 pkt->getSize());
1104 } else {
1105 DPRINTF(MemoryAccess,
1106 "WARNING. Data not transfered from Ruby to M5 for type " \
1107 "%s\n",
1108 RubyRequestType_to_string(srequest->m_type));
1109 }
1110
1111 // If using the RubyTester, update the RubyTester sender state's
1112 // subBlock with the received data. The tester will later access
1113 // this state.
1114 // Note: RubyPort will access its sender state before the
1115 // RubyTester.
1116 if (m_usingRubyTester) {
1117 RubyPort::SenderState *requestSenderState =
1118 safe_cast<RubyPort::SenderState*>(pkt->senderState);
1119 RubyTester::SenderState* testerSenderState =
1120 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1121 testerSenderState->subBlock.mergeFrom(data);
1122 }
1123
1124 mylist.push_back(pkt);
1125 }
1126 delete srequest;
1127 reqCoalescer.erase(request_line_address);
1128 assert(!reqCoalescer.count(request_line_address));
1129
1130 completeHitCallback(mylist, len);
1131}
1132
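// Command-processor statistics helpers: classify a CP access as a TCP hit
// when the sender is this machine, a TCP-to-TCP transfer when it is another
// TCP, a TCC hit when it is a TCC, and a miss otherwise.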
1133void
1134GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1135{
1136 if (myMachID == senderMachID) {
1137 CP_TCPLdHits++;
1138 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1139 CP_TCPLdTransfers++;
1140 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1141 CP_TCCLdHits++;
1142 } else {
1143 CP_LdMiss++;
1144 }
1145}
1146
1147void
1148GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1149{
1150 if (myMachID == senderMachID) {
1151 CP_TCPStHits++;
1152 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1153 CP_TCPStTransfers++;
1154 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1155 CP_TCCStHits++;
1156 } else {
1157 CP_StMiss++;
1158 }
1159}
1160
1161void
1162GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1163{
1164 for (int i = 0; i < len; ++i) {
1165 RubyPort::SenderState *ss =
1166 safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1167 MemSlavePort *port = ss->port;
1168 assert(port != NULL);
1169
1170 mylist[i]->senderState = ss->predecessor;
1171 delete ss;
1172 port->hitCallback(mylist[i]);
1173 trySendRetries();
1174 }
1175
1176 testDrainComplete();
1177}
1178
1179PacketPtr
1180GPUCoalescer::mapAddrToPkt(Addr address)
1181{
1182 RequestTable::iterator i = m_readRequestTable.find(address);
1183 assert(i != m_readRequestTable.end());
1184 GPUCoalescerRequest* request = i->second;
1185 return request->pkt;
1186}
1187
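// Bucket the completed request into the GPU hit/transfer/miss counters
// based on which machine type supplied the data (TCP, L1Cache_wCC, TCC, or
// anything else), sample the latency histograms, and, when the per-hop
// timestamps are monotonic, sample the per-stage delay histograms as well.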
1188void
1189GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1190 MachineType mach,
1191 Cycles initialRequestTime,
1192 Cycles forwardRequestTime,
1193 Cycles firstResponseTime,
1194 bool success, bool isRegion)
1195{
1196 RubyRequestType type = srequest->m_type;
1197 Cycles issued_time = srequest->issue_time;
1198 Cycles completion_time = curCycle();
1199 assert(completion_time >= issued_time);
1200 Cycles total_lat = completion_time - issued_time;
1201
1202 // cache stats (valid for RfO protocol only)
1203 if (mach == MachineType_TCP) {
1204 if (type == RubyRequestType_LD) {
1205 GPU_TCPLdHits++;
1206 } else {
1207 GPU_TCPStHits++;
1208 }
1209 } else if (mach == MachineType_L1Cache_wCC) {
1210 if (type == RubyRequestType_LD) {
1211 GPU_TCPLdTransfers++;
1212 } else {
1213 GPU_TCPStTransfers++;
1214 }
1215 } else if (mach == MachineType_TCC) {
1216 if (type == RubyRequestType_LD) {
1217 GPU_TCCLdHits++;
1218 } else {
1219 GPU_TCCStHits++;
1220 }
1221 } else {
1222 if (type == RubyRequestType_LD) {
1223 GPU_LdMiss++;
1224 } else {
1225 GPU_StMiss++;
1226 }
1227 }
1228
1229 // Profile all access latency, even zero latency accesses
1230 m_latencyHist.sample(total_lat);
1231 m_typeLatencyHist[type]->sample(total_lat);
1232
1233 // Profile the miss latency for all non-zero demand misses
1234 if (total_lat != Cycles(0)) {
1235 m_missLatencyHist.sample(total_lat);
1236 m_missTypeLatencyHist[type]->sample(total_lat);
1237
1238 if (mach != MachineType_NUM) {
1239 m_missMachLatencyHist[mach]->sample(total_lat);
1240 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1241
1242 if ((issued_time <= initialRequestTime) &&
1243 (initialRequestTime <= forwardRequestTime) &&
1244 (forwardRequestTime <= firstResponseTime) &&
1245 (firstResponseTime <= completion_time)) {
1246
1247 m_IssueToInitialDelayHist[mach]->sample(
1248 initialRequestTime - issued_time);
1249 m_InitialToForwardDelayHist[mach]->sample(
1250 forwardRequestTime - initialRequestTime);
1251 m_ForwardToFirstResponseDelayHist[mach]->sample(
1252 firstResponseTime - forwardRequestTime);
1253 m_FirstResponseToCompletionDelayHist[mach]->sample(
1254 completion_time - firstResponseTime);
1255 }
1256 }
1257
1258 }
1259
1260 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1261 curTick(), m_version, "Coal",
1262 success ? "Done" : "SC_Failed", "", "",
1263 printAddress(srequest->pkt->getAddr()), total_lat);
1264}
1265
1266void
1267GPUCoalescer::regStats()
1268{
1269 RubyPort::regStats();
1270
1271 // These statistical variables are not for display.
1272 // The profiler will collate these across different
1273 // coalescers and display those collated statistics.
1274 m_outstandReqHist.init(10);
1275 m_latencyHist.init(10);
1276 m_missLatencyHist.init(10);
1277
1278 for (int i = 0; i < RubyRequestType_NUM; i++) {
1279 m_typeLatencyHist.push_back(new Stats::Histogram());
1280 m_typeLatencyHist[i]->init(10);
1281
1282 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1283 m_missTypeLatencyHist[i]->init(10);
1284 }
1285
1286 for (int i = 0; i < MachineType_NUM; i++) {
1287 m_missMachLatencyHist.push_back(new Stats::Histogram());
1288 m_missMachLatencyHist[i]->init(10);
1289
1290 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1291 m_IssueToInitialDelayHist[i]->init(10);
1292
1293 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1294 m_InitialToForwardDelayHist[i]->init(10);
1295
1296 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1297 m_ForwardToFirstResponseDelayHist[i]->init(10);
1298
1299 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1300 m_FirstResponseToCompletionDelayHist[i]->init(10);
1301 }
1302
1303 for (int i = 0; i < RubyRequestType_NUM; i++) {
1304 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1305
1306 for (int j = 0; j < MachineType_NUM; j++) {
1307 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1308 m_missTypeMachLatencyHist[i][j]->init(10);
1309 }
1310 }
1311
1312 // GPU cache stats
1313 GPU_TCPLdHits
1314 .name(name() + ".gpu_tcp_ld_hits")
1315 .desc("loads that hit in the TCP")
1316 ;
1317 GPU_TCPLdTransfers
1318 .name(name() + ".gpu_tcp_ld_transfers")
1319 .desc("TCP to TCP load transfers")
1320 ;
1321 GPU_TCCLdHits
1322 .name(name() + ".gpu_tcc_ld_hits")
1323 .desc("loads that hit in the TCC")
1324 ;
1325 GPU_LdMiss
1326 .name(name() + ".gpu_ld_misses")
1327 .desc("loads that miss in the GPU")
1328 ;
1329
1330 GPU_TCPStHits
1331 .name(name() + ".gpu_tcp_st_hits")
1332 .desc("stores that hit in the TCP")
1333 ;
1334 GPU_TCPStTransfers
1335 .name(name() + ".gpu_tcp_st_transfers")
1336 .desc("TCP to TCP store transfers")
1337 ;
1338 GPU_TCCStHits
1339 .name(name() + ".gpu_tcc_st_hits")
1340 .desc("stores that hit in the TCC")
1341 ;
1342 GPU_StMiss
1343 .name(name() + ".gpu_st_misses")
1344 .desc("stores that miss in the GPU")
1345 ;
1346
1347 // CP cache stats
1348 CP_TCPLdHits
1349 .name(name() + ".cp_tcp_ld_hits")
1350 .desc("loads that hit in the TCP")
1351 ;
1352 CP_TCPLdTransfers
1353 .name(name() + ".cp_tcp_ld_transfers")
1354 .desc("TCP to TCP load transfers")
1355 ;
1356 CP_TCCLdHits
1357 .name(name() + ".cp_tcc_ld_hits")
1358 .desc("loads that hit in the TCC")
1359 ;
1360 CP_LdMiss
1361 .name(name() + ".cp_ld_misses")
1362 .desc("loads that miss in the GPU")
1363 ;
1364
1365 CP_TCPStHits
1366 .name(name() + ".cp_tcp_st_hits")
1367 .desc("stores that hit in the TCP")
1368 ;
1369 CP_TCPStTransfers
1370 .name(name() + ".cp_tcp_st_transfers")
1371 .desc("TCP to TCP store transfers")
1372 ;
1373 CP_TCCStHits
1374 .name(name() + ".cp_tcc_st_hits")
1375 .desc("stores that hit in the TCC")
1376 ;
1377 CP_StMiss
1378 .name(name() + ".cp_st_misses")
1379 .desc("stores that miss in the GPU")
1380 ;
1381}
461}
462
463void
464GPUCoalescer::writeCallback(Addr address,
465 MachineType mach,
466 DataBlock& data)
467{
468 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
469}
470
471void
472GPUCoalescer::writeCallback(Addr address,
473 MachineType mach,
474 DataBlock& data,
475 Cycles initialRequestTime,
476 Cycles forwardRequestTime,
477 Cycles firstResponseTime)
478{
479 writeCallback(address, mach, data,
480 initialRequestTime, forwardRequestTime, firstResponseTime,
481 false);
482}
483
484void
485GPUCoalescer::writeCallback(Addr address,
486 MachineType mach,
487 DataBlock& data,
488 Cycles initialRequestTime,
489 Cycles forwardRequestTime,
490 Cycles firstResponseTime,
491 bool isRegion)
492{
493 assert(address == makeLineAddress(address));
494
495 DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
496 assert(m_writeRequestTable.count(makeLineAddress(address)));
497
498 RequestTable::iterator i = m_writeRequestTable.find(address);
499 assert(i != m_writeRequestTable.end());
500 GPUCoalescerRequest* request = i->second;
501
502 m_writeRequestTable.erase(i);
503 markRemoved();
504
505 assert((request->m_type == RubyRequestType_ST) ||
506 (request->m_type == RubyRequestType_ATOMIC) ||
507 (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
508 (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
509 (request->m_type == RubyRequestType_RMW_Read) ||
510 (request->m_type == RubyRequestType_RMW_Write) ||
511 (request->m_type == RubyRequestType_Load_Linked) ||
512 (request->m_type == RubyRequestType_Store_Conditional) ||
513 (request->m_type == RubyRequestType_Locked_RMW_Read) ||
514 (request->m_type == RubyRequestType_Locked_RMW_Write) ||
515 (request->m_type == RubyRequestType_FLUSH));
516
517
518 //
519 // For Alpha, properly handle LL, SC, and write requests with respect to
520 // locked cache blocks.
521 //
522 // Not valid for the Garnet_standalone protocol
523 //
524 bool success = true;
525 if (!m_runningGarnetStandalone)
526 success = handleLlsc(address, request);
527
528 if (request->m_type == RubyRequestType_Locked_RMW_Read) {
529 m_controller->blockOnQueue(address, m_mandatory_q_ptr);
530 } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
531 m_controller->unblock(address);
532 }
533
534 hitCallback(request, mach, data, success,
535 request->issue_time, forwardRequestTime, firstResponseTime,
536 isRegion);
537}
538
539void
540GPUCoalescer::readCallback(Addr address, DataBlock& data)
541{
542 readCallback(address, MachineType_NULL, data);
543}
544
545void
546GPUCoalescer::readCallback(Addr address,
547 MachineType mach,
548 DataBlock& data)
549{
550 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
551}
552
553void
554GPUCoalescer::readCallback(Addr address,
555 MachineType mach,
556 DataBlock& data,
557 Cycles initialRequestTime,
558 Cycles forwardRequestTime,
559 Cycles firstResponseTime)
560{
561
562 readCallback(address, mach, data,
563 initialRequestTime, forwardRequestTime, firstResponseTime,
564 false);
565}
566
567void
568GPUCoalescer::readCallback(Addr address,
569 MachineType mach,
570 DataBlock& data,
571 Cycles initialRequestTime,
572 Cycles forwardRequestTime,
573 Cycles firstResponseTime,
574 bool isRegion)
575{
576 assert(address == makeLineAddress(address));
577 assert(m_readRequestTable.count(makeLineAddress(address)));
578
579 DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
580 RequestTable::iterator i = m_readRequestTable.find(address);
581 assert(i != m_readRequestTable.end());
582 GPUCoalescerRequest* request = i->second;
583
584 m_readRequestTable.erase(i);
585 markRemoved();
586
587 assert((request->m_type == RubyRequestType_LD) ||
588 (request->m_type == RubyRequestType_IFETCH));
589
590 hitCallback(request, mach, data, true,
591 request->issue_time, forwardRequestTime, firstResponseTime,
592 isRegion);
593}
594
595void
596GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
597 MachineType mach,
598 DataBlock& data,
599 bool success,
600 Cycles initialRequestTime,
601 Cycles forwardRequestTime,
602 Cycles firstResponseTime,
603 bool isRegion)
604{
605 PacketPtr pkt = srequest->pkt;
606 Addr request_address = pkt->getAddr();
607 Addr request_line_address = makeLineAddress(request_address);
608
609 RubyRequestType type = srequest->m_type;
610
611 // Set this cache entry to the most recently used
612 if (type == RubyRequestType_IFETCH) {
613 if (m_instCache_ptr->isTagPresent(request_line_address))
614 m_instCache_ptr->setMRU(request_line_address);
615 } else {
616 if (m_dataCache_ptr->isTagPresent(request_line_address))
617 m_dataCache_ptr->setMRU(request_line_address);
618 }
619
620 recordMissLatency(srequest, mach,
621 initialRequestTime,
622 forwardRequestTime,
623 firstResponseTime,
624 success, isRegion);
625 // Update the data.
626 //
627 // This must be done for every request coalesced onto this cache line.
628 int len = reqCoalescer[request_line_address].size();
629 std::vector<PacketPtr> mylist;
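// Read-like requests (loads, ifetches, atomics that return data, and
// RMW/locked/linked reads) copy bytes out of the Ruby DataBlock into
// the packet; writes copy the packet's bytes into the DataBlock.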
630 for (int i = 0; i < len; ++i) {
631 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
632 assert(type == reqCoalescer[request_line_address][i].primaryType);
633 request_address = pkt->getAddr();
634 request_line_address = makeLineAddress(pkt->getAddr());
635 if (pkt->getPtr<uint8_t>()) {
636 if ((type == RubyRequestType_LD) ||
637 (type == RubyRequestType_ATOMIC) ||
638 (type == RubyRequestType_ATOMIC_RETURN) ||
639 (type == RubyRequestType_IFETCH) ||
640 (type == RubyRequestType_RMW_Read) ||
641 (type == RubyRequestType_Locked_RMW_Read) ||
642 (type == RubyRequestType_Load_Linked)) {
643 memcpy(pkt->getPtr<uint8_t>(),
644 data.getData(getOffset(request_address),
645 pkt->getSize()),
646 pkt->getSize());
647 } else {
648 data.setData(pkt->getPtr<uint8_t>(),
649 getOffset(request_address), pkt->getSize());
650 }
651 } else {
652 DPRINTF(MemoryAccess,
653 "WARNING. Data not transfered from Ruby to M5 for type " \
654 "%s\n",
655 RubyRequestType_to_string(type));
656 }
657
658 // If using the RubyTester, update the RubyTester sender state's
659 // subBlock with the received data. The tester will later access
660 // this state.
661 // Note: RubyPort will access its sender state before the
662 // RubyTester.
663 if (m_usingRubyTester) {
664 RubyPort::SenderState *requestSenderState =
665 safe_cast<RubyPort::SenderState*>(pkt->senderState);
666 RubyTester::SenderState* testerSenderState =
667 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
668 testerSenderState->subBlock.mergeFrom(data);
669 }
670
671 mylist.push_back(pkt);
672 }
673 delete srequest;
674 reqCoalescer.erase(request_line_address);
675 assert(!reqCoalescer.count(request_line_address));
676
677
678
679 completeHitCallback(mylist, len);
680}
681
682bool
683GPUCoalescer::empty() const
684{
685 return m_writeRequestTable.empty() && m_readRequestTable.empty();
686}
687
688 // Analyzes the packet to see if this request can be coalesced.
689 // If the request can be coalesced, it is added to the reqCoalescer table
690 // and makeRequest returns RequestStatus_Issued.
691 // If this is the first request to a cacheline, the request is added to
692 // both the newRequests queue and the reqCoalescer table; makeRequest
693 // returns RequestStatus_Issued.
694 // If there is a pending request to this cacheline and this request
695 // can't be coalesced, RequestStatus_Aliased is returned and
696 // the packet needs to be reissued.
697RequestStatus
698GPUCoalescer::makeRequest(PacketPtr pkt)
699{
700 // Check for GPU Barrier Kernel End or Kernel Begin
701 // Leave these to be handled by the child class
702 // Kernel End/Barrier = isFlush + isRelease
703 // Kernel Begin = isFlush + isAcquire
704 if (pkt->req->isKernel()) {
705 if (pkt->req->isAcquire()) {
706 // This is a Kernel Begin leave handling to
707 // virtual xCoalescer::makeRequest
708 return RequestStatus_Issued;
709 } else if (pkt->req->isRelease()) {
710 // This is a Kernel End leave handling to
711 // virtual xCoalescer::makeRequest
712 // If we are here then we didn't call
713 // a virtual version of this function
714 // so we will also schedule the callback
715 int wf_id = 0;
716 if (pkt->req->hasContextId()) {
717 wf_id = pkt->req->contextId();
718 }
719 insertKernel(wf_id, pkt);
720 newKernelEnds.push_back(wf_id);
721 if (!issueEvent.scheduled()) {
722 schedule(issueEvent, curTick());
723 }
724 return RequestStatus_Issued;
725 }
726 }
727
728 // If the number of outstanding requests is greater than the max
729 // allowed, return RequestStatus_BufferFull. This logic can be
730 // extended to support proper backpressure.
731 if (m_outstanding_count >= m_max_outstanding_requests) {
732 return RequestStatus_BufferFull;
733 }
734
735 RubyRequestType primary_type = RubyRequestType_NULL;
736 RubyRequestType secondary_type = RubyRequestType_NULL;
737
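// primary_type is what the coalescer tracks in its request tables and
// checks for aliasing; secondary_type is what is actually sent to the
// Ruby protocol when the request is issued.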
738 if (pkt->isLLSC()) {
739 //
740 // Alpha LL/SC instructions need to be handled carefully by the cache
741 // coherence protocol to ensure they follow the proper semantics. In
742 // particular, by identifying the operations as atomic, the protocol
743 // should understand that migratory sharing optimizations should not
744 // be performed (i.e. a load between the LL and SC should not steal
745 // away exclusive permission).
746 //
747 if (pkt->isWrite()) {
748 primary_type = RubyRequestType_Store_Conditional;
749 } else {
750 assert(pkt->isRead());
751 primary_type = RubyRequestType_Load_Linked;
752 }
753 secondary_type = RubyRequestType_ATOMIC;
754 } else if (pkt->req->isLockedRMW()) {
755 //
756 // x86 locked instructions are translated to store cache coherence
757 // requests because these requests should always be treated as read
758 // exclusive operations and should leverage any migratory sharing
759 // optimization built into the protocol.
760 //
761 if (pkt->isWrite()) {
762 primary_type = RubyRequestType_Locked_RMW_Write;
763 } else {
764 assert(pkt->isRead());
765 primary_type = RubyRequestType_Locked_RMW_Read;
766 }
767 secondary_type = RubyRequestType_ST;
768 } else if (pkt->isAtomicOp()) {
769 //
770 // GPU Atomic Operation
771 //
772 primary_type = RubyRequestType_ATOMIC;
773 secondary_type = RubyRequestType_ATOMIC;
774 } else {
775 if (pkt->isRead()) {
776 if (pkt->req->isInstFetch()) {
777 primary_type = secondary_type = RubyRequestType_IFETCH;
778 } else {
779#if THE_ISA == X86_ISA
780 uint32_t flags = pkt->req->getFlags();
781 bool storeCheck = flags &
782 (TheISA::StoreCheck << TheISA::FlagShift);
783#else
784 bool storeCheck = false;
785#endif // X86_ISA
786 if (storeCheck) {
787 primary_type = RubyRequestType_RMW_Read;
788 secondary_type = RubyRequestType_ST;
789 } else {
790 primary_type = secondary_type = RubyRequestType_LD;
791 }
792 }
793 } else if (pkt->isWrite()) {
794 //
795 // Note: M5 packets do not differentiate ST from RMW_Write
796 //
797 primary_type = secondary_type = RubyRequestType_ST;
798 } else if (pkt->isFlush()) {
799 primary_type = secondary_type = RubyRequestType_FLUSH;
800 } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
801 if (assumingRfOCoherence) {
802 // If we reached here, this request must be a memFence.
803 // Since the protocol implements RfO, the coalescer can
804 // assume sequential consistency and schedule the callback
805 // immediately.
806 // Currently the code implements fence callbacks
807 // by reusing the mechanism for kernel completions.
808 // This should be fixed.
809 int wf_id = 0;
810 if (pkt->req->hasContextId()) {
811 wf_id = pkt->req->contextId();
812 }
813 insertKernel(wf_id, pkt);
814 newKernelEnds.push_back(wf_id);
815 if (!issueEvent.scheduled()) {
816 schedule(issueEvent, curTick());
817 }
818 return RequestStatus_Issued;
819 } else {
820 // If not RfO, return issued here and let the child coalescer
821 // take care of it.
822 return RequestStatus_Issued;
823 }
824 } else {
825 panic("Unsupported ruby packet type\n");
826 }
827 }
828
829 // Check if there is any pending request to this cache line from
830 // previous cycles.
831 // If there is a pending request, return aliased. Since coalescing
832 // across time is not permitted, aliased requests are not coalesced.
833 // If a request for this address has already been issued, we must block
834 RequestStatus status = getRequestStatus(pkt, primary_type);
835 if (status != RequestStatus_Ready)
836 return status;
837
838 Addr line_addr = makeLineAddress(pkt->getAddr());
839
840 // Check if this request can be coalesced with previous
841 // requests from this cycle.
842 if (!reqCoalescer.count(line_addr)) {
843 // This is the first access to this cache line.
844 // A new request to the memory subsystem has to be
845 // made in the next cycle for this cache line, so
846 // add this line addr to the "newRequests" queue
847 newRequests.push_back(line_addr);
848
849 // There was a request to this cache line in this cycle,
850 // let us see if we can coalesce this request with the previous
851 // requests from this cycle
852 } else if (primary_type !=
853 reqCoalescer[line_addr][0].primaryType) {
854 // can't coalesce loads, stores and atomics!
855 return RequestStatus_Aliased;
856 } else if (pkt->req->isLockedRMW() ||
857 reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
858 // can't coalesce locked accesses, but can coalesce atomics!
859 return RequestStatus_Aliased;
860 } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
861 pkt->req->contextId() !=
862 reqCoalescer[line_addr][0].pkt->req->contextId()) {
863 // can't coalesce releases from different wavefronts
864 return RequestStatus_Aliased;
865 }
866
867 // in addition to the packet, we need to save both request types
868 reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
869 if (!issueEvent.scheduled())
870 schedule(issueEvent, curTick());
871 // TODO: issue hardware prefetches here
872 return RequestStatus_Issued;
873}
874
875void
876GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
877{
878
879 int proc_id = -1;
880 if (pkt != NULL && pkt->req->hasContextId()) {
881 proc_id = pkt->req->contextId();
882 }
883
884 // If valid, copy the pc to the ruby request
885 Addr pc = 0;
886 if (pkt->req->hasPC()) {
887 pc = pkt->req->getPC();
888 }
889
890 // At the moment, setting scopes only matters for GPU spill space
891 // accesses, i.e. when pkt->req->isStack() is true.
892 // That scope is REPLACE, since spill data does not need to be
893 // flushed at the end of a kernel.
894 // Private and local data, in contrast, may need to be visible
895 // at the end of the kernel. The segment and scope are derived
896 // below from the request's fields.
897 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
898 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
899
900 Addr line_addr = makeLineAddress(pkt->getAddr());
901
902 // Create a WriteMask that records written bytes
903 // and atomic operations. This enables partial writes
904 // and partial reads of those writes.
905 DataBlock dataBlock;
906 dataBlock.clear();
907 uint32_t blockSize = RubySystem::getBlockSizeBytes();
908 std::vector<bool> accessMask(blockSize,false);
909 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
910 uint32_t tableSize = reqCoalescer[line_addr].size();
911 for (int i = 0; i < tableSize; i++) {
912 PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
913 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
914 uint32_t tmpSize = tmpPkt->getSize();
915 if (tmpPkt->isAtomicOp()) {
916 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
917 tmpPkt->getAtomicOp());
918 atomicOps.push_back(tmpAtomicOp);
919 } else if (tmpPkt->isWrite()) {
920 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
921 tmpOffset, tmpSize);
922 }
923 for (int j = 0; j < tmpSize; j++) {
924 accessMask[tmpOffset + j] = true;
925 }
926 }
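// Build a single RubyRequest for the whole cache line; atomic packets
// additionally carry the (offset, AtomicOpFunctor*) pairs collected
// above.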
927 std::shared_ptr<RubyRequest> msg;
928 if (pkt->isAtomicOp()) {
929 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
930 pkt->getPtr<uint8_t>(),
931 pkt->getSize(), pc, secondary_type,
932 RubyAccessMode_Supervisor, pkt,
933 PrefetchBit_No, proc_id, 100,
934 blockSize, accessMask,
935 dataBlock, atomicOps,
936 accessScope, accessSegment);
937 } else {
938 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
939 pkt->getPtr<uint8_t>(),
940 pkt->getSize(), pc, secondary_type,
941 RubyAccessMode_Supervisor, pkt,
942 PrefetchBit_No, proc_id, 100,
943 blockSize, accessMask,
944 dataBlock,
945 accessScope, accessSegment);
946 }
947 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
948 curTick(), m_version, "Coal", "Begin", "", "",
949 printAddress(msg->getPhysicalAddress()),
950 RubyRequestType_to_string(secondary_type));
951
952 fatal_if(secondary_type == RubyRequestType_IFETCH,
953 "there should not be any I-Fetch requests in the GPU Coalescer");
954
955 // Send the message to the cache controller
956 fatal_if(m_data_cache_hit_latency == 0,
957 "should not have a latency of zero");
958
959 assert(m_mandatory_q_ptr);
960 m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
961}
962
963template <class KEY, class VALUE>
964std::ostream &
965operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
966{
967 out << "[";
968 for (auto i = map.begin(); i != map.end(); ++i)
969 out << " " << i->first << "=" << i->second;
970 out << " ]";
971
972 return out;
973}
974
975void
976GPUCoalescer::print(ostream& out) const
977{
978 out << "[GPUCoalescer: " << m_version
979 << ", outstanding requests: " << m_outstanding_count
980 << ", read request table: " << m_readRequestTable
981 << ", write request table: " << m_writeRequestTable
982 << "]";
983}
984
985 // This can be called from setState whenever coherence permissions are
986 // upgraded. When invoked, coherence violations will be checked for the
987 // given block.
988void
989GPUCoalescer::checkCoherence(Addr addr)
990{
991#ifdef CHECK_COHERENCE
992 m_ruby_system->checkGlobalCoherenceInvariant(addr);
993#endif
994}
995
996void
997GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
998 DPRINTF(RubyStats, "Recorded statistic: %s\n",
999 SequencerRequestType_to_string(requestType));
1000}
1001
1002
1003void
1004GPUCoalescer::completeIssue()
1005{
1006 // newRequests has the cacheline addresses of all the
1007 // requests which need to be issued to the memory subsystem
1008 // in this cycle
1009 int len = newRequests.size();
1010 DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1011 for (int i = 0; i < len; ++i) {
1012 // Get the requests from reqCoalescer table. Get only the
1013 // first request for each cacheline, the remaining requests
1014 // can be coalesced with the first request. So, only
1015 // one request is issued per cacheline.
1016 RequestDesc info = reqCoalescer[newRequests[i]][0];
1017 PacketPtr pkt = info.pkt;
1018 DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1019 i, pkt->req->getPaddr());
1020 // Insert this request to the read/writeRequestTables. These tables
1021 // are used to track aliased requests in makeRequest subroutine
1022 bool found = insertRequest(pkt, info.primaryType);
1023
1024 if (found) {
1025 panic("GPUCoalescer::makeRequest should never be called if the "
1026 "request is already outstanding\n");
1027 }
1028
1029 // Issue request to ruby subsystem
1030 issueRequest(pkt, info.secondaryType);
1031 }
1032 newRequests.clear();
1033
1034 // Handle any kernel-end releases issued this cycle
1035 len = newKernelEnds.size();
1036 for (int i = 0; i < len; i++) {
1037 kernelCallback(newKernelEnds[i]);
1038 }
1039 newKernelEnds.clear();
1040}
1041
1042void
1043GPUCoalescer::evictionCallback(Addr address)
1044{
1045 ruby_eviction_callback(address);
1046}
1047
1048void
1049GPUCoalescer::kernelCallback(int wavefront_id)
1050{
1051 assert(kernelEndList.count(wavefront_id));
1052
1053 ruby_hit_callback(kernelEndList[wavefront_id]);
1054
1055 kernelEndList.erase(wavefront_id);
1056}
1057
1058void
1059GPUCoalescer::atomicCallback(Addr address,
1060 MachineType mach,
1061 const DataBlock& data)
1062{
1063 assert(address == makeLineAddress(address));
1064
1065 DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1066 assert(m_writeRequestTable.count(makeLineAddress(address)));
1067
1068 RequestTable::iterator i = m_writeRequestTable.find(address);
1069 assert(i != m_writeRequestTable.end());
1070 GPUCoalescerRequest* srequest = i->second;
1071
1072 m_writeRequestTable.erase(i);
1073 markRemoved();
1074
1075 assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1076 (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1077 (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1078
1079
1080 // Atomics don't write to cache, so there is no MRU update...
1081
1082 recordMissLatency(srequest, mach,
1083 srequest->issue_time, Cycles(0), Cycles(0), true, false);
1084
1085 PacketPtr pkt = srequest->pkt;
1086 Addr request_address = pkt->getAddr();
1087 Addr request_line_address = makeLineAddress(pkt->getAddr());
1088
1089 int len = reqCoalescer[request_line_address].size();
1090 std::vector<PacketPtr> mylist;
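// As in hitCallback, copy the line data back into every coalesced
// packet. The returned data is the value *before* the atomic was
// applied, and no-return atomics skip the copy entirely.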
1091 for (int i = 0; i < len; ++i) {
1092 PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1093 assert(srequest->m_type ==
1094 reqCoalescer[request_line_address][i].primaryType);
1095 request_address = (pkt->getAddr());
1096 request_line_address = makeLineAddress(request_address);
1097 if (pkt->getPtr<uint8_t>() &&
1098 srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1099 /* atomics are done in memory, and return the data *before* the atomic op... */
1100 memcpy(pkt->getPtr<uint8_t>(),
1101 data.getData(getOffset(request_address),
1102 pkt->getSize()),
1103 pkt->getSize());
1104 } else {
1105 DPRINTF(MemoryAccess,
1106 "WARNING. Data not transfered from Ruby to M5 for type " \
1107 "%s\n",
1108 RubyRequestType_to_string(srequest->m_type));
1109 }
1110
1111 // If using the RubyTester, update the RubyTester sender state's
1112 // subBlock with the received data. The tester will later access
1113 // this state.
1114 // Note: RubyPort will access its sender state before the
1115 // RubyTester.
1116 if (m_usingRubyTester) {
1117 RubyPort::SenderState *requestSenderState =
1118 safe_cast<RubyPort::SenderState*>(pkt->senderState);
1119 RubyTester::SenderState* testerSenderState =
1120 safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1121 testerSenderState->subBlock.mergeFrom(data);
1122 }
1123
1124 mylist.push_back(pkt);
1125 }
1126 delete srequest;
1127 reqCoalescer.erase(request_line_address);
1128 assert(!reqCoalescer.count(request_line_address));
1129
1130 completeHitCallback(mylist, len);
1131}
1132
1133void
1134GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1135{
1136 if (myMachID == senderMachID) {
1137 CP_TCPLdHits++;
1138 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1139 CP_TCPLdTransfers++;
1140 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1141 CP_TCCLdHits++;
1142 } else {
1143 CP_LdMiss++;
1144 }
1145}
1146
1147void
1148GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1149{
1150 if (myMachID == senderMachID) {
1151 CP_TCPStHits++;
1152 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1153 CP_TCPStTransfers++;
1154 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1155 CP_TCCStHits++;
1156 } else {
1157 CP_StMiss++;
1158 }
1159}
1160
1161void
1162GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1163{
1164 for (int i = 0; i < len; ++i) {
1165 RubyPort::SenderState *ss =
1166 safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1167 MemSlavePort *port = ss->port;
1168 assert(port != NULL);
1169
1170 mylist[i]->senderState = ss->predecessor;
1171 delete ss;
1172 port->hitCallback(mylist[i]);
1173 trySendRetries();
1174 }
1175
1176 testDrainComplete();
1177}
1178
1179PacketPtr
1180GPUCoalescer::mapAddrToPkt(Addr address)
1181{
1182 RequestTable::iterator i = m_readRequestTable.find(address);
1183 assert(i != m_readRequestTable.end());
1184 GPUCoalescerRequest* request = i->second;
1185 return request->pkt;
1186}
1187
1188void
1189GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1190 MachineType mach,
1191 Cycles initialRequestTime,
1192 Cycles forwardRequestTime,
1193 Cycles firstResponseTime,
1194 bool success, bool isRegion)
1195{
1196 RubyRequestType type = srequest->m_type;
1197 Cycles issued_time = srequest->issue_time;
1198 Cycles completion_time = curCycle();
1199 assert(completion_time >= issued_time);
1200 Cycles total_lat = completion_time - issued_time;
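// total_lat spans from when the request was inserted into the request
// tables (issue_time) until this callback completes (curCycle()).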
1201
1202 // cache stats (valid for RfO protocol only)
1203 if (mach == MachineType_TCP) {
1204 if (type == RubyRequestType_LD) {
1205 GPU_TCPLdHits++;
1206 } else {
1207 GPU_TCPStHits++;
1208 }
1209 } else if (mach == MachineType_L1Cache_wCC) {
1210 if (type == RubyRequestType_LD) {
1211 GPU_TCPLdTransfers++;
1212 } else {
1213 GPU_TCPStTransfers++;
1214 }
1215 } else if (mach == MachineType_TCC) {
1216 if (type == RubyRequestType_LD) {
1217 GPU_TCCLdHits++;
1218 } else {
1219 GPU_TCCStHits++;
1220 }
1221 } else {
1222 if (type == RubyRequestType_LD) {
1223 GPU_LdMiss++;
1224 } else {
1225 GPU_StMiss++;
1226 }
1227 }
1228
1229 // Profile all access latency, even zero latency accesses
1230 m_latencyHist.sample(total_lat);
1231 m_typeLatencyHist[type]->sample(total_lat);
1232
1233 // Profile the miss latency for all non-zero demand misses
1234 if (total_lat != Cycles(0)) {
1235 m_missLatencyHist.sample(total_lat);
1236 m_missTypeLatencyHist[type]->sample(total_lat);
1237
1238 if (mach != MachineType_NUM) {
1239 m_missMachLatencyHist[mach]->sample(total_lat);
1240 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1241
1242 if ((issued_time <= initialRequestTime) &&
1243 (initialRequestTime <= forwardRequestTime) &&
1244 (forwardRequestTime <= firstResponseTime) &&
1245 (firstResponseTime <= completion_time)) {
1246
1247 m_IssueToInitialDelayHist[mach]->sample(
1248 initialRequestTime - issued_time);
1249 m_InitialToForwardDelayHist[mach]->sample(
1250 forwardRequestTime - initialRequestTime);
1251 m_ForwardToFirstResponseDelayHist[mach]->sample(
1252 firstResponseTime - forwardRequestTime);
1253 m_FirstResponseToCompletionDelayHist[mach]->sample(
1254 completion_time - firstResponseTime);
1255 }
1256 }
1257
1258 }
1259
1260 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1261 curTick(), m_version, "Coal",
1262 success ? "Done" : "SC_Failed", "", "",
1263 printAddress(srequest->pkt->getAddr()), total_lat);
1264}
1265
1266void
1267GPUCoalescer::regStats()
1268{
1269 RubyPort::regStats();
1270
1271 // These statistical variables are not for display.
1272 // The profiler will collate these across different
1273 // coalescers and display those collated statistics.
1274 m_outstandReqHist.init(10);
1275 m_latencyHist.init(10);
1276 m_missLatencyHist.init(10);
1277
1278 for (int i = 0; i < RubyRequestType_NUM; i++) {
1279 m_typeLatencyHist.push_back(new Stats::Histogram());
1280 m_typeLatencyHist[i]->init(10);
1281
1282 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1283 m_missTypeLatencyHist[i]->init(10);
1284 }
1285
1286 for (int i = 0; i < MachineType_NUM; i++) {
1287 m_missMachLatencyHist.push_back(new Stats::Histogram());
1288 m_missMachLatencyHist[i]->init(10);
1289
1290 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1291 m_IssueToInitialDelayHist[i]->init(10);
1292
1293 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1294 m_InitialToForwardDelayHist[i]->init(10);
1295
1296 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1297 m_ForwardToFirstResponseDelayHist[i]->init(10);
1298
1299 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1300 m_FirstResponseToCompletionDelayHist[i]->init(10);
1301 }
1302
1303 for (int i = 0; i < RubyRequestType_NUM; i++) {
1304 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1305
1306 for (int j = 0; j < MachineType_NUM; j++) {
1307 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1308 m_missTypeMachLatencyHist[i][j]->init(10);
1309 }
1310 }
1311
1312 // GPU cache stats
1313 GPU_TCPLdHits
1314 .name(name() + ".gpu_tcp_ld_hits")
1315 .desc("loads that hit in the TCP")
1316 ;
1317 GPU_TCPLdTransfers
1318 .name(name() + ".gpu_tcp_ld_transfers")
1319 .desc("TCP to TCP load transfers")
1320 ;
1321 GPU_TCCLdHits
1322 .name(name() + ".gpu_tcc_ld_hits")
1323 .desc("loads that hit in the TCC")
1324 ;
1325 GPU_LdMiss
1326 .name(name() + ".gpu_ld_misses")
1327 .desc("loads that miss in the GPU")
1328 ;
1329
1330 GPU_TCPStHits
1331 .name(name() + ".gpu_tcp_st_hits")
1332 .desc("stores that hit in the TCP")
1333 ;
1334 GPU_TCPStTransfers
1335 .name(name() + ".gpu_tcp_st_transfers")
1336 .desc("TCP to TCP store transfers")
1337 ;
1338 GPU_TCCStHits
1339 .name(name() + ".gpu_tcc_st_hits")
1340 .desc("stores that hit in the TCC")
1341 ;
1342 GPU_StMiss
1343 .name(name() + ".gpu_st_misses")
1344 .desc("stores that miss in the GPU")
1345 ;
1346
1347 // CP cache stats
1348 CP_TCPLdHits
1349 .name(name() + ".cp_tcp_ld_hits")
1350 .desc("loads that hit in the TCP")
1351 ;
1352 CP_TCPLdTransfers
1353 .name(name() + ".cp_tcp_ld_transfers")
1354 .desc("TCP to TCP load transfers")
1355 ;
1356 CP_TCCLdHits
1357 .name(name() + ".cp_tcc_ld_hits")
1358 .desc("loads that hit in the TCC")
1359 ;
1360 CP_LdMiss
1361 .name(name() + ".cp_ld_misses")
1362 .desc("loads that miss in the GPU")
1363 ;
1364
1365 CP_TCPStHits
1366 .name(name() + ".cp_tcp_st_hits")
1367 .desc("stores that hit in the TCP")
1368 ;
1369 CP_TCPStTransfers
1370 .name(name() + ".cp_tcp_st_transfers")
1371 .desc("TCP to TCP store transfers")
1372 ;
1373 CP_TCCStHits
1374 .name(name() + ".cp_tcc_st_hits")
1375 .desc("stores that hit in the TCC")
1376 ;
1377 CP_StMiss
1378 .name(name() + ".cp_st_misses")
1379 .desc("stores that miss in the GPU")
1380 ;
1381}