GPUCoalescer.cc revision 11689:9d19bb965564
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "base/misc.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif // X86_ISA

#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(Request* req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(Request* req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    m_max_outstanding_requests = 0;
    m_deadlock_threshold = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_data_cache_hit_latency = p->dcache_hit_latency;

    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
{
}

void
GPUCoalescer::wakeup()
{
    // Check for deadlock of any of the requests
    Cycles current_time = curCycle();

    // Check across all outstanding requests
    int total_outstanding = 0;

    RequestTable::iterator read = m_readRequestTable.begin();
    RequestTable::iterator read_end = m_readRequestTable.end();
    for (; read != read_end; ++read) {
        GPUCoalescerRequest* request = read->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_readRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_readRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    RequestTable::iterator write = m_writeRequestTable.begin();
    RequestTable::iterator write_end = m_writeRequestTable.end();
    for (; write != write_end; ++write) {
        GPUCoalescerRequest* request = write->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

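        // Same check as for the read table above: any write request that has
        // been outstanding for more than m_deadlock_threshold cycles is
        // treated as a possible deadlock.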
        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_writeRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    total_outstanding += m_writeRequestTable.size();
    total_outstanding += m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    if (m_outstanding_count > 0) {
        // If there are still outstanding requests, keep checking
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

RequestStatus
GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
{
    Addr line_addr = makeLineAddress(pkt->getAddr());

    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
        return RequestStatus_BufferFull;
    }

    if (m_controller->isBlocked(line_addr) &&
        request_type != RubyRequestType_Locked_RMW_Write) {
        return RequestStatus_Aliased;
    }

    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        // Check if there is any outstanding read request for the same
        // cache line.
        if (m_readRequestTable.count(line_addr) > 0) {
            m_store_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_writeRequestTable.count(line_addr) > 0) {
            // There is an outstanding write request for the cache line
            m_store_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }
    } else {
        // Check if there is any outstanding write request for the same
        // cache line.
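        // (Loads stalled behind stores and behind other loads are counted
        // separately, in m_load_waiting_on_store_cycles and
        // m_load_waiting_on_load_cycles, so the two kinds of aliasing can
        // be told apart.)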
        if (m_writeRequestTable.count(line_addr) > 0) {
            m_load_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_readRequestTable.count(line_addr) > 0) {
            // There is an outstanding read request for the cache line
            m_load_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }
    }

    return RequestStatus_Ready;
}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // Don't know if this will happen or is possible,
    // but I just want to be careful and not have it become
    // a simulator hang in the future
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

// Insert the request on the correct request table. Return true if
// the entry was already present.
bool
GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
{
    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
           pkt->req->isLockedRMW() ||
           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));

    int total_outstanding M5_VAR_USED =
        m_writeRequestTable.size() + m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    // See if we should schedule a deadlock check
    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
    }

    Addr line_addr = makeLineAddress(pkt->getAddr());
    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        pair<RequestTable::iterator, bool> r =
            m_writeRequestTable.insert(RequestTable::value_type(line_addr,
                                       (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting write request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    } else {
        pair<RequestTable::iterator, bool> r =
            m_readRequestTable.insert(RequestTable::value_type(line_addr,
                                      (GPUCoalescerRequest*) NULL));

        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting read request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    }

    m_outstandReqHist.sample(m_outstanding_count);

    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
    assert(m_outstanding_count == total_outstanding);

    return false;
}

void
GPUCoalescer::markRemoved()
{
    m_outstanding_count--;
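    // The caller is responsible for erasing the entry from the read or
    // write request table, so after the decrement the outstanding count
    // must again match the combined table sizes.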
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());
}

void
GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
{
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());

    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
    if ((srequest->m_type == RubyRequestType_ST) ||
        (srequest->m_type == RubyRequestType_RMW_Read) ||
        (srequest->m_type == RubyRequestType_RMW_Write) ||
        (srequest->m_type == RubyRequestType_Load_Linked) ||
        (srequest->m_type == RubyRequestType_Store_Conditional) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
        m_writeRequestTable.erase(line_addr);
    } else {
        m_readRequestTable.erase(line_addr);
    }

    markRemoved();
}

bool
GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
{
    //
    // The success flag indicates whether the LLSC operation was successful.
    // LL ops will always succeed, but SC may fail if the cache line is no
    // longer locked.
    //
    bool success = true;
    if (request->m_type == RubyRequestType_Store_Conditional) {
        if (!m_dataCache_ptr->isLocked(address, m_version)) {
            //
            // For failed SC requests, indicate the failure to the cpu by
            // setting the extra data to zero.
            //
            request->pkt->req->setExtraData(0);
            success = false;
        } else {
            //
            // For successful SC requests, indicate the success to the cpu by
            // setting the extra data to one.
            //
            request->pkt->req->setExtraData(1);
        }
        //
        // Independent of success, all SC operations must clear the lock
        //
        m_dataCache_ptr->clearLocked(address);
    } else if (request->m_type == RubyRequestType_Load_Linked) {
        //
        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
        // previously locked cache lines?
        //
        m_dataCache_ptr->setLocked(address, m_version);
    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
               (m_dataCache_ptr->isLocked(address, m_version))) {
        //
        // Normal writes should clear the locked address
        //
        m_dataCache_ptr->clearLocked(address);
    }
    return success;
}

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_ST) ||
           (request->m_type == RubyRequestType_ATOMIC) ||
           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
           (request->m_type == RubyRequestType_RMW_Read) ||
           (request->m_type == RubyRequestType_RMW_Write) ||
           (request->m_type == RubyRequestType_Load_Linked) ||
           (request->m_type == RubyRequestType_Store_Conditional) ||
           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
           (request->m_type == RubyRequestType_FLUSH));

    //
    // For Alpha, properly handle LL, SC, and write requests with respect to
    // locked cache blocks.
    //
    // Not valid for Garnet_standalone protocol
    //
    bool success = true;
    if (!m_runningGarnetStandalone)
        success = handleLlsc(address, request);

    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
        m_controller->unblock(address);
    }

    hitCallback(request, mach, data, success,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{

    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(m_readRequestTable.count(makeLineAddress(address)));

    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_readRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_LD) ||
           (request->m_type == RubyRequestType_IFETCH));

    hitCallback(request, mach, data, true,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = srequest->m_type;

    // Set this cache entry to the most recently used
    if (type == RubyRequestType_IFETCH) {
        if (m_instCache_ptr->isTagPresent(request_line_address))
            m_instCache_ptr->setMRU(request_line_address);
    } else {
        if (m_dataCache_ptr->isTagPresent(request_line_address))
            m_dataCache_ptr->setMRU(request_line_address);
    }

    recordMissLatency(srequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // MUST DO THIS FOR EACH REQUEST IN THE COALESCER
    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(type == reqCoalescer[request_line_address][i].primaryType);
        request_address = pkt->getAddr();
        request_line_address = makeLineAddress(pkt->getAddr());
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                memcpy(pkt->getPtr<uint8_t>(),
                       data.getData(getOffset(request_address),
                                    pkt->getSize()),
                       pkt->getSize());
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

bool
GPUCoalescer::empty() const
{
    return m_writeRequestTable.empty() && m_readRequestTable.empty();
}

// Analyzes the packet to see if this request can be coalesced.
// If the request can be coalesced, it is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued.
// If this is the first request to a cacheline, the request is added to both
// the newRequests queue and the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest.
            // If we are here then we didn't call
            // a virtual version of this function,
            // so we will also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    // If the number of outstanding requests is greater than the max allowed,
    // return RequestStatus_BufferFull. This logic can be extended to
    // support proper backpressure.
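    // What follows: first a simple cap on outstanding requests, then the
    // packet is classified into a primary/secondary RubyRequestType and
    // recorded in reqCoalescer, which groups the packets (along with their
    // request types) that target the same cache line in the same cycle.
    // completeIssue() later turns each reqCoalescer entry into a single
    // Ruby request.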
    if (m_outstanding_count >= m_max_outstanding_requests) {
        return RequestStatus_BufferFull;
    }

    RubyRequestType primary_type = RubyRequestType_NULL;
    RubyRequestType secondary_type = RubyRequestType_NULL;

    if (pkt->isLLSC()) {
        //
        // Alpha LL/SC instructions need to be handled carefully by the cache
        // coherence protocol to ensure they follow the proper semantics. In
        // particular, by identifying the operations as atomic, the protocol
        // should understand that migratory sharing optimizations should not
        // be performed (i.e. a load between the LL and SC should not steal
        // away exclusive permission).
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Store_Conditional;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Load_Linked;
        }
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->req->isLockedRMW()) {
        //
        // x86 locked instructions are translated to store cache coherence
        // requests because these requests should always be treated as read
        // exclusive operations and should leverage any migratory sharing
        // optimization built into the protocol.
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Locked_RMW_Write;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Locked_RMW_Read;
        }
        secondary_type = RubyRequestType_ST;
    } else if (pkt->isAtomicOp()) {
        //
        // GPU Atomic Operation
        //
        primary_type = RubyRequestType_ATOMIC;
        secondary_type = RubyRequestType_ATOMIC;
    } else {
        if (pkt->isRead()) {
            if (pkt->req->isInstFetch()) {
                primary_type = secondary_type = RubyRequestType_IFETCH;
            } else {
#if THE_ISA == X86_ISA
                uint32_t flags = pkt->req->getFlags();
                bool storeCheck = flags &
                    (TheISA::StoreCheck << TheISA::FlagShift);
#else
                bool storeCheck = false;
#endif // X86_ISA
                if (storeCheck) {
                    primary_type = RubyRequestType_RMW_Read;
                    secondary_type = RubyRequestType_ST;
                } else {
                    primary_type = secondary_type = RubyRequestType_LD;
                }
            }
        } else if (pkt->isWrite()) {
            //
            // Note: M5 packets do not differentiate ST from RMW_Write
            //
            primary_type = secondary_type = RubyRequestType_ST;
        } else if (pkt->isFlush()) {
            primary_type = secondary_type = RubyRequestType_FLUSH;
        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
            if (assumingRfOCoherence) {
                // If we reached here, this request must be a memFence
                // and the protocol implements RfO, so the coalescer can
                // assume sequential consistency and schedule the callback
                // immediately.
                // Currently the code implements fence callbacks
                // by reusing the mechanism for kernel completions.
                // This should be fixed.
                int wf_id = 0;
                if (pkt->req->hasContextId()) {
                    wf_id = pkt->req->contextId();
                }
                insertKernel(wf_id, pkt);
                newKernelEnds.push_back(wf_id);
                if (!issueEvent.scheduled()) {
                    schedule(issueEvent, curTick());
                }
                return RequestStatus_Issued;
            } else {
                // If not RfO, return issued here and let the child coalescer
                // take care of it.
                return RequestStatus_Issued;
            }
        } else {
            panic("Unsupported ruby packet type\n");
        }
    }

    // Check if there is any pending request to this cache line from
    // previous cycles.
    // If there is a pending request, return aliased. Since coalescing
    // across time is not permitted, aliased requests are not coalesced.
    // If a request for this address has already been issued, we must block
    RequestStatus status = getRequestStatus(pkt, primary_type);
    if (status != RequestStatus_Ready)
        return status;

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Check if this request can be coalesced with previous
    // requests from this cycle.
    if (!reqCoalescer.count(line_addr)) {
        // This is the first access to this cache line.
        // A new request to the memory subsystem has to be
        // made in the next cycle for this cache line, so
        // add this line addr to the "newRequests" queue
        newRequests.push_back(line_addr);

        // There was a request to this cache line in this cycle,
        // let us see if we can coalesce this request with the previous
        // requests from this cycle
    } else if (primary_type !=
               reqCoalescer[line_addr][0].primaryType) {
        // can't coalesce loads, stores and atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->isLockedRMW() ||
               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
        // can't coalesce locked accesses, but can coalesce atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
               pkt->req->contextId() !=
               reqCoalescer[line_addr][0].pkt->req->contextId()) {
        // can't coalesce releases from different wavefronts
        return RequestStatus_Aliased;
    }

    // in addition to the packet, we need to save both request types
    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}

void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{

    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment setting scopes only counts
    // for GPU spill space accesses,
    // which is pkt->req->isStack().
    // This scope is REPLACE since it
    // does not need to be flushed at the end
    // of a kernel; Private and local may need
    // to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = reqCoalescer[line_addr].size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                          tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc, secondary_type,
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock, atomicOps,
                                            accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc, secondary_type,
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock,
                                            accessScope, accessSegment);
    }
    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));

    fatal_if(secondary_type == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    // Send the message to the cache controller
    fatal_if(m_data_cache_hit_latency == 0,
             "should not have a latency of zero");

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}

template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << ", read request table: " << m_readRequestTable
        << ", write request table: " << m_writeRequestTable
        << "]";
}

// this can be called from setState whenever coherence permissions are
// upgraded; when invoked, coherence violations will be checked for the
// given block
void
GPUCoalescer::checkCoherence(Addr addr)
{
#ifdef CHECK_COHERENCE
    m_ruby_system->checkGlobalCoherenceInvariant(addr);
#endif
}

void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
    : Event(Progress_Event_Pri), seq(_seq)
{
}

void
GPUCoalescer::completeIssue()
{
    // newRequests has the cacheline addresses of all the
    // requests which need to be issued to the memory subsystem
    // in this cycle
    int len = newRequests.size();
    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
    for (int i = 0; i < len; ++i) {
        // Get the requests from reqCoalescer table. Get only the
        // first request for each cacheline, the remaining requests
        // can be coalesced with the first request. So, only
        // one request is issued per cacheline.
        RequestDesc info = reqCoalescer[newRequests[i]][0];
        PacketPtr pkt = info.pkt;
        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
                i, pkt->req->getPaddr());
        // Insert this request to the read/writeRequestTables. These tables
        // are used to track aliased requests in makeRequest subroutine
        bool found = insertRequest(pkt, info.primaryType);

        if (found) {
            panic("GPUCoalescer::makeRequest should never be called if the "
                  "request is already outstanding\n");
        }

        // Issue request to ruby subsystem
        issueRequest(pkt, info.secondaryType);
    }
    newRequests.clear();

    // have Kernel End releases been issued this cycle?
    len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::IssueEvent::process()
{
    seq->completeIssue();
}

const char *
GPUCoalescer::IssueEvent::description() const
{
    return "Issue coalesced request";
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* srequest = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));

    // Atomics don't write to cache, so there is no MRU update...

    recordMissLatency(srequest, mach,
                      srequest->issue_time, Cycles(0), Cycles(0), true, false);

    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(pkt->getAddr());

    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(srequest->m_type ==
               reqCoalescer[request_line_address][i].primaryType);
        request_address = (pkt->getAddr());
        request_line_address = makeLineAddress(request_address);
        if (pkt->getPtr<uint8_t>() &&
            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
            /* atomics are done in memory, and return the data *before* the
             * atomic op... */
            memcpy(pkt->getPtr<uint8_t>(),
                   data.getData(getOffset(request_address),
                                pkt->getSize()),
                   pkt->getSize());
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(srequest->m_type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
{
    for (int i = 0; i < len; ++i) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        mylist[i]->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(mylist[i]);
        trySendRetries();
    }

    testDrainComplete();
}

PacketPtr
GPUCoalescer::mapAddrToPkt(Addr address)
{
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;
    return request->pkt;
}

void
GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = srequest->m_type;
    Cycles issued_time = srequest->issue_time;
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(srequest->pkt->getAddr()), total_lat);
}

void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
.desc("loads that miss in the GPU") 1342 ; 1343 1344 GPU_TCPStHits 1345 .name(name() + ".gpu_tcp_st_hits") 1346 .desc("stores that hit in the TCP") 1347 ; 1348 GPU_TCPStTransfers 1349 .name(name() + ".gpu_tcp_st_transfers") 1350 .desc("TCP to TCP store transfers") 1351 ; 1352 GPU_TCCStHits 1353 .name(name() + ".gpu_tcc_st_hits") 1354 .desc("stores that hit in the TCC") 1355 ; 1356 GPU_StMiss 1357 .name(name() + ".gpu_st_misses") 1358 .desc("stores that miss in the GPU") 1359 ; 1360 1361 // CP cache stats 1362 CP_TCPLdHits 1363 .name(name() + ".cp_tcp_ld_hits") 1364 .desc("loads that hit in the TCP") 1365 ; 1366 CP_TCPLdTransfers 1367 .name(name() + ".cp_tcp_ld_transfers") 1368 .desc("TCP to TCP load transfers") 1369 ; 1370 CP_TCCLdHits 1371 .name(name() + ".cp_tcc_ld_hits") 1372 .desc("loads that hit in the TCC") 1373 ; 1374 CP_LdMiss 1375 .name(name() + ".cp_ld_misses") 1376 .desc("loads that miss in the GPU") 1377 ; 1378 1379 CP_TCPStHits 1380 .name(name() + ".cp_tcp_st_hits") 1381 .desc("stores that hit in the TCP") 1382 ; 1383 CP_TCPStTransfers 1384 .name(name() + ".cp_tcp_st_transfers") 1385 .desc("TCP to TCP store transfers") 1386 ; 1387 CP_TCCStHits 1388 .name(name() + ".cp_tcc_st_hits") 1389 .desc("stores that hit in the TCC") 1390 ; 1391 CP_StMiss 1392 .name(name() + ".cp_st_misses") 1393 .desc("stores that miss in the GPU") 1394 ; 1395} 1396