    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
{
}

void
GPUCoalescer::wakeup()
{
    // Check for deadlock of any of the requests
    Cycles current_time = curCycle();

    // Check across all outstanding requests
    int total_outstanding = 0;

    RequestTable::iterator read = m_readRequestTable.begin();
    RequestTable::iterator read_end = m_readRequestTable.end();
    for (; read != read_end; ++read) {
        GPUCoalescerRequest* request = read->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_readRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_readRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    RequestTable::iterator write = m_writeRequestTable.begin();
    RequestTable::iterator write_end = m_writeRequestTable.end();
    for (; write != write_end; ++write) {
        GPUCoalescerRequest* request = write->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_writeRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    total_outstanding += m_writeRequestTable.size();
    total_outstanding += m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    if (m_outstanding_count > 0) {
        // If there are still outstanding requests, keep checking
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

RequestStatus
GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
{
    Addr line_addr = makeLineAddress(pkt->getAddr());

    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
        return RequestStatus_BufferFull;
    }

    if (m_controller->isBlocked(line_addr) &&
        request_type != RubyRequestType_Locked_RMW_Write) {
        return RequestStatus_Aliased;
    }

    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        // Check if there is any outstanding read request for the same
        // cache line.
        if (m_readRequestTable.count(line_addr) > 0) {
            m_store_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_writeRequestTable.count(line_addr) > 0) {
            // There is an outstanding write request for the cache line
            m_store_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }
    } else {
        // Check if there is any outstanding write request for the same
        // cache line.
        if (m_writeRequestTable.count(line_addr) > 0) {
            m_load_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_readRequestTable.count(line_addr) > 0) {
            // There is an outstanding read request for the cache line
            m_load_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }
    }

    return RequestStatus_Ready;
}


// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear whether this can ever happen, but be careful so that
    // it does not turn into a simulator hang in the future.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}


// Insert the request on the correct request table.  Return true if
// the entry was already present.
bool
GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
{
    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
           pkt->req->isLockedRMW() ||
           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));

    int total_outstanding M5_VAR_USED =
        m_writeRequestTable.size() + m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    // See if we should schedule a deadlock check
    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
    }

    Addr line_addr = makeLineAddress(pkt->getAddr());
    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        pair<RequestTable::iterator, bool> r =
            m_writeRequestTable.insert(RequestTable::value_type(line_addr,
                                       (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting write request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    } else {
        pair<RequestTable::iterator, bool> r =
            m_readRequestTable.insert(RequestTable::value_type(line_addr,
                                      (GPUCoalescerRequest*) NULL));

        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting read request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    }

    m_outstandReqHist.sample(m_outstanding_count);

    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
    assert(m_outstanding_count == total_outstanding);

    return false;
}

void
GPUCoalescer::markRemoved()
{
    m_outstanding_count--;
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());
}

void
GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
{
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());

    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
    if ((srequest->m_type == RubyRequestType_ST) ||
        (srequest->m_type == RubyRequestType_RMW_Read) ||
        (srequest->m_type == RubyRequestType_RMW_Write) ||
        (srequest->m_type == RubyRequestType_Load_Linked) ||
        (srequest->m_type == RubyRequestType_Store_Conditional) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
        m_writeRequestTable.erase(line_addr);
    } else {
        m_readRequestTable.erase(line_addr);
    }

    markRemoved();
}

bool
GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
{
    //
    // The success flag indicates whether the LLSC operation was successful.
    // LL ops will always succeed, but SC may fail if the cache line is no
    // longer locked.
    //
    bool success = true;
    if (request->m_type == RubyRequestType_Store_Conditional) {
        if (!m_dataCache_ptr->isLocked(address, m_version)) {
            //
            // For failed SC requests, indicate the failure to the cpu by
            // setting the extra data to zero.
            //
            request->pkt->req->setExtraData(0);
            success = false;
        } else {
            //
            // For successful SC requests, indicate the success to the cpu by
            // setting the extra data to one.
            //
            request->pkt->req->setExtraData(1);
        }
        //
        // Independent of success, all SC operations must clear the lock
        //
        m_dataCache_ptr->clearLocked(address);
    } else if (request->m_type == RubyRequestType_Load_Linked) {
        //
        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
        // previously locked cache lines?
        //
        m_dataCache_ptr->setLocked(address, m_version);
    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
               (m_dataCache_ptr->isLocked(address, m_version))) {
        //
        // Normal writes should clear the locked address
        //
        m_dataCache_ptr->clearLocked(address);
    }
    return success;
}

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_ST) ||
           (request->m_type == RubyRequestType_ATOMIC) ||
           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
           (request->m_type == RubyRequestType_RMW_Read) ||
           (request->m_type == RubyRequestType_RMW_Write) ||
           (request->m_type == RubyRequestType_Load_Linked) ||
           (request->m_type == RubyRequestType_Store_Conditional) ||
           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
           (request->m_type == RubyRequestType_FLUSH));


    //
    // For Alpha, properly handle LL, SC, and write requests with respect to
    // locked cache blocks.
    //
    // Not valid for the Garnet_standalone protocol
    //
    bool success = true;
    if (!m_runningGarnetStandalone)
        success = handleLlsc(address, request);

    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
        m_controller->unblock(address);
    }

    hitCallback(request, mach, data, success,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(m_readRequestTable.count(makeLineAddress(address)));

    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_readRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_LD) ||
           (request->m_type == RubyRequestType_IFETCH));

    hitCallback(request, mach, data, true,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = srequest->m_type;

    // Set this cache entry to the most recently used
    if (type == RubyRequestType_IFETCH) {
        if (m_instCache_ptr->isTagPresent(request_line_address))
            m_instCache_ptr->setMRU(request_line_address);
    } else {
        if (m_dataCache_ptr->isTagPresent(request_line_address))
            m_dataCache_ptr->setMRU(request_line_address);
    }

    recordMissLatency(srequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // Update the data.
    //
    // NOTE: this must be done for each request in the coalescer.
    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(type == reqCoalescer[request_line_address][i].primaryType);
        request_address = pkt->getAddr();
        request_line_address = makeLineAddress(pkt->getAddr());
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

bool
GPUCoalescer::empty() const
{
    return m_writeRequestTable.empty() && m_readRequestTable.empty();
}

// Analyzes the packet to see if this request can be coalesced.
// If the request can be coalesced, it is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued.
// If this is the first request to a cacheline, the request is added to both
// the newRequests queue and the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin.
    // Leave these to be handled by the child class.
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest.
            // If we are here then we didn't call
            // a virtual version of this function,
            // so we will also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    // If the number of outstanding requests is greater than the max allowed,
    // return RequestStatus_BufferFull. This logic can be extended to
    // support proper backpressure.
    if (m_outstanding_count >= m_max_outstanding_requests) {
        return RequestStatus_BufferFull;
    }

    RubyRequestType primary_type = RubyRequestType_NULL;
    RubyRequestType secondary_type = RubyRequestType_NULL;

    if (pkt->isLLSC()) {
        //
        // Alpha LL/SC instructions need to be handled carefully by the cache
        // coherence protocol to ensure they follow the proper semantics.  In
        // particular, by identifying the operations as atomic, the protocol
        // should understand that migratory sharing optimizations should not
        // be performed (i.e. a load between the LL and SC should not steal
        // away exclusive permission).
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Store_Conditional;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Load_Linked;
        }
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->req->isLockedRMW()) {
        //
        // x86 locked instructions are translated to store cache coherence
        // requests because these requests should always be treated as read
        // exclusive operations and should leverage any migratory sharing
        // optimization built into the protocol.
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Locked_RMW_Write;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Locked_RMW_Read;
        }
        secondary_type = RubyRequestType_ST;
    } else if (pkt->isAtomicOp()) {
        //
        // GPU Atomic Operation
        //
        primary_type = RubyRequestType_ATOMIC;
        secondary_type = RubyRequestType_ATOMIC;
    } else {
        if (pkt->isRead()) {
            if (pkt->req->isInstFetch()) {
                primary_type = secondary_type = RubyRequestType_IFETCH;
            } else {
#if THE_ISA == X86_ISA
                uint32_t flags = pkt->req->getFlags();
                bool storeCheck = flags &
                        (TheISA::StoreCheck << TheISA::FlagShift);
#else
                bool storeCheck = false;
#endif // X86_ISA
                if (storeCheck) {
                    primary_type = RubyRequestType_RMW_Read;
                    secondary_type = RubyRequestType_ST;
                } else {
                    primary_type = secondary_type = RubyRequestType_LD;
                }
            }
        } else if (pkt->isWrite()) {
            //
            // Note: M5 packets do not differentiate ST from RMW_Write
            //
            primary_type = secondary_type = RubyRequestType_ST;
        } else if (pkt->isFlush()) {
            primary_type = secondary_type = RubyRequestType_FLUSH;
        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
            if (assumingRfOCoherence) {
                // If we reached here, this request must be a memFence
                // and the protocol implements RfO; the coalescer can
                // assume sequential consistency and schedule the callback
                // immediately.
                // Currently the code implements fence callbacks
                // by reusing the mechanism for kernel completions.
                // This should be fixed.
                int wf_id = 0;
                if (pkt->req->hasContextId()) {
                    wf_id = pkt->req->contextId();
                }
                insertKernel(wf_id, pkt);
                newKernelEnds.push_back(wf_id);
                if (!issueEvent.scheduled()) {
                    schedule(issueEvent, curTick());
                }
                return RequestStatus_Issued;
            } else {
                // If not RfO, return issued here and let the child coalescer
                // take care of it.
                return RequestStatus_Issued;
            }
        } else {
            panic("Unsupported ruby packet type\n");
        }
    }

    // Check if there is any pending request to this cache line from
    // previous cycles.
    // If there is a pending request, return aliased. Since coalescing
    // across time is not permitted, aliased requests are not coalesced.
    // If a request for this address has already been issued, we must block.
    RequestStatus status = getRequestStatus(pkt, primary_type);
    if (status != RequestStatus_Ready)
        return status;

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Check if this request can be coalesced with previous
    // requests from this cycle.
    if (!reqCoalescer.count(line_addr)) {
        // This is the first access to this cache line.
        // A new request to the memory subsystem has to be
        // made in the next cycle for this cache line, so
        // add this line addr to the "newRequests" queue.
        newRequests.push_back(line_addr);

        // There was a request to this cache line in this cycle;
        // let us see if we can coalesce this request with the previous
        // requests from this cycle.
    } else if (primary_type !=
               reqCoalescer[line_addr][0].primaryType) {
        // can't coalesce loads, stores and atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->isLockedRMW() ||
               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
        // can't coalesce locked accesses, but can coalesce atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
               pkt->req->contextId() !=
               reqCoalescer[line_addr][0].pkt->req->contextId()) {
        // can't coalesce releases from different wavefronts
        return RequestStatus_Aliased;
    }

    // In addition to the packet, we need to save both request types.
    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}

void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{
    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment, setting scopes only counts for GPU spill space
    // accesses, i.e. pkt->req->isStack().  This scope is REPLACE since it
    // does not need to be flushed at the end of a kernel.  Private and
    // local may need to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Create a WriteMask that records written bytes
    // and atomic operations.  This enables partial writes
    // and partial reads of those writes.
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = reqCoalescer[line_addr].size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                          tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc, secondary_type,
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock, atomicOps,
                                            accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc, secondary_type,
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock,
                                            accessScope, accessSegment);
    }
    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));

    fatal_if(secondary_type == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");
}

template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << ", read request table: " << m_readRequestTable
        << ", write request table: " << m_writeRequestTable
        << "]";
}

// This can be called from setState whenever coherence permissions are
// upgraded.  When invoked, coherence violations will be checked for the
// given block.
void
GPUCoalescer::checkCoherence(Addr addr)
{
#ifdef CHECK_COHERENCE
    m_ruby_system->checkGlobalCoherenceInvariant(addr);
#endif
}

void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}


void
GPUCoalescer::completeIssue()
{
    // newRequests has the cacheline addresses of all the
    // requests which need to be issued to the memory subsystem
    // in this cycle
    int len = newRequests.size();
    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
    for (int i = 0; i < len; ++i) {
        // Get the requests from the reqCoalescer table.  Get only the
        // first request for each cacheline; the remaining requests
        // can be coalesced with the first one, so only
        // one request is issued per cacheline.
        RequestDesc info = reqCoalescer[newRequests[i]][0];
        PacketPtr pkt = info.pkt;
        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
                i, pkt->req->getPaddr());
        // Insert this request into the read/writeRequestTables.  These tables
        // are used to track aliased requests in the makeRequest subroutine.
        bool found = insertRequest(pkt, info.primaryType);

        if (found) {
            panic("GPUCoalescer::makeRequest should never be called if the "
                  "request is already outstanding\n");
        }

        // Issue request to ruby subsystem
        issueRequest(pkt, info.secondaryType);
    }
    newRequests.clear();

    // Have any Kernel End releases been issued this cycle?
    len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* srequest = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));


    // Atomics don't write to cache, so there is no MRU update...

    recordMissLatency(srequest, mach,
                      srequest->issue_time, Cycles(0), Cycles(0), true, false);

    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(pkt->getAddr());

    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(srequest->m_type ==
               reqCoalescer[request_line_address][i].primaryType);
        request_address = (pkt->getAddr());
        request_line_address = makeLineAddress(request_address);
        if (pkt->getPtr<uint8_t>() &&
            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
            /* atomics are done in memory, and return the data *before* the
               atomic op... */
            pkt->setData(
                data.getData(getOffset(request_address), pkt->getSize()));
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(srequest->m_type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
{
    for (int i = 0; i < len; ++i) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        mylist[i]->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(mylist[i]);
        trySendRetries();
    }

    testDrainComplete();
}

PacketPtr
GPUCoalescer::mapAddrToPkt(Addr address)
{
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;
    return request->pkt;
}

void
GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = srequest->m_type;
    Cycles issued_time = srequest->issue_time;
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);
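        // MachineType_NUM appears to serve as a "responding machine unknown"
        // sentinel here, so the per-machine and per-interval histograms
        // below are only sampled when a specific responder was reported.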
        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(srequest->pkt->getAddr()), total_lat);
}

void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
".cp_tcp_ld_hits") 1346 .desc("loads that hit in the TCP") 1347 ; 1348 CP_TCPLdTransfers 1349 .name(name() + ".cp_tcp_ld_transfers") 1350 .desc("TCP to TCP load transfers") 1351 ; 1352 CP_TCCLdHits 1353 .name(name() + ".cp_tcc_ld_hits") 1354 .desc("loads that hit in the TCC") 1355 ; 1356 CP_LdMiss 1357 .name(name() + ".cp_ld_misses") 1358 .desc("loads that miss in the GPU") 1359 ; 1360 1361 CP_TCPStHits 1362 .name(name() + ".cp_tcp_st_hits") 1363 .desc("stores that hit in the TCP") 1364 ; 1365 CP_TCPStTransfers 1366 .name(name() + ".cp_tcp_st_transfers") 1367 .desc("TCP to TCP store transfers") 1368 ; 1369 CP_TCCStHits 1370 .name(name() + ".cp_tcc_st_hits") 1371 .desc("stores that hit in the TCC") 1372 ; 1373 CP_StMiss 1374 .name(name() + ".cp_st_misses") 1375 .desc("stores that miss in the GPU") 1376 ; 1377}