// tlb_coalescer.cc revision 11523
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: Lisa Hsu 34 */ 35 36#include "gpu-compute/tlb_coalescer.hh" 37 38#include <cstring> 39 40#include "debug/GPUTLB.hh" 41 42TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p), 43 clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), 44 coalescingWindow(p->coalescingWindow), 45 disableCoalescing(p->disableCoalescing), probeTLBEvent(this), 46 cleanupEvent(this) 47{ 48 // create the slave ports based on the number of connected ports 49 for (size_t i = 0; i < p->port_slave_connection_count; ++i) { 50 cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), 51 this, i)); 52 } 53 54 // create the master ports based on the number of connected ports 55 for (size_t i = 0; i < p->port_master_connection_count; ++i) { 56 memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), 57 this, i)); 58 } 59} 60 61BaseSlavePort& 62TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) 63{ 64 if (if_name == "slave") { 65 if (idx >= static_cast<PortID>(cpuSidePort.size())) { 66 panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); 67 } 68 69 return *cpuSidePort[idx]; 70 } else { 71 panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); 72 } 73} 74 75BaseMasterPort& 76TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) 77{ 78 if (if_name == "master") { 79 if (idx >= static_cast<PortID>(memSidePort.size())) { 80 panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); 81 } 82 83 return *memSidePort[idx]; 84 } else { 85 panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); 86 } 87} 88 89/* 90 * This method returns true if the <incoming_pkt> 91 * can be coalesced with <coalesced_pkt> and false otherwise. 92 * A given set of rules is checked. 93 * The rules can potentially be modified based on the TLB level. 
94 */ 95bool 96TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) 97{ 98 if (disableCoalescing) 99 return false; 100 101 TheISA::GpuTLB::TranslationState *incoming_state = 102 safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState); 103 104 TheISA::GpuTLB::TranslationState *coalesced_state = 105 safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState); 106 107 // Rule 1: Coalesce requests only if they 108 // fall within the same virtual page 109 Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), 110 TheISA::PageBytes); 111 112 Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), 113 TheISA::PageBytes); 114 115 if (incoming_virt_page_addr != coalesced_virt_page_addr) 116 return false; 117 118 //* Rule 2: Coalesce requests only if they 119 // share a TLB Mode, i.e. they are both read 120 // or write requests. 121 BaseTLB::Mode incoming_mode = incoming_state->tlbMode; 122 BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; 123 124 if (incoming_mode != coalesced_mode) 125 return false; 126 127 // when we can coalesce a packet update the reqCnt 128 // that is the number of packets represented by 129 // this coalesced packet 130 if (!incoming_state->prefetch) 131 coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); 132 133 return true; 134} 135 136/* 137 * We need to update the physical addresses of all the translation requests 138 * that were coalesced into the one that just returned. 139 */ 140void 141TLBCoalescer::updatePhysAddresses(PacketPtr pkt) 142{ 143 Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); 144 145 DPRINTF(GPUTLB, "Update phys. addr. 
for %d coalesced reqs for page %#x\n", 146 issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); 147 148 TheISA::GpuTLB::TranslationState *sender_state = 149 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 150 151 TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; 152 assert(tlb_entry); 153 Addr first_entry_vaddr = tlb_entry->vaddr; 154 Addr first_entry_paddr = tlb_entry->paddr; 155 int page_size = tlb_entry->size(); 156 bool uncacheable = tlb_entry->uncacheable; 157 int first_hit_level = sender_state->hitLevel; 158 bool valid = tlb_entry->valid; 159 160 // Get the physical page address of the translated request 161 // Using the page_size specified in the TLBEntry allows us 162 // to support different page sizes. 163 Addr phys_page_paddr = pkt->req->getPaddr(); 164 phys_page_paddr &= ~(page_size - 1); 165 166 for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { 167 PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; 168 TheISA::GpuTLB::TranslationState *sender_state = 169 safe_cast<TheISA::GpuTLB::TranslationState*>( 170 local_pkt->senderState); 171 172 // we are sending the packet back, so pop the reqCnt associated 173 // with this level in the TLB hiearchy 174 if (!sender_state->prefetch) 175 sender_state->reqCnt.pop_back(); 176 177 /* 178 * Only the first packet from this coalesced request has been 179 * translated. Grab the translated phys. page addr and update the 180 * physical addresses of the remaining packets with the appropriate 181 * page offsets. 182 */ 183 if (i) { 184 Addr paddr = phys_page_paddr; 185 paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); 186 local_pkt->req->setPaddr(paddr); 187 188 if (uncacheable) 189 local_pkt->req->setFlags(Request::UNCACHEABLE); 190 191 // update senderState->tlbEntry, so we can insert 192 // the correct TLBEentry in the TLBs above. 
193 sender_state->tlbEntry = 194 new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, 195 valid); 196 197 // update the hitLevel for all uncoalesced reqs 198 // so that each packet knows where it hit 199 // (used for statistics in the CUs) 200 sender_state->hitLevel = first_hit_level; 201 } 202 203 SlavePort *return_port = sender_state->ports.back(); 204 sender_state->ports.pop_back(); 205 206 // Translation is done - Convert to a response pkt if necessary and 207 // send the translation back 208 if (local_pkt->isRequest()) { 209 local_pkt->makeTimingResponse(); 210 } 211 212 return_port->sendTimingResp(local_pkt); 213 } 214 215 // schedule clean up for end of this cycle 216 // This is a maximum priority event and must be on 217 // the same cycle as GPUTLB cleanup event to prevent 218 // race conditions with an IssueProbeEvent caused by 219 // MemSidePort::recvReqRetry 220 cleanupQueue.push(virt_page_addr); 221 222 if (!cleanupEvent.scheduled()) 223 schedule(cleanupEvent, curTick()); 224} 225 226// Receive translation requests, create a coalesced request, 227// and send them to the TLB (TLBProbesPerCycle) 228bool 229TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) 230{ 231 // first packet of a coalesced request 232 PacketPtr first_packet = nullptr; 233 // true if we are able to do coalescing 234 bool didCoalesce = false; 235 // number of coalesced reqs for a given window 236 int coalescedReq_cnt = 0; 237 238 TheISA::GpuTLB::TranslationState *sender_state = 239 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 240 241 // push back the port to remember the path back 242 sender_state->ports.push_back(this); 243 244 bool update_stats = !sender_state->prefetch; 245 246 if (update_stats) { 247 // if reqCnt is empty then this packet does not represent 248 // multiple uncoalesced reqs(pkts) but just a single pkt. 
249 // If it does though then the reqCnt for each level in the 250 // hierarchy accumulates the total number of reqs this packet 251 // represents 252 int req_cnt = 1; 253 254 if (!sender_state->reqCnt.empty()) 255 req_cnt = sender_state->reqCnt.back(); 256 257 sender_state->reqCnt.push_back(req_cnt); 258 259 // update statistics 260 coalescer->uncoalescedAccesses++; 261 req_cnt = sender_state->reqCnt.back(); 262 DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); 263 coalescer->queuingCycles -= (curTick() * req_cnt); 264 coalescer->localqueuingCycles -= curTick(); 265 } 266 267 // FIXME if you want to coalesce not based on the issueTime 268 // of the packets (i.e., from the compute unit's perspective) 269 // but based on when they reached this coalescer then 270 // remove the following if statement and use curTick() or 271 // coalescingWindow for the tick_index. 272 if (!sender_state->issueTime) 273 sender_state->issueTime = curTick(); 274 275 // The tick index is used as a key to the coalescerFIFO hashmap. 276 // It is shared by all candidates that fall within the 277 // given coalescingWindow. 
278 int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; 279 280 if (coalescer->coalescerFIFO.count(tick_index)) { 281 coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); 282 } 283 284 // see if we can coalesce the incoming pkt with another 285 // coalesced request with the same tick_index 286 for (int i = 0; i < coalescedReq_cnt; ++i) { 287 first_packet = coalescer->coalescerFIFO[tick_index][i][0]; 288 289 if (coalescer->canCoalesce(pkt, first_packet)) { 290 coalescer->coalescerFIFO[tick_index][i].push_back(pkt); 291 292 DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", 293 i, tick_index, 294 coalescer->coalescerFIFO[tick_index][i].size()); 295 296 didCoalesce = true; 297 break; 298 } 299 } 300 301 // if this is the first request for this tick_index 302 // or we did not manage to coalesce, update stats 303 // and make necessary allocations. 304 if (!coalescedReq_cnt || !didCoalesce) { 305 if (update_stats) 306 coalescer->coalescedAccesses++; 307 308 std::vector<PacketPtr> new_array; 309 new_array.push_back(pkt); 310 coalescer->coalescerFIFO[tick_index].push_back(new_array); 311 312 DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " 313 "push\n", tick_index, 314 coalescer->coalescerFIFO[tick_index].size()); 315 } 316 317 //schedule probeTLBEvent next cycle to send the 318 //coalesced requests to the TLB 319 if (!coalescer->probeTLBEvent.scheduled()) { 320 coalescer->schedule(coalescer->probeTLBEvent, 321 curTick() + coalescer->ticks(1)); 322 } 323 324 return true; 325} 326 327void 328TLBCoalescer::CpuSidePort::recvReqRetry() 329{ 330 assert(false); 331} 332 333void 334TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) 335{ 336 337 TheISA::GpuTLB::TranslationState *sender_state = 338 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 339 340 bool update_stats = !sender_state->prefetch; 341 342 if (update_stats) 343 coalescer->uncoalescedAccesses++; 344 345 // If there is a 
pending timing request for this virtual address 346 // print a warning message. This is a temporary caveat of 347 // the current simulator where atomic and timing requests can 348 // coexist. FIXME remove this check/warning in the future. 349 Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); 350 int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); 351 352 if (map_count) { 353 DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " 354 "req. pending\n", virt_page_addr); 355 } 356 357 coalescer->memSidePort[0]->sendFunctional(pkt); 358} 359 360AddrRangeList 361TLBCoalescer::CpuSidePort::getAddrRanges() const 362{ 363 // currently not checked by the master 364 AddrRangeList ranges; 365 366 return ranges; 367} 368 369bool 370TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) 371{ 372 // a translation completed and returned 373 coalescer->updatePhysAddresses(pkt); 374 375 return true; 376} 377 378void 379TLBCoalescer::MemSidePort::recvReqRetry() 380{ 381 //we've receeived a retry. Schedule a probeTLBEvent 382 if (!coalescer->probeTLBEvent.scheduled()) 383 coalescer->schedule(coalescer->probeTLBEvent, 384 curTick() + coalescer->ticks(1)); 385} 386 387void 388TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) 389{ 390 fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); 391} 392 393TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer) 394 : Event(CPU_Tick_Pri), coalescer(_coalescer) 395{ 396} 397 398const char* 399TLBCoalescer::IssueProbeEvent::description() const 400{ 401 return "Probe the TLB below"; 402} 403 404/* 405 * Here we scan the coalescer FIFO and issue the max 406 * number of permitted probes to the TLB below. We 407 * permit bypassing of coalesced requests for the same 408 * tick_index. 409 * 410 * We do not access the next tick_index unless we've 411 * drained the previous one. 
The coalesced requests 412 * that are successfully sent are moved to the 413 * issuedTranslationsTable table (the table which keeps 414 * track of the outstanding reqs) 415 */ 416void 417TLBCoalescer::IssueProbeEvent::process() 418{ 419 // number of TLB probes sent so far 420 int sent_probes = 0; 421 // rejected denotes a blocking event 422 bool rejected = false; 423 424 // It is set to true either when the recvTiming of the TLB below 425 // returns false or when there is another outstanding request for the 426 // same virt. page. 427 428 DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n"); 429 430 for (auto iter = coalescer->coalescerFIFO.begin(); 431 iter != coalescer->coalescerFIFO.end() && !rejected; ) { 432 int coalescedReq_cnt = iter->second.size(); 433 int i = 0; 434 int vector_index = 0; 435 436 DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", 437 coalescedReq_cnt, iter->first); 438 439 while (i < coalescedReq_cnt) { 440 ++i; 441 PacketPtr first_packet = iter->second[vector_index][0]; 442 443 // compute virtual page address for this request 444 Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), 445 TheISA::PageBytes); 446 447 // is there another outstanding request for the same page addr? 448 int pending_reqs = 449 coalescer->issuedTranslationsTable.count(virt_page_addr); 450 451 if (pending_reqs) { 452 DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " 453 "page %#x\n", virt_page_addr); 454 455 ++vector_index; 456 rejected = true; 457 458 continue; 459 } 460 461 // send the coalesced request for virt_page_addr 462 if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) { 463 DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", 464 virt_page_addr); 465 466 // No need for a retries queue since we are already buffering 467 // the coalesced request in coalescerFIFO. 
468 rejected = true; 469 ++vector_index; 470 } else { 471 TheISA::GpuTLB::TranslationState *tmp_sender_state = 472 safe_cast<TheISA::GpuTLB::TranslationState*> 473 (first_packet->senderState); 474 475 bool update_stats = !tmp_sender_state->prefetch; 476 477 if (update_stats) { 478 // req_cnt is total number of packets represented 479 // by the one we just sent counting all the way from 480 // the top of TLB hiearchy (i.e., from the CU) 481 int req_cnt = tmp_sender_state->reqCnt.back(); 482 coalescer->queuingCycles += (curTick() * req_cnt); 483 484 DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", 485 coalescer->name(), req_cnt); 486 487 // pkt_cnt is number of packets we coalesced into the one 488 // we just sent but only at this coalescer level 489 int pkt_cnt = iter->second[vector_index].size(); 490 coalescer->localqueuingCycles += (curTick() * pkt_cnt); 491 } 492 493 DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", 494 virt_page_addr); 495 496 //copy coalescedReq to issuedTranslationsTable 497 coalescer->issuedTranslationsTable[virt_page_addr] 498 = iter->second[vector_index]; 499 500 //erase the entry of this coalesced req 501 iter->second.erase(iter->second.begin() + vector_index); 502 503 if (iter->second.empty()) 504 assert(i == coalescedReq_cnt); 505 506 sent_probes++; 507 if (sent_probes == coalescer->TLBProbesPerCycle) 508 return; 509 } 510 } 511 512 //if there are no more coalesced reqs for this tick_index 513 //erase the hash_map with the first iterator 514 if (iter->second.empty()) { 515 coalescer->coalescerFIFO.erase(iter++); 516 } else { 517 ++iter; 518 } 519 } 520} 521 522TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer) 523 : Event(Maximum_Pri), coalescer(_coalescer) 524{ 525} 526 527const char* 528TLBCoalescer::CleanupEvent::description() const 529{ 530 return "Cleanup issuedTranslationsTable hashmap"; 531} 532 533void 534TLBCoalescer::CleanupEvent::process() 535{ 536 while (!coalescer->cleanupQueue.empty()) { 537 
Addr cleanup_addr = coalescer->cleanupQueue.front(); 538 coalescer->cleanupQueue.pop(); 539 coalescer->issuedTranslationsTable.erase(cleanup_addr); 540 541 DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", 542 cleanup_addr); 543 } 544} 545 546void 547TLBCoalescer::regStats() 548{ 549 MemObject::regStats(); 550 551 uncoalescedAccesses 552 .name(name() + ".uncoalesced_accesses") 553 .desc("Number of uncoalesced TLB accesses") 554 ; 555 556 coalescedAccesses 557 .name(name() + ".coalesced_accesses") 558 .desc("Number of coalesced TLB accesses") 559 ; 560 561 queuingCycles 562 .name(name() + ".queuing_cycles") 563 .desc("Number of cycles spent in queue") 564 ; 565 566 localqueuingCycles 567 .name(name() + ".local_queuing_cycles") 568 .desc("Number of cycles spent in queue for all incoming reqs") 569 ; 570 571 localLatency 572 .name(name() + ".local_latency") 573 .desc("Avg. latency over all incoming pkts") 574 ; 575 576 localLatency = localqueuingCycles / uncoalescedAccesses; 577} 578 579 580TLBCoalescer* 581TLBCoalescerParams::create() 582{ 583 return new TLBCoalescer(this); 584} 585 586