tlb_coalescer.cc revision 12126:06c1fbaa5724
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Lisa Hsu 34 */ 35 36#include "gpu-compute/tlb_coalescer.hh" 37 38#include <cstring> 39 40#include "debug/GPUTLB.hh" 41 42TLBCoalescer::TLBCoalescer(const Params *p) 43 : MemObject(p), 44 clock(p->clk_domain->clockPeriod()), 45 TLBProbesPerCycle(p->probesPerCycle), 46 coalescingWindow(p->coalescingWindow), 47 disableCoalescing(p->disableCoalescing), 48 probeTLBEvent([this]{ processProbeTLBEvent(); }, 49 "Probe the TLB below", 50 false, Event::CPU_Tick_Pri), 51 cleanupEvent([this]{ processCleanupEvent(); }, 52 "Cleanup issuedTranslationsTable hashmap", 53 false, Event::Maximum_Pri) 54{ 55 // create the slave ports based on the number of connected ports 56 for (size_t i = 0; i < p->port_slave_connection_count; ++i) { 57 cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), 58 this, i)); 59 } 60 61 // create the master ports based on the number of connected ports 62 for (size_t i = 0; i < p->port_master_connection_count; ++i) { 63 memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), 64 this, i)); 65 } 66} 67 68BaseSlavePort& 69TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) 70{ 71 if (if_name == "slave") { 72 if (idx >= static_cast<PortID>(cpuSidePort.size())) { 73 panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); 74 } 75 76 return *cpuSidePort[idx]; 77 } else { 78 panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); 79 } 80} 81 82BaseMasterPort& 83TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) 84{ 85 if (if_name == "master") { 86 if (idx >= static_cast<PortID>(memSidePort.size())) { 87 panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); 88 } 89 90 return *memSidePort[idx]; 91 } else { 92 panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); 93 } 94} 95 96/* 97 * This method returns true if the <incoming_pkt> 98 * can be coalesced with <coalesced_pkt> and false otherwise. 99 * A given set of rules is checked. 100 * The rules can potentially be modified based on the TLB level. 101 */ 102bool 103TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) 104{ 105 if (disableCoalescing) 106 return false; 107 108 TheISA::GpuTLB::TranslationState *incoming_state = 109 safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState); 110 111 TheISA::GpuTLB::TranslationState *coalesced_state = 112 safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState); 113 114 // Rule 1: Coalesce requests only if they 115 // fall within the same virtual page 116 Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), 117 TheISA::PageBytes); 118 119 Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), 120 TheISA::PageBytes); 121 122 if (incoming_virt_page_addr != coalesced_virt_page_addr) 123 return false; 124 125 //* Rule 2: Coalesce requests only if they 126 // share a TLB Mode, i.e. they are both read 127 // or write requests. 128 BaseTLB::Mode incoming_mode = incoming_state->tlbMode; 129 BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; 130 131 if (incoming_mode != coalesced_mode) 132 return false; 133 134 // when we can coalesce a packet update the reqCnt 135 // that is the number of packets represented by 136 // this coalesced packet 137 if (!incoming_state->prefetch) 138 coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); 139 140 return true; 141} 142 143/* 144 * We need to update the physical addresses of all the translation requests 145 * that were coalesced into the one that just returned. 146 */ 147void 148TLBCoalescer::updatePhysAddresses(PacketPtr pkt) 149{ 150 Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); 151 152 DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n", 153 issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); 154 155 TheISA::GpuTLB::TranslationState *sender_state = 156 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 157 158 TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; 159 assert(tlb_entry); 160 Addr first_entry_vaddr = tlb_entry->vaddr; 161 Addr first_entry_paddr = tlb_entry->paddr; 162 int page_size = tlb_entry->size(); 163 bool uncacheable = tlb_entry->uncacheable; 164 int first_hit_level = sender_state->hitLevel; 165 bool valid = tlb_entry->valid; 166 167 // Get the physical page address of the translated request 168 // Using the page_size specified in the TLBEntry allows us 169 // to support different page sizes. 170 Addr phys_page_paddr = pkt->req->getPaddr(); 171 phys_page_paddr &= ~(page_size - 1); 172 173 for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { 174 PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; 175 TheISA::GpuTLB::TranslationState *sender_state = 176 safe_cast<TheISA::GpuTLB::TranslationState*>( 177 local_pkt->senderState); 178 179 // we are sending the packet back, so pop the reqCnt associated 180 // with this level in the TLB hiearchy 181 if (!sender_state->prefetch) 182 sender_state->reqCnt.pop_back(); 183 184 /* 185 * Only the first packet from this coalesced request has been 186 * translated. Grab the translated phys. page addr and update the 187 * physical addresses of the remaining packets with the appropriate 188 * page offsets. 189 */ 190 if (i) { 191 Addr paddr = phys_page_paddr; 192 paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); 193 local_pkt->req->setPaddr(paddr); 194 195 if (uncacheable) 196 local_pkt->req->setFlags(Request::UNCACHEABLE); 197 198 // update senderState->tlbEntry, so we can insert 199 // the correct TLBEentry in the TLBs above. 200 sender_state->tlbEntry = 201 new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, 202 valid); 203 204 // update the hitLevel for all uncoalesced reqs 205 // so that each packet knows where it hit 206 // (used for statistics in the CUs) 207 sender_state->hitLevel = first_hit_level; 208 } 209 210 SlavePort *return_port = sender_state->ports.back(); 211 sender_state->ports.pop_back(); 212 213 // Translation is done - Convert to a response pkt if necessary and 214 // send the translation back 215 if (local_pkt->isRequest()) { 216 local_pkt->makeTimingResponse(); 217 } 218 219 return_port->sendTimingResp(local_pkt); 220 } 221 222 // schedule clean up for end of this cycle 223 // This is a maximum priority event and must be on 224 // the same cycle as GPUTLB cleanup event to prevent 225 // race conditions with an IssueProbeEvent caused by 226 // MemSidePort::recvReqRetry 227 cleanupQueue.push(virt_page_addr); 228 229 if (!cleanupEvent.scheduled()) 230 schedule(cleanupEvent, curTick()); 231} 232 233// Receive translation requests, create a coalesced request, 234// and send them to the TLB (TLBProbesPerCycle) 235bool 236TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) 237{ 238 // first packet of a coalesced request 239 PacketPtr first_packet = nullptr; 240 // true if we are able to do coalescing 241 bool didCoalesce = false; 242 // number of coalesced reqs for a given window 243 int coalescedReq_cnt = 0; 244 245 TheISA::GpuTLB::TranslationState *sender_state = 246 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 247 248 // push back the port to remember the path back 249 sender_state->ports.push_back(this); 250 251 bool update_stats = !sender_state->prefetch; 252 253 if (update_stats) { 254 // if reqCnt is empty then this packet does not represent 255 // multiple uncoalesced reqs(pkts) but just a single pkt. 256 // If it does though then the reqCnt for each level in the 257 // hierarchy accumulates the total number of reqs this packet 258 // represents 259 int req_cnt = 1; 260 261 if (!sender_state->reqCnt.empty()) 262 req_cnt = sender_state->reqCnt.back(); 263 264 sender_state->reqCnt.push_back(req_cnt); 265 266 // update statistics 267 coalescer->uncoalescedAccesses++; 268 req_cnt = sender_state->reqCnt.back(); 269 DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); 270 coalescer->queuingCycles -= (curTick() * req_cnt); 271 coalescer->localqueuingCycles -= curTick(); 272 } 273 274 // FIXME if you want to coalesce not based on the issueTime 275 // of the packets (i.e., from the compute unit's perspective) 276 // but based on when they reached this coalescer then 277 // remove the following if statement and use curTick() or 278 // coalescingWindow for the tick_index. 279 if (!sender_state->issueTime) 280 sender_state->issueTime = curTick(); 281 282 // The tick index is used as a key to the coalescerFIFO hashmap. 283 // It is shared by all candidates that fall within the 284 // given coalescingWindow. 285 int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; 286 287 if (coalescer->coalescerFIFO.count(tick_index)) { 288 coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); 289 } 290 291 // see if we can coalesce the incoming pkt with another 292 // coalesced request with the same tick_index 293 for (int i = 0; i < coalescedReq_cnt; ++i) { 294 first_packet = coalescer->coalescerFIFO[tick_index][i][0]; 295 296 if (coalescer->canCoalesce(pkt, first_packet)) { 297 coalescer->coalescerFIFO[tick_index][i].push_back(pkt); 298 299 DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", 300 i, tick_index, 301 coalescer->coalescerFIFO[tick_index][i].size()); 302 303 didCoalesce = true; 304 break; 305 } 306 } 307 308 // if this is the first request for this tick_index 309 // or we did not manage to coalesce, update stats 310 // and make necessary allocations. 311 if (!coalescedReq_cnt || !didCoalesce) { 312 if (update_stats) 313 coalescer->coalescedAccesses++; 314 315 std::vector<PacketPtr> new_array; 316 new_array.push_back(pkt); 317 coalescer->coalescerFIFO[tick_index].push_back(new_array); 318 319 DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " 320 "push\n", tick_index, 321 coalescer->coalescerFIFO[tick_index].size()); 322 } 323 324 //schedule probeTLBEvent next cycle to send the 325 //coalesced requests to the TLB 326 if (!coalescer->probeTLBEvent.scheduled()) { 327 coalescer->schedule(coalescer->probeTLBEvent, 328 curTick() + coalescer->ticks(1)); 329 } 330 331 return true; 332} 333 334void 335TLBCoalescer::CpuSidePort::recvReqRetry() 336{ 337 assert(false); 338} 339 340void 341TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) 342{ 343 344 TheISA::GpuTLB::TranslationState *sender_state = 345 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 346 347 bool update_stats = !sender_state->prefetch; 348 349 if (update_stats) 350 coalescer->uncoalescedAccesses++; 351 352 // If there is a pending timing request for this virtual address 353 // print a warning message. This is a temporary caveat of 354 // the current simulator where atomic and timing requests can 355 // coexist. FIXME remove this check/warning in the future. 356 Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); 357 int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); 358 359 if (map_count) { 360 DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " 361 "req. pending\n", virt_page_addr); 362 } 363 364 coalescer->memSidePort[0]->sendFunctional(pkt); 365} 366 367AddrRangeList 368TLBCoalescer::CpuSidePort::getAddrRanges() const 369{ 370 // currently not checked by the master 371 AddrRangeList ranges; 372 373 return ranges; 374} 375 376bool 377TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) 378{ 379 // a translation completed and returned 380 coalescer->updatePhysAddresses(pkt); 381 382 return true; 383} 384 385void 386TLBCoalescer::MemSidePort::recvReqRetry() 387{ 388 //we've receeived a retry. Schedule a probeTLBEvent 389 if (!coalescer->probeTLBEvent.scheduled()) 390 coalescer->schedule(coalescer->probeTLBEvent, 391 curTick() + coalescer->ticks(1)); 392} 393 394void 395TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) 396{ 397 fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); 398} 399 400/* 401 * Here we scan the coalescer FIFO and issue the max 402 * number of permitted probes to the TLB below. We 403 * permit bypassing of coalesced requests for the same 404 * tick_index. 405 * 406 * We do not access the next tick_index unless we've 407 * drained the previous one. The coalesced requests 408 * that are successfully sent are moved to the 409 * issuedTranslationsTable table (the table which keeps 410 * track of the outstanding reqs) 411 */ 412void 413TLBCoalescer::processProbeTLBEvent() 414{ 415 // number of TLB probes sent so far 416 int sent_probes = 0; 417 // rejected denotes a blocking event 418 bool rejected = false; 419 420 // It is set to true either when the recvTiming of the TLB below 421 // returns false or when there is another outstanding request for the 422 // same virt. page. 423 424 DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__); 425 426 for (auto iter = coalescerFIFO.begin(); 427 iter != coalescerFIFO.end() && !rejected; ) { 428 int coalescedReq_cnt = iter->second.size(); 429 int i = 0; 430 int vector_index = 0; 431 432 DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", 433 coalescedReq_cnt, iter->first); 434 435 while (i < coalescedReq_cnt) { 436 ++i; 437 PacketPtr first_packet = iter->second[vector_index][0]; 438 439 // compute virtual page address for this request 440 Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), 441 TheISA::PageBytes); 442 443 // is there another outstanding request for the same page addr? 444 int pending_reqs = 445 issuedTranslationsTable.count(virt_page_addr); 446 447 if (pending_reqs) { 448 DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " 449 "page %#x\n", virt_page_addr); 450 451 ++vector_index; 452 rejected = true; 453 454 continue; 455 } 456 457 // send the coalesced request for virt_page_addr 458 if (!memSidePort[0]->sendTimingReq(first_packet)) { 459 DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", 460 virt_page_addr); 461 462 // No need for a retries queue since we are already buffering 463 // the coalesced request in coalescerFIFO. 464 rejected = true; 465 ++vector_index; 466 } else { 467 TheISA::GpuTLB::TranslationState *tmp_sender_state = 468 safe_cast<TheISA::GpuTLB::TranslationState*> 469 (first_packet->senderState); 470 471 bool update_stats = !tmp_sender_state->prefetch; 472 473 if (update_stats) { 474 // req_cnt is total number of packets represented 475 // by the one we just sent counting all the way from 476 // the top of TLB hiearchy (i.e., from the CU) 477 int req_cnt = tmp_sender_state->reqCnt.back(); 478 queuingCycles += (curTick() * req_cnt); 479 480 DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", 481 name(), req_cnt); 482 483 // pkt_cnt is number of packets we coalesced into the one 484 // we just sent but only at this coalescer level 485 int pkt_cnt = iter->second[vector_index].size(); 486 localqueuingCycles += (curTick() * pkt_cnt); 487 } 488 489 DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", 490 virt_page_addr); 491 492 //copy coalescedReq to issuedTranslationsTable 493 issuedTranslationsTable[virt_page_addr] 494 = iter->second[vector_index]; 495 496 //erase the entry of this coalesced req 497 iter->second.erase(iter->second.begin() + vector_index); 498 499 if (iter->second.empty()) 500 assert(i == coalescedReq_cnt); 501 502 sent_probes++; 503 if (sent_probes == TLBProbesPerCycle) 504 return; 505 } 506 } 507 508 //if there are no more coalesced reqs for this tick_index 509 //erase the hash_map with the first iterator 510 if (iter->second.empty()) { 511 coalescerFIFO.erase(iter++); 512 } else { 513 ++iter; 514 } 515 } 516} 517 518void 519TLBCoalescer::processCleanupEvent() 520{ 521 while (!cleanupQueue.empty()) { 522 Addr cleanup_addr = cleanupQueue.front(); 523 cleanupQueue.pop(); 524 issuedTranslationsTable.erase(cleanup_addr); 525 526 DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", 527 cleanup_addr); 528 } 529} 530 531void 532TLBCoalescer::regStats() 533{ 534 MemObject::regStats(); 535 536 uncoalescedAccesses 537 .name(name() + ".uncoalesced_accesses") 538 .desc("Number of uncoalesced TLB accesses") 539 ; 540 541 coalescedAccesses 542 .name(name() + ".coalesced_accesses") 543 .desc("Number of coalesced TLB accesses") 544 ; 545 546 queuingCycles 547 .name(name() + ".queuing_cycles") 548 .desc("Number of cycles spent in queue") 549 ; 550 551 localqueuingCycles 552 .name(name() + ".local_queuing_cycles") 553 .desc("Number of cycles spent in queue for all incoming reqs") 554 ; 555 556 localLatency 557 .name(name() + ".local_latency") 558 .desc("Avg. latency over all incoming pkts") 559 ; 560 561 localLatency = localqueuingCycles / uncoalescedAccesses; 562} 563 564 565TLBCoalescer* 566TLBCoalescerParams::create() 567{ 568 return new TLBCoalescer(this); 569} 570 571