/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#include "gpu-compute/tlb_coalescer.hh"

#include <cstring>

#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

TLBCoalescer::TLBCoalescer(const Params *p)
    : ClockedObject(p),
      clock(p->clk_domain->clockPeriod()),
      TLBProbesPerCycle(p->probesPerCycle),
      coalescingWindow(p->coalescingWindow),
      disableCoalescing(p->disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri)
{
    // create the slave ports based on the number of connected ports
    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                              this, i));
    }

    // create the master ports based on the number of connected ports
    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                              this, i));
    }
}

Port &
TLBCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "slave") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else if (if_name == "master") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt>
 * can be coalesced with <coalesced_pkt> and false otherwise.
 * A given set of rules is checked.
 * The rules can potentially be modified based on the TLB level.
 */
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    if (disableCoalescing)
        return false;

    TheISA::GpuTLB::TranslationState *incoming_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);

    TheISA::GpuTLB::TranslationState *coalesced_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             TheISA::PageBytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              TheISA::PageBytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e., they are both reads
    // or both writes.
    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // When we can coalesce a packet, update reqCnt, i.e., the number
    // of packets represented by this coalesced packet.
    if (!incoming_state->prefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Get the physical page address of the translated request.
    // Using the page_size specified in the TlbEntry allows us
    // to support different page sizes.
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~(page_size - 1);

    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(
                local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!sender_state->prefetch)
            sender_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TlbEntry in the TLBs above.
            auto p = sender_state->tc->getProcessPtr();
            sender_state->tlbEntry =
                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
                                     first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            sender_state->hitLevel = first_hit_level;
        }

        SlavePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // Schedule cleanup for the end of this cycle.
    // This is a maximum priority event and must be on
    // the same cycle as the GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}

// Receive translation requests, create a coalesced request,
// and send them to the TLB below (up to TLBProbesPerCycle probes per cycle)
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->prefetch;

    if (update_stats) {
        // If reqCnt is empty, then this packet does not represent
        // multiple uncoalesced reqs (pkts) but just a single pkt.
        // If it does, then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this
        // packet represents.
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
        coalescer->queuingCycles -= (curTick() * req_cnt);
        coalescer->localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
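    // For example, assuming a coalescingWindow of 1000 ticks, packets with
    // issueTimes of 1250 and 1800 both map to tick_index 1 and are therefore
    // candidates for coalescing with each other.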
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // If this is the first request for this tick_index,
    // or we did not manage to coalesce, update stats
    // and make the necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    // schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->ticks(1));
    }

    return true;
}

void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->prefetch;

    if (update_stats)
        coalescer->uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address,
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the master
    AddrRangeList ranges;

    return ranges;
}

bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}

void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    // we've received a retry; schedule a probeTLBEvent
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->ticks(1));
}

void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to the
 * issuedTranslationsTable (the table which keeps
 * track of the outstanding reqs).
 */
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event: it is set to true either when the
    // recvTiming of the TLB below returns false or when there is another
    // outstanding request for the same virt. page.
    bool rejected = false;

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute the virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                                            TheISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
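                // When the TLB below unblocks, it sends a retry;
                // MemSidePort::recvReqRetry() then reschedules
                // probeTLBEvent, and this entry is attempted again.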
                rejected = true;
                ++vector_index;
            } else {
                TheISA::GpuTLB::TranslationState *tmp_sender_state =
                    safe_cast<TheISA::GpuTLB::TranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->prefetch;

                if (update_stats) {
                    // req_cnt is the total number of packets represented
                    // by the one we just sent, counting all the way from
                    // the top of the TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
                    queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is the number of packets we coalesced into
                    // the one we just sent, but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB,
                        "Successfully sent TLB request for page %#x\n",
                        virt_page_addr);

                // copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                // erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        // if there are no more coalesced reqs for this tick_index,
        // erase the hash_map entry with the first iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}

void
TLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

void
TLBCoalescer::regStats()
{
    ClockedObject::regStats();

    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

    localLatency = localqueuingCycles / uncoalescedAccesses;
}


TLBCoalescer*
TLBCoalescerParams::create()
{
    return new TLBCoalescer(this);
}