gpu_tlb.hh revision 11704
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__

#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class Packet;
class ThreadContext;

namespace X86ISA
{
    class GpuTlbEntry : public TlbEntry
    {
      public:
        GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
          : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }

        GpuTlbEntry() : TlbEntry() { }

        bool valid;
    };

    class GpuTLB : public MemObject
    {
      protected:
        friend class Walker;

        typedef std::list<GpuTlbEntry*> EntryList;

        uint32_t configAddress;

        // TLB clock: inherits its period from the shader's clock,
        // expressed as a number of ticks of curTick() (the global
        // simulation clock). The assignment of the TLB clock from the
        // shader clock is done in the python config files.
        int clock;

      public:
        // Clock-related functions; they map between simulation ticks
        // and this object's cycles.
        Tick frequency() const { return SimClock::Frequency / clock; }

        Tick
        ticks(int numCycles) const
        {
            return (Tick)clock * numCycles;
        }

        Tick curCycle() const { return curTick() / clock; }
        Tick tickToCycles(Tick val) const { return val / clock; }
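        /*
         * Illustrative sketch (not part of the interface): with the
         * helpers above, a latency configured on this TLB in cycles can
         * be converted to global simulation ticks and back. The `tlb`
         * variable below is hypothetical.
         *
         *   GpuTLB *tlb = ...;                       // a constructed TLB
         *   Tick hit_ticks = tlb->ticks(tlb->hitLatency); // cycles -> ticks
         *   Tick now_cycles = tlb->curCycle();           // ticks -> cycles
         *   assert(tlb->tickToCycles(hit_ticks) == tlb->hitLatency);
         */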
        typedef X86GPUTLBParams Params;
        GpuTLB(const Params *p);
        ~GpuTLB();

        typedef enum BaseTLB::Mode Mode;

        class Translation
        {
          public:
            virtual ~Translation() { }

            /**
             * Signal that the translation has been delayed due to a hw page
             * table walk.
             */
            virtual void markDelayed() = 0;

            /**
             * The memory for this object may be dynamically allocated, and
             * it may be responsible for cleaning itself up, which will
             * happen in this function. Once it's called, the object is no
             * longer valid.
             */
            virtual void finish(Fault fault, RequestPtr req,
                                ThreadContext *tc, Mode mode) = 0;
        };

        void dumpAll();
        GpuTlbEntry *lookup(Addr va, bool update_lru=true);
        void setConfigAddress(uint32_t addr);

      protected:
        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
        Walker *walker;

      public:
        Walker *getWalker();
        void invalidateAll();
        void invalidateNonGlobal();
        void demapPage(Addr va, uint64_t asn);

      protected:
        int size;
        int assoc;
        int numSets;

        /**
         * true if this is a fully-associative TLB
         */
        bool FA;
        Addr setMask;

        /**
         * Allocation Policy: true if we always allocate on a hit, false
         * otherwise. Default is true.
         */
        bool allocationPolicy;

        /**
         * if true, then this is not the last-level TLB
         */
        bool hasMemSidePort;

        /**
         * Print out accessDistance stats. One stat file
         * per TLB.
         */
        bool accessDistance;

        std::vector<GpuTlbEntry> tlb;

        /*
         * It's a per-set list. As long as we have not reached
         * the full capacity of the given set, grab an entry from
         * the freeList.
         */
        std::vector<EntryList> freeList;

        /**
         * An entryList per set is the equivalent of an LRU stack;
         * it's used to guide replacement decisions. The head of the list
         * contains the MRU TLB entry of the given set. If the freeList
         * for this set is empty, the last element of the list
         * is evicted (i.e., dropped on the floor). A sketch of the
         * intended lookup/replacement behavior follows below.
         */
        std::vector<EntryList> entryList;
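        /*
         * Illustrative sketch (assumed behavior, not the actual body of
         * lookupIt()): a set-associative lookup that maintains the
         * per-set LRU order described above. `page_va` is the
         * page-aligned virtual address; the set-index computation shown
         * here is an assumption.
         *
         *   int set = FA ? 0 : (page_va >> PageShift) & setMask;
         *   for (auto it = entryList[set].begin();
         *        it != entryList[set].end(); ++it) {
         *       if ((*it)->vaddr == page_va) {   // TLB hit
         *           if (update_lru)              // move to MRU position
         *               entryList[set].splice(entryList[set].begin(),
         *                                     entryList[set], it);
         *           return it;
         *       }
         *   }
         *   // TLB miss: on a subsequent fill, grab an entry from
         *   // freeList[set] if one is available; otherwise evict
         *   // entryList[set].back(), the LRU entry of this set.
         *   return entryList[set].end();
         */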
        Fault translateInt(RequestPtr req, ThreadContext *tc);

        Fault translate(RequestPtr req, ThreadContext *tc,
                        Translation *translation, Mode mode,
                        bool &delayedResponse, bool timing, int &latency);

      public:
        // latencies for a TLB hit, miss, and page fault
        int hitLatency;
        int missLatency1;
        int missLatency2;

        // local_stats are as seen from the TLB
        // without taking into account coalescing
        Stats::Scalar localNumTLBAccesses;
        Stats::Scalar localNumTLBHits;
        Stats::Scalar localNumTLBMisses;
        Stats::Formula localTLBMissRate;

        // global_stats are as seen from the
        // CU's perspective taking into account
        // all coalesced requests.
        Stats::Scalar globalNumTLBAccesses;
        Stats::Scalar globalNumTLBHits;
        Stats::Scalar globalNumTLBMisses;
        Stats::Formula globalTLBMissRate;

        // from the CU perspective (global)
        Stats::Scalar accessCycles;
        // from the CU perspective (global)
        Stats::Scalar pageTableCycles;
        Stats::Scalar numUniquePages;
        // from the perspective of this TLB
        Stats::Scalar localCycles;
        // from the perspective of this TLB
        Stats::Formula localLatency;
        // we take the avg. per page and then
        // the avg. over all pages
        Stats::Scalar avgReuseDistance;

        void regStats();
        void updatePageFootprint(Addr virt_page_addr);
        void printAccessPattern();

        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
                              int &latency);

        void translateTiming(RequestPtr req, ThreadContext *tc,
                             Translation *translation, Mode mode,
                             int &latency);

        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);

        GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);

        // Checkpointing
        virtual void serialize(CheckpointOut &cp) const;
        virtual void unserialize(CheckpointIn &cp);
        void issueTranslation();

        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);

        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
                                     PacketPtr pkt);

        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);

        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
                                    GpuTlbEntry *tlb_entry, Mode mode);

        void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
                                 Addr phys_page_addr);

        void issueTLBLookup(PacketPtr pkt);
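        /*
         * Illustrative sketch of the timing-lookup flow implied by the
         * declarations above (assumed, not the actual implementation):
         *
         *   issueTLBLookup(pkt);      // unpack the packet, probe this TLB
         *   bool hit = tlbLookup(req, tc, update_stats);
         *   tlbOutcome outcome = hit ? TLB_HIT
         *                            : (hasMemSidePort ? TLB_MISS
         *                                              : PAGE_WALK);
         *   // A TLBEvent (declared further below) is then scheduled; when
         *   // it fires, translationReturn() dispatches on the outcome:
         *   //   TLB_HIT   -> run pagingProtectionChecks() and respond,
         *   //   TLB_MISS  -> forward the request to the lower-level TLB,
         *   //   PAGE_WALK -> start a walk; the resulting translation is
         *   //                installed via updatePhysAddresses()/insert().
         */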
        // CpuSidePort is the TLB port closer to the CPU/CU side
        class CpuSidePort : public SlavePort
        {
          public:
            CpuSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
              : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingReq(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt);
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
            virtual void recvRespRetry() { assert(false); }
            virtual AddrRangeList getAddrRanges() const;
        };

        /**
         * MemSidePort is the TLB port closer to the memory side.
         * If this is a last-level TLB, then this port will not be connected.
         *
         * Future action item: if we ever do real page walks, then this port
         * should be connected to a RubyPort.
         */
        class MemSidePort : public MasterPort
        {
          public:
            MemSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
              : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

            std::deque<PacketPtr> retries;

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingResp(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt) { }
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
        };

        // TLB ports on the CPU side
        std::vector<CpuSidePort*> cpuSidePort;
        // TLB ports on the memory side
        std::vector<MemSidePort*> memSidePort;

        BaseMasterPort &getMasterPort(const std::string &if_name,
                                      PortID idx=InvalidPortID);

        BaseSlavePort &getSlavePort(const std::string &if_name,
                                    PortID idx=InvalidPortID);

        /**
         * TLB TranslationState: this is currently somewhat of a
         * bastardization of the usage of SenderState. Normally, the
         * receiver of a packet is not supposed to look at the contents of
         * the senderState; you are only supposed to pop off what you pushed
         * on, and send it back.
         *
         * However, since there is state that we want to pass to the TLBs
         * using the send/recv Timing/Functional/etc. APIs, which don't
         * allow for new arguments, we need a common TLB senderState to pass
         * between TLBs, both "forwards" and "backwards."
         *
         * So, basically, the rule is that any packet received by a TLB port
         * (cpuside OR memside) must be safely castable to a
         * TranslationState. A sketch of this convention follows the struct
         * definition below.
         */
        struct TranslationState : public Packet::SenderState
        {
            // TLB mode, read or write
            Mode tlbMode;
            // Thread context associated with this req
            ThreadContext *tc;

            /*
             * TLB entry to be populated and passed back, and used to fill
             * in previous (higher-level) TLBs. Equivalent to the data cache
             * concept of "data return."
             */
            GpuTlbEntry *tlbEntry;
            // Is this a TLB prefetch request?
            bool prefetch;
            // When was the req for this translation issued
            uint64_t issueTime;
            // Remember where this came from
            std::vector<SlavePort*> ports;

            // keep track of #uncoalesced reqs per packet per TLB level;
            // reqCnt per level >= reqCnt of the level above
            std::vector<int> reqCnt;
            // TLB level this packet hit in; 0 if it hit in the page table
            int hitLevel;
            Packet::SenderState *saved;

            TranslationState(Mode tlb_mode, ThreadContext *_tc,
                             bool _prefetch=false,
                             Packet::SenderState *_saved=nullptr)
                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
                  prefetch(_prefetch), issueTime(0),
                  hitLevel(0), saved(_saved) { }
        };
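        /*
         * Illustrative sketch of the SenderState convention described
         * above (assumed usage, not code from this file):
         *
         *   // sender: push the translation state before sending,
         *   // preserving whatever senderState was already there
         *   pkt->senderState =
         *       new TranslationState(BaseTLB::Read, tc, false,
         *                            pkt->senderState);
         *
         *   // receiver (any TLB port, cpuside or memside): every packet
         *   // must be castable to TranslationState
         *   TranslationState *state =
         *       safe_cast<TranslationState*>(pkt->senderState);
         *
         *   // when the translation completes, pop the state off, restore
         *   // the original sender's state, and clean up
         *   pkt->senderState = state->saved;
         *   delete state;
         */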
        // maximum number of permitted coalesced requests per cycle
        int maxCoalescedReqs;

        // Current number of outstanding coalesced requests.
        // Should be <= maxCoalescedReqs
        int outstandingReqs;

        /**
         * A TLBEvent is scheduled after the TLB lookup and helps us take
         * the appropriate actions:
         * (e.g., update the TLB on a hit,
         * send the request to the lower-level TLB on a miss,
         * or start a page walk if this was the last-level TLB).
         */
        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                               PacketPtr pkt);

        class TLBEvent : public Event
        {
          private:
            GpuTLB *tlb;
            Addr virtPageAddr;
            /**
             * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
             */
            tlbOutcome outcome;
            PacketPtr pkt;

          public:
            TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
                     PacketPtr _pkt);

            void process();
            const char *description() const;

            // updateOutcome updates the tlbOutcome of a TLBEvent
            void updateOutcome(tlbOutcome _outcome);
            Addr getTLBEventVaddr();
        };

        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;

        // this FIFO queue keeps track of the virt. page addresses
        // that are pending cleanup
        std::queue<Addr> cleanupQueue;

        // the cleanupEvent is scheduled after a TLBEvent triggers in order
        // to free memory and do the required clean-up
        void cleanup();

        EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;

        /**
         * This hash map uses the virtual page address as a key
         * and keeps track of the total number of accesses per page.
         */
        struct AccessInfo
        {
            unsigned int lastTimeAccessed; // last access to this page
            unsigned int accessesPerPage;
            // need to divide it by accessesPerPage at the end
            unsigned int totalReuseDistance;

            /**
             * The field below helps us compute the access distance,
             * that is, the number of (coalesced) TLB accesses that
             * happened in between each access to this page.
             *
             * localTLBAccesses[x] is the value of localNumTLBAccesses
             * when the page <Addr> was accessed for the <x>th time.
             * A sketch of the computation appears after the end of this
             * header.
             */
            std::vector<unsigned int> localTLBAccesses;
            unsigned int sumDistance;
            unsigned int meanDistance;
        };

        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
        AccessPatternTable TLBFootprint;

        // Called at the end of simulation to dump page access stats.
        void exitCallback();

        EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
    };
}

#endif // __GPU_TLB_HH__
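/*
 * Illustrative sketch of the access-distance computation described in
 * AccessInfo above (assumed logic, not the actual implementation of
 * updatePageFootprint()/exitCallback()):
 *
 *   // `info` is the AccessInfo for one virtual page; the distance of the
 *   // x-th access is the number of coalesced TLB accesses that occurred
 *   // since the previous access to the same page
 *   unsigned int sum = 0;
 *   for (size_t x = 1; x < info.localTLBAccesses.size(); ++x)
 *       sum += info.localTLBAccesses[x] - info.localTLBAccesses[x - 1];
 *
 *   info.sumDistance = sum;
 *   info.meanDistance = info.localTLBAccesses.size() > 1
 *       ? sum / (info.localTLBAccesses.size() - 1) : 0;
 *
 *   // avgReuseDistance is then the average of meanDistance over all pages
 *   // in TLBFootprint (the "avg. per page, then avg. over all pages"
 *   // mentioned above).
 */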