gpu_tlb.hh revision 12717
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__

#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class Packet;
class ThreadContext;

namespace X86ISA
{
    class GpuTLB : public MemObject
    {
      protected:
        friend class Walker;

        typedef std::list<TlbEntry*> EntryList;

        uint32_t configAddress;

        // TLB clock: inherits the shader's clock period, expressed as a
        // number of ticks of curTick (aka the global simulation clock).
        // The assignment of the TLB clock from the shader clock is done in
        // the python config files.
        int clock;

      public:
        // Clock-related functions; map to and from simulation ticks and
        // object cycles.
        Tick frequency() const { return SimClock::Frequency / clock; }

        Tick
        ticks(int numCycles) const
        {
            return (Tick)clock * numCycles;
        }

        Tick curCycle() const { return curTick() / clock; }
        Tick tickToCycles(Tick val) const { return val / clock; }

        typedef X86GPUTLBParams Params;
        GpuTLB(const Params *p);
        ~GpuTLB();

        typedef enum BaseTLB::Mode Mode;

        class Translation
        {
          public:
            virtual ~Translation() { }

            /**
             * Signal that the translation has been delayed due to a hw page
             * table walk.
             */
            virtual void markDelayed() = 0;

            /**
             * The memory for this object may be dynamically allocated, and
             * it may be responsible for cleaning itself up, which will
             * happen in this function. Once it's called the object is no
             * longer valid.
             */
            virtual void finish(Fault fault, RequestPtr req,
                                ThreadContext *tc, Mode mode) = 0;
        };
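
        /**
         * Illustrative sketch (not part of this header): a unit that issues
         * timing translations would implement the Translation interface
         * roughly as below. The DataTranslation name and the self-deleting
         * cleanup policy shown are assumptions for the example only.
         *
         *   class DataTranslation : public GpuTLB::Translation
         *   {
         *       bool wasDelayed = false;
         *
         *     public:
         *       void
         *       markDelayed() override
         *       {
         *           // the TLB missed and a hw page table walk is in flight
         *           wasDelayed = true;
         *       }
         *
         *       void
         *       finish(Fault fault, RequestPtr req, ThreadContext *tc,
         *              GpuTLB::Mode mode) override
         *       {
         *           // consume the result (fault or translated request)
         *           // here; the object may clean itself up, as the
         *           // interface allows
         *           delete this;
         *       }
         *   };
         */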

        void dumpAll();
        TlbEntry *lookup(Addr va, bool update_lru=true);
        void setConfigAddress(uint32_t addr);

      protected:
        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
        Walker *walker;

      public:
        Walker *getWalker();
        void invalidateAll();
        void invalidateNonGlobal();
        void demapPage(Addr va, uint64_t asn);

      protected:
        int size;
        int assoc;
        int numSets;

        /**
         * true if this is a fully-associative TLB
         */
        bool FA;
        Addr setMask;

        /**
         * Allocation Policy: true if we always allocate on a hit, false
         * otherwise. Default is true.
         */
        bool allocationPolicy;

        /**
         * if true, then this is not the last level TLB
         */
        bool hasMemSidePort;

        /**
         * Print out accessDistance stats. One stat file
         * per TLB.
         */
        bool accessDistance;

        std::vector<TlbEntry> tlb;

        /*
         * Per-set free list. As long as we have not reached the full
         * capacity of the given set, grab an entry from the freeList.
         */
        std::vector<EntryList> freeList;

        /**
         * An entryList per set is the equivalent of an LRU stack;
         * it's used to guide replacement decisions. The head of the list
         * contains the MRU TLB entry of the given set. If the freeList
         * for this set is empty, the last element of the list
         * is evicted (i.e., dropped on the floor).
         */
        std::vector<EntryList> entryList;

        Fault translateInt(RequestPtr req, ThreadContext *tc);

        Fault translate(RequestPtr req, ThreadContext *tc,
                        Translation *translation, Mode mode,
                        bool &delayedResponse, bool timing, int &latency);

      public:
        // latencies for a TLB hit, miss and page fault
        int hitLatency;
        int missLatency1;
        int missLatency2;

        // local_stats are as seen from the TLB
        // without taking into account coalescing
        Stats::Scalar localNumTLBAccesses;
        Stats::Scalar localNumTLBHits;
        Stats::Scalar localNumTLBMisses;
        Stats::Formula localTLBMissRate;

        // global_stats are as seen from the CU's perspective,
        // taking into account all coalesced requests.
        Stats::Scalar globalNumTLBAccesses;
        Stats::Scalar globalNumTLBHits;
        Stats::Scalar globalNumTLBMisses;
        Stats::Formula globalTLBMissRate;

        // from the CU perspective (global)
        Stats::Scalar accessCycles;
        // from the CU perspective (global)
        Stats::Scalar pageTableCycles;
        Stats::Scalar numUniquePages;
        // from the perspective of this TLB
        Stats::Scalar localCycles;
        // from the perspective of this TLB
        Stats::Formula localLatency;
        // average reuse distance: computed per page first,
        // then averaged over all pages
        Stats::Scalar avgReuseDistance;
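
        /**
         * Hedged sketch of how the per-set structures above cooperate on a
         * lookup; the set-index computation, the covers() range check, and
         * the lookupSketch name are assumptions for illustration only (the
         * authoritative code is in gpu_tlb.cc).
         *
         *   TlbEntry*
         *   lookupSketch(Addr va, bool update_lru)
         *   {
         *       // pick the set from the virtual page number (assumes a 4KB
         *       // page and setMask == numSets - 1)
         *       int set = (va >> 12) & setMask;
         *
         *       for (auto it = entryList[set].begin();
         *            it != entryList[set].end(); ++it) {
         *           TlbEntry *entry = *it;
         *           // covers() stands in for the real VA-range match
         *           if (covers(entry, va)) {
         *               if (update_lru) {
         *                   // head of entryList is the MRU entry of the set
         *                   entryList[set].erase(it);
         *                   entryList[set].push_front(entry);
         *               }
         *               return entry;
         *           }
         *       }
         *
         *       return nullptr;
         *   }
         */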

        void regStats();
        void updatePageFootprint(Addr virt_page_addr);
        void printAccessPattern();

        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
                              int &latency);

        void translateTiming(RequestPtr req, ThreadContext *tc,
                             Translation *translation, Mode mode,
                             int &latency);

        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);

        TlbEntry *insert(Addr vpn, TlbEntry &entry);

        // Checkpointing
        virtual void serialize(CheckpointOut &cp) const;
        virtual void unserialize(CheckpointIn &cp);
        void issueTranslation();
        enum tlbOutcome { TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN };
        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);

        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
                                     PacketPtr pkt);

        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);

        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
                                    TlbEntry *tlb_entry, Mode mode);

        void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry,
                                 Addr phys_page_addr);

        void issueTLBLookup(PacketPtr pkt);

        // CpuSidePort is the TLB port closer to the CPU/CU side
        class CpuSidePort : public SlavePort
        {
          public:
            CpuSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingReq(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt);
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
            virtual void recvRespRetry() { assert(false); }
            virtual AddrRangeList getAddrRanges() const;
        };

        /**
         * MemSidePort is the TLB port closer to the memory side.
         * If this is a last-level TLB, then this port will not be connected.
         *
         * Future action item: if we ever do real page walks, then this port
         * should be connected to a RubyPort.
         */
        class MemSidePort : public MasterPort
        {
          public:
            MemSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

            std::deque<PacketPtr> retries;

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingResp(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt) { }
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
        };

        // TLB ports on the cpu side
        std::vector<CpuSidePort*> cpuSidePort;
        // TLB ports on the memory side
        std::vector<MemSidePort*> memSidePort;

        BaseMasterPort &getMasterPort(const std::string &if_name,
                                      PortID idx=InvalidPortID);

        BaseSlavePort &getSlavePort(const std::string &if_name,
                                    PortID idx=InvalidPortID);
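
        /**
         * Hedged sketch of the retry discipline that MemSidePort::retries
         * supports, following the usual gem5 master-port pattern: a packet
         * whose sendTimingReq() failed is queued, and recvReqRetry() drains
         * the queue in order. Details may differ from the actual
         * implementation in gpu_tlb.cc.
         *
         *   void
         *   GpuTLB::MemSidePort::recvReqRetry()
         *   {
         *       while (!retries.empty()) {
         *           PacketPtr pkt = retries.front();
         *           if (!sendTimingReq(pkt)) {
         *               // still blocked; wait for the next retry callback
         *               break;
         *           }
         *           retries.pop_front();
         *       }
         *   }
         */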

        /**
         * TLB TranslationState: this is currently a slight abuse of the
         * SenderState mechanism. The receiver of a packet is not normally
         * supposed to look at the contents of the senderState; you are only
         * supposed to look at what you pushed on, pop it off, and send it
         * back.
         *
         * However, since there is state that we want to pass to the TLBs
         * using the send/recv Timing/Functional/etc. APIs, which don't allow
         * for new arguments, we need a common TLB senderState to pass
         * between TLBs, both "forwards" and "backwards."
         *
         * So, basically, the rule is that any packet received by a TLB port
         * (cpu-side OR mem-side) must be safely castable to a
         * TranslationState.
         */
        struct TranslationState : public Packet::SenderState
        {
            // TLB mode, read or write
            Mode tlbMode;
            // Thread context associated with this req
            ThreadContext *tc;

            /*
             * TLB entry to be populated and passed back, so that it can be
             * filled in to previous (upper-level) TLBs; the equivalent of
             * the data cache concept of "data return."
             */
            TlbEntry *tlbEntry;
            // Is this a TLB prefetch request?
            bool prefetch;
            // When was the req for this translation issued
            uint64_t issueTime;
            // Remember where this came from
            std::vector<SlavePort*> ports;

            // keep track of #uncoalesced reqs per packet per TLB level;
            // reqCnt at a level >= reqCnt at a higher level
            std::vector<int> reqCnt;
            // TLB level this packet hit in; 0 if it hit in the page table
            int hitLevel;
            Packet::SenderState *saved;

            TranslationState(Mode tlb_mode, ThreadContext *_tc,
                             bool _prefetch=false,
                             Packet::SenderState *_saved=nullptr)
                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
                  prefetch(_prefetch), issueTime(0),
                  hitLevel(0), saved(_saved) { }
        };

        // maximum number of permitted coalesced requests per cycle
        int maxCoalescedReqs;

        // Current number of outstanding coalesced requests.
        // Should be <= maxCoalescedReqs
        int outstandingReqs;

        /**
         * A TLBEvent is scheduled after the TLB lookup and helps us take the
         * appropriate action (e.g., update the TLB on a hit, send the
         * request to the lower-level TLB on a miss, or start a page walk if
         * this was the last-level TLB).
         */
        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                               PacketPtr pkt);

        class TLBEvent : public Event
        {
          private:
            GpuTLB *tlb;
            Addr virtPageAddr;
            /**
             * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
             */
            tlbOutcome outcome;
            PacketPtr pkt;

          public:
            TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
                     PacketPtr _pkt);

            void process();
            const char *description() const;

            // updateOutcome updates the tlbOutcome of a TLBEvent
            void updateOutcome(tlbOutcome _outcome);
            Addr getTLBEventVaddr();
        };

        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
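
        /**
         * Hedged sketch of how the TLBEvent machinery above is typically
         * driven; the scheduleTranslationReturnSketch name is hypothetical
         * and the real scheduling is done in gpu_tlb.cc. One event is kept
         * per outstanding virtual page, remembered in translationReturnEvent
         * so that later activity for the same page (e.g., a completed page
         * walk) can find it and call updateOutcome().
         *
         *   void
         *   scheduleTranslationReturnSketch(PacketPtr pkt,
         *                                   Addr virt_page_addr,
         *                                   tlbOutcome outcome)
         *   {
         *       assert(!translationReturnEvent.count(virt_page_addr));
         *
         *       TLBEvent *event =
         *           new TLBEvent(this, virt_page_addr, outcome, pkt);
         *       translationReturnEvent[virt_page_addr] = event;
         *
         *       // convert the hit latency from TLB cycles to sim ticks
         *       schedule(event, curTick() + ticks(hitLatency));
         *   }
         */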

        // this FIFO queue keeps track of the virt. page addresses
        // that are pending cleanup
        std::queue<Addr> cleanupQueue;

        // the cleanupEvent is scheduled after a TLBEvent triggers in order
        // to free memory and do the required clean-up
        void cleanup();

        EventFunctionWrapper cleanupEvent;

        /**
         * This hash map uses the virtual page address as a key and keeps
         * track of the total number of accesses per page.
         */
        struct AccessInfo
        {
            unsigned int lastTimeAccessed; // last access to this page
            unsigned int accessesPerPage;
            // need to divide it by accessesPerPage at the end
            unsigned int totalReuseDistance;

            /**
             * The field below helps us compute the access distance, that
             * is, the number of (coalesced) TLB accesses that happened in
             * between each access to this page.
             *
             * localTLBAccesses[x] is the value of localNumTLBAccesses
             * when the page <Addr> was accessed for the <x>th time.
             * A worked sketch of this bookkeeping appears at the end of
             * this file.
             */
            std::vector<unsigned int> localTLBAccesses;
            unsigned int sumDistance;
            unsigned int meanDistance;
        };

        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
        AccessPatternTable TLBFootprint;

        // Called at the end of simulation to dump page access stats.
        void exitCallback();

        EventFunctionWrapper exitEvent;
    };
}

#endif // __GPU_TLB_HH__
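
/*
 * Worked sketch (illustrative only) of the access-distance bookkeeping that
 * AccessInfo describes: every access to a page records the running value of
 * localNumTLBAccesses, so the distance between two consecutive accesses to
 * the same page is the difference of consecutive samples. The loop below and
 * the choice of divisor are assumptions; the real computation is in
 * gpu_tlb.cc.
 *
 *   // on every (coalesced) access to virt_page_addr
 *   AccessInfo &info = TLBFootprint[virt_page_addr];
 *   info.accessesPerPage++;
 *   info.localTLBAccesses.push_back(localNumTLBAccesses.value());
 *
 *   // at the end of simulation, per page
 *   info.sumDistance = 0;
 *   for (size_t x = 1; x < info.localTLBAccesses.size(); ++x) {
 *       info.sumDistance +=
 *           info.localTLBAccesses[x] - info.localTLBAccesses[x - 1];
 *   }
 *   info.meanDistance = info.accessesPerPage > 1 ?
 *       info.sumDistance / (info.accessesPerPage - 1) : 0;
 */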