// gpu_tlb.hh — gem5 revision 13449
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its 18 * contributors may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Authors: Lisa Hsu 34 */ 35 36#ifndef __GPU_TLB_HH__ 37#define __GPU_TLB_HH__ 38 39#include <fstream> 40#include <list> 41#include <queue> 42#include <string> 43#include <vector> 44 45#include "arch/generic/tlb.hh" 46#include "arch/x86/pagetable.hh" 47#include "arch/x86/pagetable_walker.hh" 48#include "arch/x86/regs/segment.hh" 49#include "base/callback.hh" 50#include "base/logging.hh" 51#include "base/statistics.hh" 52#include "gpu-compute/compute_unit.hh" 53#include "mem/mem_object.hh" 54#include "mem/port.hh" 55#include "mem/request.hh" 56#include "params/X86GPUTLB.hh" 57#include "sim/sim_object.hh" 58 59class BaseTLB; 60class Packet; 61class ThreadContext; 62 63namespace X86ISA 64{ 65 class GpuTLB : public MemObject 66 { 67 protected: 68 friend class Walker; 69 70 typedef std::list<TlbEntry*> EntryList; 71 72 uint32_t configAddress; 73 74 // TLB clock: will inherit clock from shader's clock period in terms 75 // of nuber of ticks of curTime (aka global simulation clock) 76 // The assignment of TLB clock from shader clock is done in the python 77 // config files. 78 int clock; 79 80 public: 81 // clock related functions ; maps to-and-from Simulation ticks and 82 // object clocks. 83 Tick frequency() const { return SimClock::Frequency / clock; } 84 85 Tick 86 ticks(int numCycles) const 87 { 88 return (Tick)clock * numCycles; 89 } 90 91 Tick curCycle() const { return curTick() / clock; } 92 Tick tickToCycles(Tick val) const { return val / clock;} 93 94 typedef X86GPUTLBParams Params; 95 GpuTLB(const Params *p); 96 ~GpuTLB(); 97 98 typedef enum BaseTLB::Mode Mode; 99 100 class Translation 101 { 102 public: 103 virtual ~Translation() { } 104 105 /** 106 * Signal that the translation has been delayed due to a hw page 107 * table walk. 108 */ 109 virtual void markDelayed() = 0; 110 111 /** 112 * The memory for this object may be dynamically allocated, and it 113 * may be responsible for cleaning itslef up which will happen in 114 * this function. 
Once it's called the object is no longer valid. 115 */ 116 virtual void finish(Fault fault, const RequestPtr &req, 117 ThreadContext *tc, Mode mode) = 0; 118 }; 119 120 void dumpAll(); 121 TlbEntry *lookup(Addr va, bool update_lru=true); 122 void setConfigAddress(uint32_t addr); 123 124 protected: 125 EntryList::iterator lookupIt(Addr va, bool update_lru=true); 126 Walker *walker; 127 128 public: 129 Walker *getWalker(); 130 void invalidateAll(); 131 void invalidateNonGlobal(); 132 void demapPage(Addr va, uint64_t asn); 133 134 protected: 135 int size; 136 int assoc; 137 int numSets; 138 139 /** 140 * true if this is a fully-associative TLB 141 */ 142 bool FA; 143 Addr setMask; 144 145 /** 146 * Allocation Policy: true if we always allocate on a hit, false 147 * otherwise. Default is true. 148 */ 149 bool allocationPolicy; 150 151 /** 152 * if true, then this is not the last level TLB 153 */ 154 bool hasMemSidePort; 155 156 /** 157 * Print out accessDistance stats. One stat file 158 * per TLB. 159 */ 160 bool accessDistance; 161 162 std::vector<TlbEntry> tlb; 163 164 /* 165 * It's a per-set list. As long as we have not reached 166 * the full capacity of the given set, grab an entry from 167 * the freeList. 168 */ 169 std::vector<EntryList> freeList; 170 171 /** 172 * An entryList per set is the equivalent of an LRU stack; 173 * it's used to guide replacement decisions. The head of the list 174 * contains the MRU TLB entry of the given set. If the freeList 175 * for this set is empty, the last element of the list 176 * is evicted (i.e., dropped on the floor). 
177 */ 178 std::vector<EntryList> entryList; 179 180 Fault translateInt(const RequestPtr &req, ThreadContext *tc); 181 182 Fault translate(const RequestPtr &req, ThreadContext *tc, 183 Translation *translation, Mode mode, bool &delayedResponse, 184 bool timing, int &latency); 185 186 public: 187 // latencies for a TLB hit, miss and page fault 188 int hitLatency; 189 int missLatency1; 190 int missLatency2; 191 192 // local_stats are as seen from the TLB 193 // without taking into account coalescing 194 Stats::Scalar localNumTLBAccesses; 195 Stats::Scalar localNumTLBHits; 196 Stats::Scalar localNumTLBMisses; 197 Stats::Formula localTLBMissRate; 198 199 // global_stats are as seen from the 200 // CU's perspective taking into account 201 // all coalesced requests. 202 Stats::Scalar globalNumTLBAccesses; 203 Stats::Scalar globalNumTLBHits; 204 Stats::Scalar globalNumTLBMisses; 205 Stats::Formula globalTLBMissRate; 206 207 // from the CU perspective (global) 208 Stats::Scalar accessCycles; 209 // from the CU perspective (global) 210 Stats::Scalar pageTableCycles; 211 Stats::Scalar numUniquePages; 212 // from the perspective of this TLB 213 Stats::Scalar localCycles; 214 // from the perspective of this TLB 215 Stats::Formula localLatency; 216 // I take the avg. per page and then 217 // the avg. over all pages. 
218 Stats::Scalar avgReuseDistance; 219 220 void regStats(); 221 void updatePageFootprint(Addr virt_page_addr); 222 void printAccessPattern(); 223 224 225 Fault translateAtomic(const RequestPtr &req, ThreadContext *tc, 226 Mode mode, int &latency); 227 228 void translateTiming(const RequestPtr &req, ThreadContext *tc, 229 Translation *translation, Mode mode, 230 int &latency); 231 232 Tick doMmuRegRead(ThreadContext *tc, Packet *pkt); 233 Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt); 234 235 TlbEntry *insert(Addr vpn, TlbEntry &entry); 236 237 // Checkpointing 238 virtual void serialize(CheckpointOut& cp) const; 239 virtual void unserialize(CheckpointIn& cp); 240 void issueTranslation(); 241 enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN}; 242 bool tlbLookup(const RequestPtr &req, 243 ThreadContext *tc, bool update_stats); 244 245 void handleTranslationReturn(Addr addr, tlbOutcome outcome, 246 PacketPtr pkt); 247 248 void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome); 249 250 void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, 251 TlbEntry *tlb_entry, Mode mode); 252 253 void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry, 254 Addr phys_page_addr); 255 256 void issueTLBLookup(PacketPtr pkt); 257 258 // CpuSidePort is the TLB Port closer to the CPU/CU side 259 class CpuSidePort : public SlavePort 260 { 261 public: 262 CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB, 263 PortID _index) 264 : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } 265 266 protected: 267 GpuTLB *tlb; 268 int index; 269 270 virtual bool recvTimingReq(PacketPtr pkt); 271 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 272 virtual void recvFunctional(PacketPtr pkt); 273 virtual void recvRangeChange() { } 274 virtual void recvReqRetry(); 275 virtual void recvRespRetry() { panic("recvRespRetry called"); } 276 virtual AddrRangeList getAddrRanges() const; 277 }; 278 279 /** 280 * MemSidePort is the TLB Port closer to 
the memory side 281 * If this is a last level TLB then this port will not be connected. 282 * 283 * Future action item: if we ever do real page walks, then this port 284 * should be connected to a RubyPort. 285 */ 286 class MemSidePort : public MasterPort 287 { 288 public: 289 MemSidePort(const std::string &_name, GpuTLB * gpu_TLB, 290 PortID _index) 291 : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } 292 293 std::deque<PacketPtr> retries; 294 295 protected: 296 GpuTLB *tlb; 297 int index; 298 299 virtual bool recvTimingResp(PacketPtr pkt); 300 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 301 virtual void recvFunctional(PacketPtr pkt) { } 302 virtual void recvRangeChange() { } 303 virtual void recvReqRetry(); 304 }; 305 306 // TLB ports on the cpu Side 307 std::vector<CpuSidePort*> cpuSidePort; 308 // TLB ports on the memory side 309 std::vector<MemSidePort*> memSidePort; 310 311 BaseMasterPort &getMasterPort(const std::string &if_name, 312 PortID idx=InvalidPortID); 313 314 BaseSlavePort &getSlavePort(const std::string &if_name, 315 PortID idx=InvalidPortID); 316 317 /** 318 * TLB TranslationState: this currently is a somewhat bastardization of 319 * the usage of SenderState, whereby the receiver of a packet is not 320 * usually supposed to need to look at the contents of the senderState, 321 * you're really only supposed to look at what you pushed on, pop it 322 * off, and send it back. 323 * 324 * However, since there is state that we want to pass to the TLBs using 325 * the send/recv Timing/Functional/etc. APIs, which don't allow for new 326 * arguments, we need a common TLB senderState to pass between TLBs, 327 * both "forwards" and "backwards." 328 * 329 * So, basically, the rule is that any packet received by a TLB port 330 * (cpuside OR memside) must be safely castable to a TranslationState. 
331 */ 332 333 struct TranslationState : public Packet::SenderState 334 { 335 // TLB mode, read or write 336 Mode tlbMode; 337 // Thread context associated with this req 338 ThreadContext *tc; 339 340 /* 341 * TLB entry to be populated and passed back and filled in 342 * previous TLBs. Equivalent to the data cache concept of 343 * "data return." 344 */ 345 TlbEntry *tlbEntry; 346 // Is this a TLB prefetch request? 347 bool prefetch; 348 // When was the req for this translation issued 349 uint64_t issueTime; 350 // Remember where this came from 351 std::vector<SlavePort*>ports; 352 353 // keep track of #uncoalesced reqs per packet per TLB level; 354 // reqCnt per level >= reqCnt higher level 355 std::vector<int> reqCnt; 356 // TLB level this packet hit in; 0 if it hit in the page table 357 int hitLevel; 358 Packet::SenderState *saved; 359 360 TranslationState(Mode tlb_mode, ThreadContext *_tc, 361 bool _prefetch=false, 362 Packet::SenderState *_saved=nullptr) 363 : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr), 364 prefetch(_prefetch), issueTime(0), 365 hitLevel(0),saved(_saved) { } 366 }; 367 368 // maximum number of permitted coalesced requests per cycle 369 int maxCoalescedReqs; 370 371 // Current number of outstandings coalesced requests. 372 // Should be <= maxCoalescedReqs 373 int outstandingReqs; 374 375 /** 376 * A TLBEvent is scheduled after the TLB lookup and helps us take the 377 * appropriate actions: 378 * (e.g., update TLB on a hit, 379 * send request to lower level TLB on a miss, 380 * or start a page walk if this was the last-level TLB). 
381 */ 382 void translationReturn(Addr virtPageAddr, tlbOutcome outcome, 383 PacketPtr pkt); 384 385 class TLBEvent : public Event 386 { 387 private: 388 GpuTLB *tlb; 389 Addr virtPageAddr; 390 /** 391 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK 392 */ 393 tlbOutcome outcome; 394 PacketPtr pkt; 395 396 public: 397 TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome, 398 PacketPtr _pkt); 399 400 void process(); 401 const char *description() const; 402 403 // updateOutcome updates the tlbOutcome of a TLBEvent 404 void updateOutcome(tlbOutcome _outcome); 405 Addr getTLBEventVaddr(); 406 }; 407 408 std::unordered_map<Addr, TLBEvent*> translationReturnEvent; 409 410 // this FIFO queue keeps track of the virt. page addresses 411 // that are pending cleanup 412 std::queue<Addr> cleanupQueue; 413 414 // the cleanupEvent is scheduled after a TLBEvent triggers in order to 415 // free memory and do the required clean-up 416 void cleanup(); 417 418 EventFunctionWrapper cleanupEvent; 419 420 /** 421 * This hash map will use the virtual page address as a key 422 * and will keep track of total number of accesses per page 423 */ 424 425 struct AccessInfo 426 { 427 unsigned int lastTimeAccessed; // last access to this page 428 unsigned int accessesPerPage; 429 // need to divide it by accessesPerPage at the end 430 unsigned int totalReuseDistance; 431 432 /** 433 * The field below will help us compute the access distance, 434 * that is the number of (coalesced) TLB accesses that 435 * happened in between each access to this page 436 * 437 * localTLBAccesses[x] is the value of localTLBNumAccesses 438 * when the page <Addr> was accessed for the <x>th time 439 */ 440 std::vector<unsigned int> localTLBAccesses; 441 unsigned int sumDistance; 442 unsigned int meanDistance; 443 }; 444 445 typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable; 446 AccessPatternTable TLBFootprint; 447 448 // Called at the end of simulation to dump page access stats. 
449 void exitCallback(); 450 451 EventFunctionWrapper exitEvent; 452 }; 453} 454 455#endif // __GPU_TLB_HH__ 456