1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its 18 * contributors may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Authors: Lisa Hsu 34 */ 35 36#ifndef __GPU_TLB_HH__ 37#define __GPU_TLB_HH__ 38 39#include <fstream> 40#include <list> 41#include <queue> 42#include <string> 43#include <vector> 44 45#include "arch/generic/tlb.hh" 46#include "arch/x86/pagetable.hh" 47#include "arch/x86/pagetable_walker.hh" 48#include "arch/x86/regs/segment.hh" 49#include "base/callback.hh" 50#include "base/logging.hh" 51#include "base/statistics.hh" 52#include "gpu-compute/compute_unit.hh" 53#include "mem/port.hh" 54#include "mem/request.hh" 55#include "params/X86GPUTLB.hh" 56#include "sim/clocked_object.hh" 57#include "sim/sim_object.hh" 58 59class BaseTLB; 60class Packet; 61class ThreadContext; 62 63namespace X86ISA 64{ 65 class GpuTLB : public ClockedObject 66 { 67 protected: 68 friend class Walker; 69 70 typedef std::list<TlbEntry*> EntryList; 71 72 uint32_t configAddress; 73 74 // TLB clock: will inherit clock from shader's clock period in terms 75 // of nuber of ticks of curTime (aka global simulation clock) 76 // The assignment of TLB clock from shader clock is done in the python 77 // config files. 78 int clock; 79 80 public: 81 // clock related functions ; maps to-and-from Simulation ticks and 82 // object clocks. 83 Tick frequency() const { return SimClock::Frequency / clock; } 84 85 Tick 86 ticks(int numCycles) const 87 { 88 return (Tick)clock * numCycles; 89 } 90 91 Tick curCycle() const { return curTick() / clock; } 92 Tick tickToCycles(Tick val) const { return val / clock;} 93 94 typedef X86GPUTLBParams Params; 95 GpuTLB(const Params *p); 96 ~GpuTLB(); 97 98 typedef enum BaseTLB::Mode Mode; 99 100 class Translation 101 { 102 public: 103 virtual ~Translation() { } 104 105 /** 106 * Signal that the translation has been delayed due to a hw page 107 * table walk. 108 */ 109 virtual void markDelayed() = 0; 110 111 /** 112 * The memory for this object may be dynamically allocated, and it 113 * may be responsible for cleaning itslef up which will happen in 114 * this function. Once it's called the object is no longer valid. 115 */ 116 virtual void finish(Fault fault, const RequestPtr &req, 117 ThreadContext *tc, Mode mode) = 0; 118 }; 119 120 void dumpAll(); 121 TlbEntry *lookup(Addr va, bool update_lru=true); 122 void setConfigAddress(uint32_t addr); 123 124 protected: 125 EntryList::iterator lookupIt(Addr va, bool update_lru=true); 126 Walker *walker; 127 128 public: 129 Walker *getWalker(); 130 void invalidateAll(); 131 void invalidateNonGlobal(); 132 void demapPage(Addr va, uint64_t asn); 133 134 protected: 135 int size; 136 int assoc; 137 int numSets; 138 139 /** 140 * true if this is a fully-associative TLB 141 */ 142 bool FA; 143 Addr setMask; 144 145 /** 146 * Allocation Policy: true if we always allocate on a hit, false 147 * otherwise. Default is true. 148 */ 149 bool allocationPolicy; 150 151 /** 152 * if true, then this is not the last level TLB 153 */ 154 bool hasMemSidePort; 155 156 /** 157 * Print out accessDistance stats. One stat file 158 * per TLB. 159 */ 160 bool accessDistance; 161 162 std::vector<TlbEntry> tlb; 163 164 /* 165 * It's a per-set list. As long as we have not reached 166 * the full capacity of the given set, grab an entry from 167 * the freeList. 168 */ 169 std::vector<EntryList> freeList; 170 171 /** 172 * An entryList per set is the equivalent of an LRU stack; 173 * it's used to guide replacement decisions. The head of the list 174 * contains the MRU TLB entry of the given set. If the freeList 175 * for this set is empty, the last element of the list 176 * is evicted (i.e., dropped on the floor). 177 */ 178 std::vector<EntryList> entryList; 179 180 Fault translateInt(const RequestPtr &req, ThreadContext *tc); 181 182 Fault translate(const RequestPtr &req, ThreadContext *tc, 183 Translation *translation, Mode mode, bool &delayedResponse, 184 bool timing, int &latency); 185 186 public: 187 // latencies for a TLB hit, miss and page fault 188 int hitLatency; 189 int missLatency1; 190 int missLatency2; 191 192 // local_stats are as seen from the TLB 193 // without taking into account coalescing 194 Stats::Scalar localNumTLBAccesses; 195 Stats::Scalar localNumTLBHits; 196 Stats::Scalar localNumTLBMisses; 197 Stats::Formula localTLBMissRate; 198 199 // global_stats are as seen from the 200 // CU's perspective taking into account 201 // all coalesced requests. 202 Stats::Scalar globalNumTLBAccesses; 203 Stats::Scalar globalNumTLBHits; 204 Stats::Scalar globalNumTLBMisses; 205 Stats::Formula globalTLBMissRate; 206 207 // from the CU perspective (global) 208 Stats::Scalar accessCycles; 209 // from the CU perspective (global) 210 Stats::Scalar pageTableCycles; 211 Stats::Scalar numUniquePages; 212 // from the perspective of this TLB 213 Stats::Scalar localCycles; 214 // from the perspective of this TLB 215 Stats::Formula localLatency; 216 // I take the avg. per page and then 217 // the avg. over all pages. 218 Stats::Scalar avgReuseDistance; 219 220 void regStats() override; 221 void updatePageFootprint(Addr virt_page_addr); 222 void printAccessPattern(); 223 224 225 Fault translateAtomic(const RequestPtr &req, ThreadContext *tc, 226 Mode mode, int &latency); 227 228 void translateTiming(const RequestPtr &req, ThreadContext *tc, 229 Translation *translation, Mode mode, 230 int &latency); 231 232 Tick doMmuRegRead(ThreadContext *tc, Packet *pkt); 233 Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt); 234 235 TlbEntry *insert(Addr vpn, TlbEntry &entry); 236 237 // Checkpointing 238 virtual void serialize(CheckpointOut& cp) const override; 239 virtual void unserialize(CheckpointIn& cp) override; 240 void issueTranslation(); 241 enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN}; 242 bool tlbLookup(const RequestPtr &req, 243 ThreadContext *tc, bool update_stats); 244 245 void handleTranslationReturn(Addr addr, tlbOutcome outcome, 246 PacketPtr pkt); 247 248 void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome); 249 250 void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt, 251 TlbEntry *tlb_entry, Mode mode); 252 253 void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry, 254 Addr phys_page_addr); 255 256 void issueTLBLookup(PacketPtr pkt); 257 258 // CpuSidePort is the TLB Port closer to the CPU/CU side 259 class CpuSidePort : public SlavePort 260 { 261 public: 262 CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB, 263 PortID _index) 264 : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } 265 266 protected: 267 GpuTLB *tlb; 268 int index; 269 270 virtual bool recvTimingReq(PacketPtr pkt); 271 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 272 virtual void recvFunctional(PacketPtr pkt); 273 virtual void recvRangeChange() { } 274 virtual void recvReqRetry(); 275 virtual void recvRespRetry() { panic("recvRespRetry called"); } 276 virtual AddrRangeList getAddrRanges() const; 277 }; 278 279 /** 280 * MemSidePort is the TLB Port closer to the memory side 281 * If this is a last level TLB then this port will not be connected. 282 * 283 * Future action item: if we ever do real page walks, then this port 284 * should be connected to a RubyPort. 285 */ 286 class MemSidePort : public MasterPort 287 { 288 public: 289 MemSidePort(const std::string &_name, GpuTLB * gpu_TLB, 290 PortID _index) 291 : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { } 292 293 std::deque<PacketPtr> retries; 294 295 protected: 296 GpuTLB *tlb; 297 int index; 298 299 virtual bool recvTimingResp(PacketPtr pkt); 300 virtual Tick recvAtomic(PacketPtr pkt) { return 0; } 301 virtual void recvFunctional(PacketPtr pkt) { } 302 virtual void recvRangeChange() { } 303 virtual void recvReqRetry(); 304 }; 305 306 // TLB ports on the cpu Side 307 std::vector<CpuSidePort*> cpuSidePort; 308 // TLB ports on the memory side 309 std::vector<MemSidePort*> memSidePort; 310 311 Port &getPort(const std::string &if_name, 312 PortID idx=InvalidPortID) override; 313 314 /** 315 * TLB TranslationState: this currently is a somewhat bastardization of 316 * the usage of SenderState, whereby the receiver of a packet is not 317 * usually supposed to need to look at the contents of the senderState, 318 * you're really only supposed to look at what you pushed on, pop it 319 * off, and send it back. 320 * 321 * However, since there is state that we want to pass to the TLBs using 322 * the send/recv Timing/Functional/etc. APIs, which don't allow for new 323 * arguments, we need a common TLB senderState to pass between TLBs, 324 * both "forwards" and "backwards." 325 * 326 * So, basically, the rule is that any packet received by a TLB port 327 * (cpuside OR memside) must be safely castable to a TranslationState. 328 */ 329 330 struct TranslationState : public Packet::SenderState 331 { 332 // TLB mode, read or write 333 Mode tlbMode; 334 // Thread context associated with this req 335 ThreadContext *tc; 336 337 /* 338 * TLB entry to be populated and passed back and filled in 339 * previous TLBs. Equivalent to the data cache concept of 340 * "data return." 341 */ 342 TlbEntry *tlbEntry; 343 // Is this a TLB prefetch request? 344 bool prefetch; 345 // When was the req for this translation issued 346 uint64_t issueTime; 347 // Remember where this came from 348 std::vector<SlavePort*>ports; 349 350 // keep track of #uncoalesced reqs per packet per TLB level; 351 // reqCnt per level >= reqCnt higher level 352 std::vector<int> reqCnt; 353 // TLB level this packet hit in; 0 if it hit in the page table 354 int hitLevel; 355 Packet::SenderState *saved; 356 357 TranslationState(Mode tlb_mode, ThreadContext *_tc, 358 bool _prefetch=false, 359 Packet::SenderState *_saved=nullptr) 360 : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr), 361 prefetch(_prefetch), issueTime(0), 362 hitLevel(0),saved(_saved) { } 363 }; 364 365 // maximum number of permitted coalesced requests per cycle 366 int maxCoalescedReqs; 367 368 // Current number of outstandings coalesced requests. 369 // Should be <= maxCoalescedReqs 370 int outstandingReqs; 371 372 /** 373 * A TLBEvent is scheduled after the TLB lookup and helps us take the 374 * appropriate actions: 375 * (e.g., update TLB on a hit, 376 * send request to lower level TLB on a miss, 377 * or start a page walk if this was the last-level TLB). 378 */ 379 void translationReturn(Addr virtPageAddr, tlbOutcome outcome, 380 PacketPtr pkt); 381 382 class TLBEvent : public Event 383 { 384 private: 385 GpuTLB *tlb; 386 Addr virtPageAddr; 387 /** 388 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK 389 */ 390 tlbOutcome outcome; 391 PacketPtr pkt; 392 393 public: 394 TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome, 395 PacketPtr _pkt); 396 397 void process(); 398 const char *description() const; 399 400 // updateOutcome updates the tlbOutcome of a TLBEvent 401 void updateOutcome(tlbOutcome _outcome); 402 Addr getTLBEventVaddr(); 403 }; 404 405 std::unordered_map<Addr, TLBEvent*> translationReturnEvent; 406 407 // this FIFO queue keeps track of the virt. page addresses 408 // that are pending cleanup 409 std::queue<Addr> cleanupQueue; 410 411 // the cleanupEvent is scheduled after a TLBEvent triggers in order to 412 // free memory and do the required clean-up 413 void cleanup(); 414 415 EventFunctionWrapper cleanupEvent; 416 417 /** 418 * This hash map will use the virtual page address as a key 419 * and will keep track of total number of accesses per page 420 */ 421 422 struct AccessInfo 423 { 424 unsigned int lastTimeAccessed; // last access to this page 425 unsigned int accessesPerPage; 426 // need to divide it by accessesPerPage at the end 427 unsigned int totalReuseDistance; 428 429 /** 430 * The field below will help us compute the access distance, 431 * that is the number of (coalesced) TLB accesses that 432 * happened in between each access to this page 433 * 434 * localTLBAccesses[x] is the value of localTLBNumAccesses 435 * when the page <Addr> was accessed for the <x>th time 436 */ 437 std::vector<unsigned int> localTLBAccesses; 438 unsigned int sumDistance; 439 unsigned int meanDistance; 440 }; 441 442 typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable; 443 AccessPatternTable TLBFootprint; 444 445 // Called at the end of simulation to dump page access stats. 446 void exitCallback(); 447 448 EventFunctionWrapper exitEvent; 449 }; 450} 451 452#endif // __GPU_TLB_HH__ 453