// gpu_tlb.hh (gem5, revision 11713)
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__

#include <cassert>
#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class Packet;
class ThreadContext;

namespace X86ISA
{
65    class GpuTlbEntry : public TlbEntry
66    {
67      public:
68        GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
69          : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
70
71        GpuTlbEntry() : TlbEntry(), valid(false) { }
72
73        bool valid;
74    };
75
    /**
     * GpuTLB models a (possibly multi-level) TLB on the GPU side of the
     * gpu-compute system. It supports atomic, timing, and functional
     * translations, is set-associative (or fully associative) with LRU
     * replacement, and — when it is the last-level TLB — falls back to the
     * x86 page-table walker on a miss. Instances are chained through
     * CpuSidePort/MemSidePort to build a hierarchy.
     */
    class GpuTLB : public MemObject
    {
      protected:
        friend class Walker;

        typedef std::list<GpuTlbEntry*> EntryList;

        // Address set via setConfigAddress(); used by the MMU register
        // access paths (doMmuRegRead/doMmuRegWrite) — see gpu_tlb.cc.
        uint32_t configAddress;

        // TLB clock: will inherit clock from shader's clock period in terms
        // of number of ticks of curTime (aka global simulation clock)
        // The assignment of TLB clock from shader clock is done in the python
        // config files.
        int clock;

      public:
        // clock related functions ; maps to-and-from Simulation ticks and
        // object clocks.
        Tick frequency() const { return SimClock::Frequency / clock; }

        /** Convert a cycle count on this TLB's clock into simulation ticks. */
        Tick
        ticks(int numCycles) const
        {
            return (Tick)clock * numCycles;
        }

        Tick curCycle() const { return curTick() / clock; }
        Tick tickToCycles(Tick val) const { return val / clock;}

        typedef X86GPUTLBParams Params;
        GpuTLB(const Params *p);
        ~GpuTLB();

        typedef enum BaseTLB::Mode Mode;

        /**
         * Callback interface for clients waiting on a translation that may
         * be delayed by a hardware page-table walk.
         */
        class Translation
        {
          public:
            virtual ~Translation() { }

            /**
             * Signal that the translation has been delayed due to a hw page
             * table walk.
             */
            virtual void markDelayed() = 0;

            /**
             * The memory for this object may be dynamically allocated, and it
             * may be responsible for cleaning itself up which will happen in
             * this function. Once it's called the object is no longer valid.
             */
            virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
                    Mode mode) = 0;
        };

        /** Dump the contents of the TLB (debugging aid). */
        void dumpAll();
        /** Look up va; on a hit, optionally promote the entry to MRU. */
        GpuTlbEntry *lookup(Addr va, bool update_lru=true);
        void setConfigAddress(uint32_t addr);

      protected:
        /** Like lookup(), but returns an iterator into the set's LRU list. */
        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
        /** Page-table walker used on last-level TLB misses. */
        Walker *walker;

      public:
        Walker *getWalker();
        void invalidateAll();
        void invalidateNonGlobal();
        void demapPage(Addr va, uint64_t asn);

      protected:
        // TLB geometry: total entries, associativity, and derived set count.
        int size;
        int assoc;
        int numSets;

        /**
         *  true if this is a fully-associative TLB
         */
        bool FA;
        // Mask used to extract the set index from a page address
        // (not meaningful when FA is true).
        Addr setMask;

        /**
         * Allocation Policy: true if we always allocate on a hit, false
         * otherwise. Default is true.
         */
        bool allocationPolicy;

        /**
         * if true, then this is not the last level TLB
         */
        bool hasMemSidePort;

        /**
         * Print out accessDistance stats. One stat file
         * per TLB.
         */
        bool accessDistance;

        // Backing storage for all TLB entries (EntryLists hold pointers
        // into this vector).
        std::vector<GpuTlbEntry> tlb;

        /*
         * It's a per-set list. As long as we have not reached
         * the full capacity of the given set, grab an entry from
         * the freeList.
         */
        std::vector<EntryList> freeList;

        /**
         * An entryList per set is the equivalent of an LRU stack;
         * it's used to guide replacement decisions. The head of the list
         * contains the MRU TLB entry of the given set. If the freeList
         * for this set is empty, the last element of the list
         * is evicted (i.e., dropped on the floor).
         */
        std::vector<EntryList> entryList;

        Fault translateInt(RequestPtr req, ThreadContext *tc);

        Fault translate(RequestPtr req, ThreadContext *tc,
                Translation *translation, Mode mode, bool &delayedResponse,
                bool timing, int &latency);

      public:
        // latencies for a TLB hit, miss and page fault
        int hitLatency;
        int missLatency1;
        int missLatency2;

        // local_stats are as seen from the TLB
        // without taking into account coalescing
        Stats::Scalar localNumTLBAccesses;
        Stats::Scalar localNumTLBHits;
        Stats::Scalar localNumTLBMisses;
        Stats::Formula localTLBMissRate;

        // global_stats are as seen from the
        // CU's perspective taking into account
        // all coalesced requests.
        Stats::Scalar globalNumTLBAccesses;
        Stats::Scalar globalNumTLBHits;
        Stats::Scalar globalNumTLBMisses;
        Stats::Formula globalTLBMissRate;

        // from the CU perspective (global)
        Stats::Scalar accessCycles;
        // from the CU perspective (global)
        Stats::Scalar pageTableCycles;
        Stats::Scalar numUniquePages;
        // from the perspective of this TLB
        Stats::Scalar localCycles;
        // from the perspective of this TLB
        Stats::Formula localLatency;
        // I take the avg. per page and then
        // the avg. over all pages.
        Stats::Scalar avgReuseDistance;

        void regStats();
        // Record an access to virt_page_addr in the TLBFootprint table.
        void updatePageFootprint(Addr virt_page_addr);
        void printAccessPattern();


        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
                              int &latency);

        void translateTiming(RequestPtr req, ThreadContext *tc,
                             Translation *translation, Mode mode,
                             int &latency);

        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);

        // Install a copy of entry for virtual page vpn, evicting the LRU
        // entry of the set if no free entry is available.
        GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);

        // Checkpointing
        virtual void serialize(CheckpointOut& cp) const;
        virtual void unserialize(CheckpointIn& cp);
        void issueTranslation();
        // Possible results of a timing lookup; MISS_RETURN flags a miss
        // response returning from a lower level — TODO confirm in gpu_tlb.cc.
        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);

        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
                                     PacketPtr pkt);

        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);

        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
                                    GpuTlbEntry *tlb_entry, Mode mode);

        void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
                                 Addr phys_page_addr);

        void issueTLBLookup(PacketPtr pkt);

        // CpuSidePort is the TLB Port closer to the CPU/CU side
        class CpuSidePort : public SlavePort
        {
          public:
            CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
                        PortID _index)
                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

          protected:
            GpuTLB *tlb;
            // Position of this port in tlb->cpuSidePort.
            int index;

            virtual bool recvTimingReq(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt);
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
            // This port never issues requests, so a response retry is a bug.
            virtual void recvRespRetry() { assert(false); }
            virtual AddrRangeList getAddrRanges() const;
        };

        /**
         * MemSidePort is the TLB Port closer to the memory side
         * If this is a last level TLB then this port will not be connected.
         *
         * Future action item: if we ever do real page walks, then this port
         * should be connected to a RubyPort.
         */
        class MemSidePort : public MasterPort
        {
          public:
            MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
                        PortID _index)
                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

            // Packets that could not be sent downstream; replayed on
            // recvReqRetry().
            std::deque<PacketPtr> retries;

          protected:
            GpuTLB *tlb;
            // Position of this port in tlb->memSidePort.
            int index;

            virtual bool recvTimingResp(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt) { }
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
        };

        // TLB ports on the cpu Side
        std::vector<CpuSidePort*> cpuSidePort;
        // TLB ports on the memory side
        std::vector<MemSidePort*> memSidePort;

        BaseMasterPort &getMasterPort(const std::string &if_name,
                                      PortID idx=InvalidPortID);

        BaseSlavePort &getSlavePort(const std::string &if_name,
                                    PortID idx=InvalidPortID);

        /**
         * TLB TranslationState: this currently is a somewhat bastardization of
         * the usage of SenderState, whereby the receiver of a packet is not
         * usually supposed to need to look at the contents of the senderState,
         * you're really only supposed to look at what you pushed on, pop it
         * off, and send it back.
         *
         * However, since there is state that we want to pass to the TLBs using
         * the send/recv Timing/Functional/etc. APIs, which don't allow for new
         * arguments, we need a common TLB senderState to pass between TLBs,
         * both "forwards" and "backwards."
         *
         * So, basically, the rule is that any packet received by a TLB port
         * (cpuside OR memside) must be safely castable to a TranslationState.
         */

        struct TranslationState : public Packet::SenderState
        {
            // TLB mode, read or write
            Mode tlbMode;
            // Thread context associated with this req
            ThreadContext *tc;

            /*
            * TLB entry to be populated and passed back and filled in
            * previous TLBs.  Equivalent to the data cache concept of
            * "data return."
            */
            GpuTlbEntry *tlbEntry;
            // Is this a TLB prefetch request?
            bool prefetch;
            // When was the req for this translation issued
            uint64_t issueTime;
            // Remember where this came from
            std::vector<SlavePort*>ports;

            // keep track of #uncoalesced reqs per packet per TLB level;
            // reqCnt per level >= reqCnt higher level
            std::vector<int> reqCnt;
            // TLB level this packet hit in; 0 if it hit in the page table
            int hitLevel;
            // SenderState of the original sender, restored before the
            // packet is handed back upstream.
            Packet::SenderState *saved;

            TranslationState(Mode tlb_mode, ThreadContext *_tc,
                             bool _prefetch=false,
                             Packet::SenderState *_saved=nullptr)
                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
                  prefetch(_prefetch), issueTime(0),
                  hitLevel(0),saved(_saved) { }
        };

        // maximum number of permitted coalesced requests per cycle
        int maxCoalescedReqs;

        // Current number of outstanding coalesced requests.
        // Should be <= maxCoalescedReqs
        int outstandingReqs;

        /**
         * A TLBEvent is scheduled after the TLB lookup and helps us take the
         * appropriate actions:
         *  (e.g., update TLB on a hit,
         *  send request to lower level TLB on a miss,
         *  or start a page walk if this was the last-level TLB).
         */
        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                               PacketPtr pkt);

        class TLBEvent : public Event
        {
            private:
                GpuTLB *tlb;
                Addr virtPageAddr;
                /**
                 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
                 */
                tlbOutcome outcome;
                PacketPtr pkt;

            public:
                TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
                        PacketPtr _pkt);

                void process();
                const char *description() const;

                // updateOutcome updates the tlbOutcome of a TLBEvent
                void updateOutcome(tlbOutcome _outcome);
                Addr getTLBEventVaddr();
        };

        // Outstanding translation-return events, keyed by virtual page
        // address; entries are removed by cleanup().
        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;

        // this FIFO queue keeps track of the virt. page addresses
        // that are pending cleanup
        std::queue<Addr> cleanupQueue;

        // the cleanupEvent is scheduled after a TLBEvent triggers in order to
        // free memory and do the required clean-up
        void cleanup();

        EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;

        /**
         * This hash map will use the virtual page address as a key
         * and will keep track of total number of accesses per page
         */

        struct AccessInfo
        {
            unsigned int lastTimeAccessed; // last access to this page
            unsigned int accessesPerPage;
            // need to divide it by accessesPerPage at the end
            unsigned int totalReuseDistance;

            /**
             * The field below will help us compute the access distance,
             * that is the number of (coalesced) TLB accesses that
             * happened in between each access to this page
             *
             * localTLBAccesses[x] is the value of localTLBNumAccesses
             * when the page <Addr> was accessed for the <x>th time
             */
            std::vector<unsigned int> localTLBAccesses;
            unsigned int sumDistance;
            unsigned int meanDistance;
        };

        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
        AccessPatternTable TLBFootprint;

        // Called at the end of simulation to dump page access stats.
        void exitCallback();

        EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
    };
} // namespace X86ISA

#endif // __GPU_TLB_HH__