// gpu_tlb.hh revision 12717
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__

#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class Packet;
class ThreadContext;

63namespace X86ISA
64{
65    class GpuTLB : public MemObject
66    {
67      protected:
68        friend class Walker;
69
70        typedef std::list<TlbEntry*> EntryList;
71
72        uint32_t configAddress;
73
74        // TLB clock: will inherit clock from shader's clock period in terms
75        // of nuber of ticks of curTime (aka global simulation clock)
76        // The assignment of TLB clock from shader clock is done in the python
77        // config files.
78        int clock;
79
80      public:
81        // clock related functions ; maps to-and-from Simulation ticks and
82        // object clocks.
83        Tick frequency() const { return SimClock::Frequency / clock; }
84
85        Tick
86        ticks(int numCycles) const
87        {
88            return (Tick)clock * numCycles;
89        }
90
91        Tick curCycle() const { return curTick() / clock; }
92        Tick tickToCycles(Tick val) const { return val / clock;}
93
94        typedef X86GPUTLBParams Params;
95        GpuTLB(const Params *p);
96        ~GpuTLB();
97
98        typedef enum BaseTLB::Mode Mode;
99
100        class Translation
101        {
102          public:
103            virtual ~Translation() { }
104
105            /**
106             * Signal that the translation has been delayed due to a hw page
107             * table walk.
108             */
109            virtual void markDelayed() = 0;
110
111            /**
112             * The memory for this object may be dynamically allocated, and it
113             * may be responsible for cleaning itslef up which will happen in
114             * this function. Once it's called the object is no longer valid.
115             */
116            virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
117                    Mode mode) = 0;
118        };
119
120        void dumpAll();
121        TlbEntry *lookup(Addr va, bool update_lru=true);
122        void setConfigAddress(uint32_t addr);
123
124      protected:
125        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
126        Walker *walker;
127
128      public:
129        Walker *getWalker();
130        void invalidateAll();
131        void invalidateNonGlobal();
132        void demapPage(Addr va, uint64_t asn);
133
134      protected:
135        int size;
136        int assoc;
137        int numSets;
138
139        /**
140         *  true if this is a fully-associative TLB
141         */
142        bool FA;
143        Addr setMask;
144
145        /**
146         * Allocation Policy: true if we always allocate on a hit, false
147         * otherwise. Default is true.
148         */
149        bool allocationPolicy;
150
151        /**
152         * if true, then this is not the last level TLB
153         */
154        bool hasMemSidePort;
155
156        /**
157         * Print out accessDistance stats. One stat file
158         * per TLB.
159         */
160        bool accessDistance;
161
162        std::vector<TlbEntry> tlb;
163
164        /*
165         * It's a per-set list. As long as we have not reached
166         * the full capacity of the given set, grab an entry from
167         * the freeList.
168         */
169        std::vector<EntryList> freeList;
170
171        /**
172         * An entryList per set is the equivalent of an LRU stack;
173         * it's used to guide replacement decisions. The head of the list
174         * contains the MRU TLB entry of the given set. If the freeList
175         * for this set is empty, the last element of the list
176         * is evicted (i.e., dropped on the floor).
177         */
178        std::vector<EntryList> entryList;
179
180        Fault translateInt(RequestPtr req, ThreadContext *tc);
181
182        Fault translate(RequestPtr req, ThreadContext *tc,
183                Translation *translation, Mode mode, bool &delayedResponse,
184                bool timing, int &latency);
185
186      public:
187        // latencies for a TLB hit, miss and page fault
188        int hitLatency;
189        int missLatency1;
190        int missLatency2;
191
192        // local_stats are as seen from the TLB
193        // without taking into account coalescing
194        Stats::Scalar localNumTLBAccesses;
195        Stats::Scalar localNumTLBHits;
196        Stats::Scalar localNumTLBMisses;
197        Stats::Formula localTLBMissRate;
198
199        // global_stats are as seen from the
200        // CU's perspective taking into account
201        // all coalesced requests.
202        Stats::Scalar globalNumTLBAccesses;
203        Stats::Scalar globalNumTLBHits;
204        Stats::Scalar globalNumTLBMisses;
205        Stats::Formula globalTLBMissRate;
206
207        // from the CU perspective (global)
208        Stats::Scalar accessCycles;
209        // from the CU perspective (global)
210        Stats::Scalar pageTableCycles;
211        Stats::Scalar numUniquePages;
212        // from the perspective of this TLB
213        Stats::Scalar localCycles;
214        // from the perspective of this TLB
215        Stats::Formula localLatency;
216        // I take the avg. per page and then
217        // the avg. over all pages.
218        Stats::Scalar avgReuseDistance;
219
220        void regStats();
221        void updatePageFootprint(Addr virt_page_addr);
222        void printAccessPattern();
223
224
225        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
226                              int &latency);
227
228        void translateTiming(RequestPtr req, ThreadContext *tc,
229                             Translation *translation, Mode mode,
230                             int &latency);
231
232        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
233        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
234
235        TlbEntry *insert(Addr vpn, TlbEntry &entry);
236
237        // Checkpointing
238        virtual void serialize(CheckpointOut& cp) const;
239        virtual void unserialize(CheckpointIn& cp);
240        void issueTranslation();
241        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
242        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
243
244        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
245                                     PacketPtr pkt);
246
247        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
248
249        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
250                                    TlbEntry *tlb_entry, Mode mode);
251
252        void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry,
253                                 Addr phys_page_addr);
254
255        void issueTLBLookup(PacketPtr pkt);
256
257        // CpuSidePort is the TLB Port closer to the CPU/CU side
258        class CpuSidePort : public SlavePort
259        {
260          public:
261            CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
262                        PortID _index)
263                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
264
265          protected:
266            GpuTLB *tlb;
267            int index;
268
269            virtual bool recvTimingReq(PacketPtr pkt);
270            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
271            virtual void recvFunctional(PacketPtr pkt);
272            virtual void recvRangeChange() { }
273            virtual void recvReqRetry();
274            virtual void recvRespRetry() { assert(false); }
275            virtual AddrRangeList getAddrRanges() const;
276        };
277
278        /**
279         * MemSidePort is the TLB Port closer to the memory side
280         * If this is a last level TLB then this port will not be connected.
281         *
282         * Future action item: if we ever do real page walks, then this port
283         * should be connected to a RubyPort.
284         */
285        class MemSidePort : public MasterPort
286        {
287          public:
288            MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
289                        PortID _index)
290                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
291
292            std::deque<PacketPtr> retries;
293
294          protected:
295            GpuTLB *tlb;
296            int index;
297
298            virtual bool recvTimingResp(PacketPtr pkt);
299            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
300            virtual void recvFunctional(PacketPtr pkt) { }
301            virtual void recvRangeChange() { }
302            virtual void recvReqRetry();
303        };
304
305        // TLB ports on the cpu Side
306        std::vector<CpuSidePort*> cpuSidePort;
307        // TLB ports on the memory side
308        std::vector<MemSidePort*> memSidePort;
309
310        BaseMasterPort &getMasterPort(const std::string &if_name,
311                                      PortID idx=InvalidPortID);
312
313        BaseSlavePort &getSlavePort(const std::string &if_name,
314                                    PortID idx=InvalidPortID);
315
316        /**
317         * TLB TranslationState: this currently is a somewhat bastardization of
318         * the usage of SenderState, whereby the receiver of a packet is not
319         * usually supposed to need to look at the contents of the senderState,
320         * you're really only supposed to look at what you pushed on, pop it
321         * off, and send it back.
322         *
323         * However, since there is state that we want to pass to the TLBs using
324         * the send/recv Timing/Functional/etc. APIs, which don't allow for new
325         * arguments, we need a common TLB senderState to pass between TLBs,
326         * both "forwards" and "backwards."
327         *
328         * So, basically, the rule is that any packet received by a TLB port
329         * (cpuside OR memside) must be safely castable to a TranslationState.
330         */
331
332        struct TranslationState : public Packet::SenderState
333        {
334            // TLB mode, read or write
335            Mode tlbMode;
336            // Thread context associated with this req
337            ThreadContext *tc;
338
339            /*
340            * TLB entry to be populated and passed back and filled in
341            * previous TLBs.  Equivalent to the data cache concept of
342            * "data return."
343            */
344            TlbEntry *tlbEntry;
345            // Is this a TLB prefetch request?
346            bool prefetch;
347            // When was the req for this translation issued
348            uint64_t issueTime;
349            // Remember where this came from
350            std::vector<SlavePort*>ports;
351
352            // keep track of #uncoalesced reqs per packet per TLB level;
353            // reqCnt per level >= reqCnt higher level
354            std::vector<int> reqCnt;
355            // TLB level this packet hit in; 0 if it hit in the page table
356            int hitLevel;
357            Packet::SenderState *saved;
358
359            TranslationState(Mode tlb_mode, ThreadContext *_tc,
360                             bool _prefetch=false,
361                             Packet::SenderState *_saved=nullptr)
362                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
363                  prefetch(_prefetch), issueTime(0),
364                  hitLevel(0),saved(_saved) { }
365        };
366
367        // maximum number of permitted coalesced requests per cycle
368        int maxCoalescedReqs;
369
370        // Current number of outstandings coalesced requests.
371        // Should be <= maxCoalescedReqs
372        int outstandingReqs;
373
374        /**
375         * A TLBEvent is scheduled after the TLB lookup and helps us take the
376         * appropriate actions:
377         *  (e.g., update TLB on a hit,
378         *  send request to lower level TLB on a miss,
379         *  or start a page walk if this was the last-level TLB).
380         */
381        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
382                               PacketPtr pkt);
383
384        class TLBEvent : public Event
385        {
386            private:
387                GpuTLB *tlb;
388                Addr virtPageAddr;
389                /**
390                 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
391                 */
392                tlbOutcome outcome;
393                PacketPtr pkt;
394
395            public:
396                TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
397                        PacketPtr _pkt);
398
399                void process();
400                const char *description() const;
401
402                // updateOutcome updates the tlbOutcome of a TLBEvent
403                void updateOutcome(tlbOutcome _outcome);
404                Addr getTLBEventVaddr();
405        };
406
407        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
408
409        // this FIFO queue keeps track of the virt. page addresses
410        // that are pending cleanup
411        std::queue<Addr> cleanupQueue;
412
413        // the cleanupEvent is scheduled after a TLBEvent triggers in order to
414        // free memory and do the required clean-up
415        void cleanup();
416
417        EventFunctionWrapper cleanupEvent;
418
419        /**
420         * This hash map will use the virtual page address as a key
421         * and will keep track of total number of accesses per page
422         */
423
424        struct AccessInfo
425        {
426            unsigned int lastTimeAccessed; // last access to this page
427            unsigned int accessesPerPage;
428            // need to divide it by accessesPerPage at the end
429            unsigned int totalReuseDistance;
430
431            /**
432             * The field below will help us compute the access distance,
433             * that is the number of (coalesced) TLB accesses that
434             * happened in between each access to this page
435             *
436             * localTLBAccesses[x] is the value of localTLBNumAccesses
437             * when the page <Addr> was accessed for the <x>th time
438             */
439            std::vector<unsigned int> localTLBAccesses;
440            unsigned int sumDistance;
441            unsigned int meanDistance;
442        };
443
444        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
445        AccessPatternTable TLBFootprint;
446
447        // Called at the end of simulation to dump page access stats.
448        void exitCallback();
449
450        EventFunctionWrapper exitEvent;
451    };
452}
453
454#endif // __GPU_TLB_HH__
455