1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: Lisa Hsu
34 */
35
36#ifndef __GPU_TLB_HH__
37#define __GPU_TLB_HH__
38
#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/clocked_object.hh"
#include "sim/sim_object.hh"
58
59class BaseTLB;
60class Packet;
61class ThreadContext;
62
63namespace X86ISA
64{
65    class GpuTLB : public ClockedObject
66    {
67      protected:
68        friend class Walker;
69
70        typedef std::list<TlbEntry*> EntryList;
71
72        uint32_t configAddress;
73
74        // TLB clock: will inherit clock from shader's clock period in terms
75        // of nuber of ticks of curTime (aka global simulation clock)
76        // The assignment of TLB clock from shader clock is done in the python
77        // config files.
78        int clock;
79
80      public:
81        // clock related functions ; maps to-and-from Simulation ticks and
82        // object clocks.
83        Tick frequency() const { return SimClock::Frequency / clock; }
84
85        Tick
86        ticks(int numCycles) const
87        {
88            return (Tick)clock * numCycles;
89        }
90
91        Tick curCycle() const { return curTick() / clock; }
92        Tick tickToCycles(Tick val) const { return val / clock;}
93
94        typedef X86GPUTLBParams Params;
95        GpuTLB(const Params *p);
96        ~GpuTLB();
97
98        typedef enum BaseTLB::Mode Mode;
99
100        class Translation
101        {
102          public:
103            virtual ~Translation() { }
104
105            /**
106             * Signal that the translation has been delayed due to a hw page
107             * table walk.
108             */
109            virtual void markDelayed() = 0;
110
111            /**
112             * The memory for this object may be dynamically allocated, and it
113             * may be responsible for cleaning itslef up which will happen in
114             * this function. Once it's called the object is no longer valid.
115             */
116            virtual void finish(Fault fault, const RequestPtr &req,
117                                ThreadContext *tc, Mode mode) = 0;
118        };
119
120        void dumpAll();
121        TlbEntry *lookup(Addr va, bool update_lru=true);
122        void setConfigAddress(uint32_t addr);
123
124      protected:
125        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
126        Walker *walker;
127
128      public:
129        Walker *getWalker();
130        void invalidateAll();
131        void invalidateNonGlobal();
132        void demapPage(Addr va, uint64_t asn);
133
134      protected:
135        int size;
136        int assoc;
137        int numSets;
138
139        /**
140         *  true if this is a fully-associative TLB
141         */
142        bool FA;
143        Addr setMask;
144
145        /**
146         * Allocation Policy: true if we always allocate on a hit, false
147         * otherwise. Default is true.
148         */
149        bool allocationPolicy;
150
151        /**
152         * if true, then this is not the last level TLB
153         */
154        bool hasMemSidePort;
155
156        /**
157         * Print out accessDistance stats. One stat file
158         * per TLB.
159         */
160        bool accessDistance;
161
162        std::vector<TlbEntry> tlb;
163
164        /*
165         * It's a per-set list. As long as we have not reached
166         * the full capacity of the given set, grab an entry from
167         * the freeList.
168         */
169        std::vector<EntryList> freeList;
170
171        /**
172         * An entryList per set is the equivalent of an LRU stack;
173         * it's used to guide replacement decisions. The head of the list
174         * contains the MRU TLB entry of the given set. If the freeList
175         * for this set is empty, the last element of the list
176         * is evicted (i.e., dropped on the floor).
177         */
178        std::vector<EntryList> entryList;
179
180        Fault translateInt(const RequestPtr &req, ThreadContext *tc);
181
182        Fault translate(const RequestPtr &req, ThreadContext *tc,
183                Translation *translation, Mode mode, bool &delayedResponse,
184                bool timing, int &latency);
185
186      public:
187        // latencies for a TLB hit, miss and page fault
188        int hitLatency;
189        int missLatency1;
190        int missLatency2;
191
192        // local_stats are as seen from the TLB
193        // without taking into account coalescing
194        Stats::Scalar localNumTLBAccesses;
195        Stats::Scalar localNumTLBHits;
196        Stats::Scalar localNumTLBMisses;
197        Stats::Formula localTLBMissRate;
198
199        // global_stats are as seen from the
200        // CU's perspective taking into account
201        // all coalesced requests.
202        Stats::Scalar globalNumTLBAccesses;
203        Stats::Scalar globalNumTLBHits;
204        Stats::Scalar globalNumTLBMisses;
205        Stats::Formula globalTLBMissRate;
206
207        // from the CU perspective (global)
208        Stats::Scalar accessCycles;
209        // from the CU perspective (global)
210        Stats::Scalar pageTableCycles;
211        Stats::Scalar numUniquePages;
212        // from the perspective of this TLB
213        Stats::Scalar localCycles;
214        // from the perspective of this TLB
215        Stats::Formula localLatency;
216        // I take the avg. per page and then
217        // the avg. over all pages.
218        Stats::Scalar avgReuseDistance;
219
220        void regStats() override;
221        void updatePageFootprint(Addr virt_page_addr);
222        void printAccessPattern();
223
224
225        Fault translateAtomic(const RequestPtr &req, ThreadContext *tc,
226                              Mode mode, int &latency);
227
228        void translateTiming(const RequestPtr &req, ThreadContext *tc,
229                             Translation *translation, Mode mode,
230                             int &latency);
231
232        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
233        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
234
235        TlbEntry *insert(Addr vpn, TlbEntry &entry);
236
237        // Checkpointing
238        virtual void serialize(CheckpointOut& cp) const override;
239        virtual void unserialize(CheckpointIn& cp) override;
240        void issueTranslation();
241        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
242        bool tlbLookup(const RequestPtr &req,
243                       ThreadContext *tc, bool update_stats);
244
245        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
246                                     PacketPtr pkt);
247
248        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
249
250        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
251                                    TlbEntry *tlb_entry, Mode mode);
252
253        void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry,
254                                 Addr phys_page_addr);
255
256        void issueTLBLookup(PacketPtr pkt);
257
258        // CpuSidePort is the TLB Port closer to the CPU/CU side
259        class CpuSidePort : public SlavePort
260        {
261          public:
262            CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
263                        PortID _index)
264                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
265
266          protected:
267            GpuTLB *tlb;
268            int index;
269
270            virtual bool recvTimingReq(PacketPtr pkt);
271            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
272            virtual void recvFunctional(PacketPtr pkt);
273            virtual void recvRangeChange() { }
274            virtual void recvReqRetry();
275            virtual void recvRespRetry() { panic("recvRespRetry called"); }
276            virtual AddrRangeList getAddrRanges() const;
277        };
278
279        /**
280         * MemSidePort is the TLB Port closer to the memory side
281         * If this is a last level TLB then this port will not be connected.
282         *
283         * Future action item: if we ever do real page walks, then this port
284         * should be connected to a RubyPort.
285         */
286        class MemSidePort : public MasterPort
287        {
288          public:
289            MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
290                        PortID _index)
291                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
292
293            std::deque<PacketPtr> retries;
294
295          protected:
296            GpuTLB *tlb;
297            int index;
298
299            virtual bool recvTimingResp(PacketPtr pkt);
300            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
301            virtual void recvFunctional(PacketPtr pkt) { }
302            virtual void recvRangeChange() { }
303            virtual void recvReqRetry();
304        };
305
306        // TLB ports on the cpu Side
307        std::vector<CpuSidePort*> cpuSidePort;
308        // TLB ports on the memory side
309        std::vector<MemSidePort*> memSidePort;
310
311        Port &getPort(const std::string &if_name,
312                      PortID idx=InvalidPortID) override;
313
314        /**
315         * TLB TranslationState: this currently is a somewhat bastardization of
316         * the usage of SenderState, whereby the receiver of a packet is not
317         * usually supposed to need to look at the contents of the senderState,
318         * you're really only supposed to look at what you pushed on, pop it
319         * off, and send it back.
320         *
321         * However, since there is state that we want to pass to the TLBs using
322         * the send/recv Timing/Functional/etc. APIs, which don't allow for new
323         * arguments, we need a common TLB senderState to pass between TLBs,
324         * both "forwards" and "backwards."
325         *
326         * So, basically, the rule is that any packet received by a TLB port
327         * (cpuside OR memside) must be safely castable to a TranslationState.
328         */
329
330        struct TranslationState : public Packet::SenderState
331        {
332            // TLB mode, read or write
333            Mode tlbMode;
334            // Thread context associated with this req
335            ThreadContext *tc;
336
337            /*
338            * TLB entry to be populated and passed back and filled in
339            * previous TLBs.  Equivalent to the data cache concept of
340            * "data return."
341            */
342            TlbEntry *tlbEntry;
343            // Is this a TLB prefetch request?
344            bool prefetch;
345            // When was the req for this translation issued
346            uint64_t issueTime;
347            // Remember where this came from
348            std::vector<SlavePort*>ports;
349
350            // keep track of #uncoalesced reqs per packet per TLB level;
351            // reqCnt per level >= reqCnt higher level
352            std::vector<int> reqCnt;
353            // TLB level this packet hit in; 0 if it hit in the page table
354            int hitLevel;
355            Packet::SenderState *saved;
356
357            TranslationState(Mode tlb_mode, ThreadContext *_tc,
358                             bool _prefetch=false,
359                             Packet::SenderState *_saved=nullptr)
360                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
361                  prefetch(_prefetch), issueTime(0),
362                  hitLevel(0),saved(_saved) { }
363        };
364
365        // maximum number of permitted coalesced requests per cycle
366        int maxCoalescedReqs;
367
368        // Current number of outstandings coalesced requests.
369        // Should be <= maxCoalescedReqs
370        int outstandingReqs;
371
372        /**
373         * A TLBEvent is scheduled after the TLB lookup and helps us take the
374         * appropriate actions:
375         *  (e.g., update TLB on a hit,
376         *  send request to lower level TLB on a miss,
377         *  or start a page walk if this was the last-level TLB).
378         */
379        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
380                               PacketPtr pkt);
381
382        class TLBEvent : public Event
383        {
384            private:
385                GpuTLB *tlb;
386                Addr virtPageAddr;
387                /**
388                 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
389                 */
390                tlbOutcome outcome;
391                PacketPtr pkt;
392
393            public:
394                TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
395                        PacketPtr _pkt);
396
397                void process();
398                const char *description() const;
399
400                // updateOutcome updates the tlbOutcome of a TLBEvent
401                void updateOutcome(tlbOutcome _outcome);
402                Addr getTLBEventVaddr();
403        };
404
405        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
406
407        // this FIFO queue keeps track of the virt. page addresses
408        // that are pending cleanup
409        std::queue<Addr> cleanupQueue;
410
411        // the cleanupEvent is scheduled after a TLBEvent triggers in order to
412        // free memory and do the required clean-up
413        void cleanup();
414
415        EventFunctionWrapper cleanupEvent;
416
417        /**
418         * This hash map will use the virtual page address as a key
419         * and will keep track of total number of accesses per page
420         */
421
422        struct AccessInfo
423        {
424            unsigned int lastTimeAccessed; // last access to this page
425            unsigned int accessesPerPage;
426            // need to divide it by accessesPerPage at the end
427            unsigned int totalReuseDistance;
428
429            /**
430             * The field below will help us compute the access distance,
431             * that is the number of (coalesced) TLB accesses that
432             * happened in between each access to this page
433             *
434             * localTLBAccesses[x] is the value of localTLBNumAccesses
435             * when the page <Addr> was accessed for the <x>th time
436             */
437            std::vector<unsigned int> localTLBAccesses;
438            unsigned int sumDistance;
439            unsigned int meanDistance;
440        };
441
442        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
443        AccessPatternTable TLBFootprint;
444
445        // Called at the end of simulation to dump page access stats.
446        void exitCallback();
447
448        EventFunctionWrapper exitEvent;
449    };
450}
451
452#endif // __GPU_TLB_HH__
453