/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__

#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/misc.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class Packet;
class ThreadContext;

namespace X86ISA
{
    class GpuTlbEntry : public TlbEntry
    {
      public:
        GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
          : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }

        GpuTlbEntry() : TlbEntry() { }

        bool valid;
    };

    class GpuTLB : public MemObject
    {
      protected:
        friend class Walker;

        typedef std::list<GpuTlbEntry*> EntryList;

        uint32_t configAddress;

        // TLB clock: inherits the clock period from the shader's clock,
        // expressed as a number of ticks of curTick() (aka the global
        // simulation clock). The assignment of the TLB clock from the
        // shader clock is done in the python config files.
        int clock;

      public:
        // clock-related functions; map to and from simulation ticks and
        // object clock cycles
        Tick frequency() const { return SimClock::Frequency / clock; }

        Tick
        ticks(int numCycles) const
        {
            return (Tick)clock * numCycles;
        }

        Tick curCycle() const { return curTick() / clock; }
        Tick tickToCycles(Tick val) const { return val / clock; }
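
        /**
         * Illustrative example (not part of the interface): with a clock
         * period of, say, 500 ticks per TLB cycle,
         *   ticks(4)           == 2000   // 4 TLB cycles expressed in ticks
         *   tickToCycles(2000) == 4      // and back again
         *   curCycle()         == curTick() / 500
         * The 500 here is a made-up value; the real period is set from the
         * shader clock in the python config.
         */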

        typedef X86GPUTLBParams Params;
        GpuTLB(const Params *p);
        ~GpuTLB();

        typedef enum BaseTLB::Mode Mode;

        class Translation
        {
          public:
            virtual ~Translation() { }

            /**
             * Signal that the translation has been delayed due to a hardware
             * page table walk.
             */
            virtual void markDelayed() = 0;

            /**
             * The memory for this object may be dynamically allocated, and it
             * may be responsible for cleaning itself up, which will happen in
             * this function. Once it's called, the object is no longer valid.
             */
            virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
                    Mode mode) = 0;
        };

        void dumpAll();
        GpuTlbEntry *lookup(Addr va, bool update_lru=true);
        void setConfigAddress(uint32_t addr);

      protected:
        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
        Walker *walker;

      public:
        Walker *getWalker();
        void invalidateAll();
        void invalidateNonGlobal();
        void demapPage(Addr va, uint64_t asn);

      protected:
        int size;
        int assoc;
        int numSets;

        /**
         * true if this is a fully-associative TLB
         */
        bool FA;
        Addr setMask;
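
        /**
         * A hedged sketch (for illustration only; the real logic lives in
         * the .cc file) of how these fields typically interact: numSets is
         * size / assoc, setMask is numSets - 1, and the set index of a
         * virtual address would be computed along the lines of
         *   int set = (vaddr >> PageShift) & setMask;
         * A fully-associative TLB (FA == true) has a single set.
         */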

        /**
         * Allocation Policy: true if we always allocate on a hit, false
         * otherwise. Default is true.
         */
        bool allocationPolicy;

        /**
         * if true, then this is not the last-level TLB
         */
        bool hasMemSidePort;

        /**
         * Print out accessDistance stats. One stat file
         * per TLB.
         */
        bool accessDistance;

        GpuTlbEntry *tlb;

        /*
         * The free list is per-set. As long as we have not reached
         * the full capacity of the given set, grab an entry from
         * its freeList.
         */
        std::vector<EntryList> freeList;

        /**
         * An entryList per set is the equivalent of an LRU stack;
         * it's used to guide replacement decisions. The head of the list
         * contains the MRU TLB entry of the given set. If the freeList
         * for this set is empty, the last element of the list
         * is evicted (i.e., dropped on the floor).
         */
        std::vector<EntryList> entryList;
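
        /**
         * A minimal sketch of the replacement scheme described above
         * (assumed, for illustration; see the .cc file for the real code):
         *   GpuTlbEntry *entry;
         *   if (!freeList[set].empty()) {        // set not yet at capacity
         *       entry = freeList[set].front();
         *       freeList[set].pop_front();
         *   } else {                             // evict the LRU entry
         *       entry = entryList[set].back();
         *       entryList[set].pop_back();
         *   }
         *   entryList[set].push_front(entry);    // the new entry becomes MRU
         */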

        Fault translateInt(RequestPtr req, ThreadContext *tc);

        Fault translate(RequestPtr req, ThreadContext *tc,
                Translation *translation, Mode mode, bool &delayedResponse,
                bool timing, int &latency);

      public:
        // latencies for a TLB hit, a TLB miss, and a page fault
        int hitLatency;
        int missLatency1;
        int missLatency2;

        // local stats are as seen from the TLB,
        // without taking coalescing into account
        Stats::Scalar localNumTLBAccesses;
        Stats::Scalar localNumTLBHits;
        Stats::Scalar localNumTLBMisses;
        Stats::Formula localTLBMissRate;

        // global stats are as seen from the
        // CU's perspective, taking all
        // coalesced requests into account
        Stats::Scalar globalNumTLBAccesses;
        Stats::Scalar globalNumTLBHits;
        Stats::Scalar globalNumTLBMisses;
        Stats::Formula globalTLBMissRate;

        // from the CU perspective (global)
        Stats::Scalar accessCycles;
        // from the CU perspective (global)
        Stats::Scalar pageTableCycles;
        Stats::Scalar numUniquePages;
        // from the perspective of this TLB
        Stats::Scalar localCycles;
        // from the perspective of this TLB
        Stats::Formula localLatency;
        // the average reuse distance is computed per page
        // and then averaged over all pages
        Stats::Scalar avgReuseDistance;

        void regStats();
        void updatePageFootprint(Addr virt_page_addr);
        void printAccessPattern();

        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
                              int &latency);

        void translateTiming(RequestPtr req, ThreadContext *tc,
                             Translation *translation, Mode mode,
                             int &latency);

        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);

        GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);

        // Checkpointing
        virtual void serialize(CheckpointOut &cp) const;
        virtual void unserialize(CheckpointIn &cp);
        void issueTranslation();
        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);

        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
                                     PacketPtr pkt);

        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);

        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
                                    GpuTlbEntry *tlb_entry, Mode mode);

        void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
                                 Addr phys_page_addr);

        void issueTLBLookup(PacketPtr pkt);

        // CpuSidePort is the TLB port closer to the CPU/CU side
        class CpuSidePort : public SlavePort
        {
          public:
            CpuSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingReq(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt);
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
            virtual void recvRespRetry() { assert(false); }
            virtual AddrRangeList getAddrRanges() const;
        };

        /**
         * MemSidePort is the TLB port closer to the memory side.
         * If this is a last-level TLB, then this port will not be connected.
         *
         * Future action item: if we ever do real page walks, then this port
         * should be connected to a RubyPort.
         */
        class MemSidePort : public MasterPort
        {
          public:
            MemSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

            std::deque<PacketPtr> retries;

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingResp(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt) { }
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
        };

        // TLB ports on the CPU side
        std::vector<CpuSidePort*> cpuSidePort;
        // TLB ports on the memory side
        std::vector<MemSidePort*> memSidePort;

        BaseMasterPort &getMasterPort(const std::string &if_name,
                                      PortID idx=InvalidPortID);

        BaseSlavePort &getSlavePort(const std::string &if_name,
                                    PortID idx=InvalidPortID);

        /**
         * TLB TranslationState: this is currently somewhat of a
         * bastardization of the usage of SenderState. Usually, the receiver
         * of a packet is not supposed to look at the contents of the
         * senderState: you're only supposed to look at what you pushed on,
         * pop it off, and send it back.
         *
         * However, since there is state that we want to pass to the TLBs
         * using the send/recv Timing/Functional/etc. APIs, which don't allow
         * for new arguments, we need a common TLB senderState to pass
         * between TLBs, both "forwards" and "backwards."
         *
         * So, basically, the rule is that any packet received by a TLB port
         * (cpuside OR memside) must be safely castable to a TranslationState.
         */

        struct TranslationState : public Packet::SenderState
        {
            // TLB mode, read or write
            Mode tlbMode;
            // thread context associated with this req
            ThreadContext *tc;

            /*
             * TLB entry to be populated and passed back; it is used to fill
             * in previous (higher-level) TLBs. Equivalent to the data cache
             * concept of "data return."
             */
            GpuTlbEntry *tlbEntry;
            // is this a TLB prefetch request?
            bool prefetch;
            // when was the req for this translation issued?
            uint64_t issueTime;
            // remember where this came from
            std::vector<SlavePort*> ports;

            // keep track of #uncoalesced reqs per packet per TLB level;
            // reqCnt per level >= reqCnt of the higher level
            std::vector<int> reqCnt;
            // TLB level this packet hit in; 0 if it hit in the page table
            int hitLevel;
            Packet::SenderState *saved;

            TranslationState(Mode tlb_mode, ThreadContext *_tc,
                             bool _prefetch=false,
                             Packet::SenderState *_saved=nullptr)
                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
                  prefetch(_prefetch), issueTime(0),
                  hitLevel(0), saved(_saved) { }
        };
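
        /**
         * Minimal sketch of the rule stated above (illustrative only): a
         * TLB port that receives a packet recovers the translation state
         * with a checked cast, along the lines of
         *   TranslationState *state =
         *       safe_cast<TranslationState*>(pkt->senderState);
         * safe_cast asserts if the senderState is not actually a
         * TranslationState.
         */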

        // maximum number of permitted coalesced requests per cycle
        int maxCoalescedReqs;

        // current number of outstanding coalesced requests;
        // should be <= maxCoalescedReqs
        int outstandingReqs;

        /**
         * A TLBEvent is scheduled after the TLB lookup and helps us take the
         * appropriate actions (e.g., update the TLB on a hit, send the
         * request to the lower-level TLB on a miss, or start a page walk if
         * this was the last-level TLB).
         */
        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                               PacketPtr pkt);

        class TLBEvent : public Event
        {
            private:
                GpuTLB *tlb;
                Addr virtPageAddr;
                /**
                 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
                 */
                tlbOutcome outcome;
                PacketPtr pkt;

            public:
                TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
                        PacketPtr _pkt);

                void process();
                const char *description() const;

                // updateOutcome updates the tlbOutcome of a TLBEvent
                void updateOutcome(tlbOutcome _outcome);
                Addr getTLBEventVaddr();
        };

        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
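
        /**
         * A hedged sketch of how this map would typically be used (the real
         * bookkeeping is in the .cc file): when a lookup for a virtual page
         * completes, a TLBEvent is created and recorded here, keyed by the
         * page address, so it can be found and updated later:
         *   TLBEvent *event = new TLBEvent(this, virt_page_addr, outcome, pkt);
         *   translationReturnEvent[virt_page_addr] = event;
         *   schedule(event, curTick() + ticks(hitLatency));
         * The use of hitLatency for the delay is an assumption made for
         * this example.
         */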

        // this FIFO queue keeps track of the virtual page addresses
        // that are pending cleanup
        std::queue<Addr> cleanupQueue;

        // the cleanupEvent is scheduled after a TLBEvent triggers in order to
        // free memory and do the required clean-up
        void cleanup();

        EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;

        /**
         * This hash map uses the virtual page address as a key and keeps
         * track of the total number of accesses per page.
         */
        struct AccessInfo
        {
            unsigned int lastTimeAccessed; // last access to this page
            unsigned int accessesPerPage;
            // need to divide it by accessesPerPage at the end
            unsigned int totalReuseDistance;

            /**
             * The field below helps us compute the access distance, that is,
             * the number of (coalesced) TLB accesses that happened in
             * between each access to this page.
             *
             * localTLBAccesses[x] is the value of localNumTLBAccesses
             * when the page <Addr> was accessed for the <x>th time.
             */
            std::vector<unsigned int> localTLBAccesses;
            unsigned int sumDistance;
            unsigned int meanDistance;
        };

        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
        AccessPatternTable TLBFootprint;
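
        /**
         * Worked example of the bookkeeping above (made-up numbers, for
         * clarity): if a page is touched when localNumTLBAccesses reads 10,
         * 14, and 22, then localTLBAccesses == {10, 14, 22}, the two access
         * distances are 4 and 8, sumDistance == 12, and
         * meanDistance == 12 / 2 == 6.
         */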

        // Called at the end of simulation to dump page access stats.
        void exitCallback();

        EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
    };
}

#endif // __GPU_TLB_HH__