1/*
2 * Copyright (c) 2013 - 2016 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder.  You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 *
37 * Authors: Radhika Jagtap
38 *          Andreas Hansson
39 *          Thomas Grass
40 */
41
42#ifndef __CPU_TRACE_TRACE_CPU_HH__
43#define __CPU_TRACE_TRACE_CPU_HH__
44
45#include <array>
46#include <cstdint>
47#include <queue>
48#include <set>
49#include <unordered_map>
50
51#include "arch/registers.hh"
52#include "base/statistics.hh"
53#include "cpu/base.hh"
54#include "debug/TraceCPUData.hh"
55#include "debug/TraceCPUInst.hh"
56#include "params/TraceCPU.hh"
57#include "proto/inst_dep_record.pb.h"
58#include "proto/packet.pb.h"
59#include "proto/protoio.hh"
60#include "sim/sim_events.hh"
61
62/**
63 * The trace cpu replays traces generated using the elastic trace probe
64 * attached to the O3 CPU model. The elastic trace is an execution trace with
65 * register data dependencies and ordering dependencies annotated to it. The
66 * trace cpu also replays a fixed timestamp fetch trace that is also generated
67 * by the elastic trace probe. This trace cpu model aims at achieving faster
68 * simulation compared to the detailed cpu model and good correlation when the
69 * same trace is used for playback on different memory sub-systems.
70 *
71 * The TraceCPU inherits from BaseCPU so some virtual methods need to be
72 * defined. It has two port subclasses inherited from MasterPort for
73 * instruction and data ports. It issues the memory requests deducing the
74 * timing from the trace and without performing real execution of micro-ops. As
75 * soon as the last dependency for an instruction is complete, its
76 * computational delay, also provided in the input trace is added. The
77 * dependency-free nodes are maintained in a list, called 'ReadyList', ordered
78 * by ready time. Instructions which depend on a load stall until the responses
79 * for read requests are received, thus achieving elastic replay. If the
80 * dependency is not found when adding a new node, it is assumed complete.
81 * Thus, if this node is found to be completely dependency-free its issue time
82 * is calculated and it is added to the ready list immediately. This is
83 * encapsulated in the subclass ElasticDataGen.
84 *
85 * If ready nodes are issued in an unconstrained way there can be more nodes
86 * outstanding which results in divergence in timing compared to the O3CPU.
87 * Therefore, the Trace CPU also models hardware resources. A sub-class to
88 * model hardware resources contains the maximum sizes of load buffer, store
89 * buffer and ROB. If resources are not available, the node is not issued. Such
90 * nodes that are pending issue are held in the 'depFreeQueue' structure.
91 *
92 * Modeling the ROB size in the Trace CPU as a resource limitation is arguably
93 * the most important parameter of all resources. The ROB occupancy is
94 * estimated using the newly added field 'robNum'. We need to use ROB number as
95 * sequence number is at times much higher due to squashing and trace replay is
96 * focused on correct path modeling.
97 *
98 * A map called 'inFlightNodes' is added to track nodes that are not only in
99 * the readyList but also load nodes that are executed (and thus removed from
100 * readyList) but are not complete. ReadyList handles what and when to execute
101 * next node while the inFlightNodes is used for resource modelling. The oldest
102 * ROB number is updated when any node occupies the ROB or when an entry in the
103 * ROB is released. The ROB occupancy is equal to the difference in the ROB
104 * number of the newly dependency-free node and the oldest ROB number in
105 * flight.
106 *
107 * If no node depends on a non load/store node then there is no reason to
108 * track it in the dependency graph. We filter out such nodes but count them
109 * and add a weight field to the subsequent node that we do include in the
110 * trace. The weight field is used to model ROB occupancy during replay.
111 *
112 * The depFreeQueue is chosen to be FIFO so that child nodes which are in
113 * program order get pushed into it in that order and thus issued in program
114 * order, like in the O3CPU. This is also why the dependents is made a
115 * sequential container, std::set to std::vector. We only check head of the
116 * depFreeQueue as nodes are issued in order and blocking on head models that
117 * better than looping the entire queue. An alternative choice would be to
118 * inspect top N pending nodes where N is the issue-width. This is left for
119 * future as the timing correlation looks good as it is.
120 *
121 * At the start of an execution event, first we attempt to issue such pending
122 * nodes by checking if appropriate resources have become available. If yes, we
123 * compute the execute tick with respect to the time then. Then we proceed to
124 * complete nodes from the readyList.
125 *
126 * When a read response is received, sometimes a dependency on it that was
127 * supposed to be released when it was issued is still not released. This
128 * occurs because the dependent gets added to the graph after the read was
129 * sent. So the check is made less strict and the dependency is marked complete
130 * on read response instead of insisting that it should have been removed on
131 * read sent.
132 *
133 * There is a check for requests spanning two cache lines as this condition
134 * triggers an assert fail in the L1 cache. If a request does span two lines,
135 * its size is truncated to access only until the end of the first line and
136 * the remainder is ignored. Strictly-ordered requests are skipped and the
137 * dependencies on such requests are simply marked complete immediately.
138 *
139 * A CountedExitEvent that contains a static int belonging to the Trace CPU
140 * class as a down counter is used to implement multi Trace CPU simulation
141 * exit.
142 */
143
144class TraceCPU : public BaseCPU
145{
146
147  public:
148    TraceCPU(TraceCPUParams *params); /**< Construct from the parameter struct. */
149    ~TraceCPU(); /**< Destructor. */
150
    /**
     * Initialisation entry point. The generator documentation in this class
     * states that FixedRetryGen::init() and ElasticDataGen::init() are called
     * from here.
     */
151    void init();
152
153    /**
154     * This is a pure virtual function in BaseCPU. The trace only tells us how
155     * many micro-ops there are, not how many instructions they make up, so
156     * this stat cannot be counted.
157     *
158     * @return always 0
159     */
160    Counter totalInsts() const
161    {
162        return 0;
163    }
164
165    /**
166     * Return totalOps as the number of committed micro-ops plus the
167     * speculatively issued loads that are modelled in the TraceCPU replay.
     * The result is whatever numOps currently holds.
168     *
169     * @return number of micro-ops i.e. nodes in the elastic data generator
170     */
171    Counter totalOps() const
172    {
173        return numOps.value();
174    }
175
176    /**
177     * Set the no. of ops when elastic data generator completes executing a
178     * node.
     *
     * @param rob_num presumably the ROB number of the completed node --
     *        TODO confirm against the definition
179     */
180    void updateNumOps(uint64_t rob_num);
181
182    /**
     * Pure virtual function in BaseCPU. Do nothing.
     *
     * @param tid Thread id; ignored
     */
183    void wakeup(ThreadID tid = 0)
184    {
185        return;
186    }
187
188    /**
189     * When resuming from checkpoint in FS mode, the TraceCPU takes over from
190     * the old cpu. This function overrides the takeOverFrom() function in the
191     * BaseCPU. It unbinds the ports of the old CPU and binds the ports of the
192     * TraceCPU.
     *
     * @param oldCPU Pointer to the CPU being replaced
193     */
194    void takeOverFrom(BaseCPU *oldCPU);
195
196    /**
197     * When instruction cache port receives a retry, schedule event
198     * icacheNextEvent.
     *
     * NOTE(review): presumably invoked from IcachePort::recvReqRetry(); the
     * definition is not visible here -- confirm.
199     */
200    void icacheRetryRecvd();
201
202    /**
203     * When data cache port receives a retry, schedule event
204     * dcacheNextEvent.
     *
     * NOTE(review): presumably invoked from DcachePort::recvReqRetry(); the
     * definition is not visible here -- confirm.
205     */
206    void dcacheRetryRecvd();
207
208    /**
209     * Called when the data cache port receives a response. Hands the packet
210     * to the dcache generator so that it can complete the load writeback.
211     *
212     * @param pkt Pointer to packet received
213     */
214    void dcacheRecvTimingResp(PacketPtr pkt);
215
216    /**
217     * Schedule event dcacheNextEvent at the given tick.
     *
     * NOTE(review): presumably public so that the data generator can request
     * its own next execution -- confirm against the callers.
218     *
219     * @param when Tick at which to schedule event
220     */
221    void schedDcacheNextEvent(Tick when);
222
223  protected:
224
225    /**
226     * IcachePort class that interfaces with L1 Instruction Cache.
227     */
228    class IcachePort : public MasterPort
229    {
230      public:
231        /**
         * Construct the port, naming it "<cpu name>.icache_port", and
         * remember the owning TraceCPU.
         *
         * @param _cpu Owning TraceCPU; dereferenced here, so must not be null
         */
232        IcachePort(TraceCPU* _cpu)
233            : MasterPort(_cpu->name() + ".icache_port", _cpu),
234                         owner(_cpu)
235        { }
236
237      public:
238        /**
239         * Receive the timing response and simply delete the packet since
240         * instruction fetch requests are issued as per the timing in the trace
241         * and responses are ignored.
242         *
243         * @param pkt Pointer to packet received
244         * @return true
245         */
246        bool recvTimingResp(PacketPtr pkt);
247
248        /**
249         * Required functionally but do nothing.
250         *
251         * @param pkt Pointer to packet received
252         */
253        void recvTimingSnoopReq(PacketPtr pkt) { }
254
255        /**
256         * Handle a retry signalled by the cache if instruction read failed in
257         * the first attempt.
258         */
259        void recvReqRetry();
260
261      private:
        /** The TraceCPU that owns this port. */
262        TraceCPU* owner;
263    };
264
265    /**
266     * DcachePort class that interfaces with L1 Data Cache.
267     */
268    class DcachePort : public MasterPort
269    {
270
271      public:
272        /**
         * Construct the port, naming it "<cpu name>.dcache_port", and
         * remember the owning TraceCPU.
         *
         * @param _cpu Owning TraceCPU; dereferenced here, so must not be null
         */
273        DcachePort(TraceCPU* _cpu)
274            : MasterPort(_cpu->name() + ".dcache_port", _cpu),
275                         owner(_cpu)
276        { }
277
278      public:
279
280        /**
281         * Receive the timing response and call dcacheRecvTimingResp() method
282         * of the dcacheGen to handle completing the load.
283         *
284         * @param pkt Pointer to packet received
285         * @return true
286         */
287        bool recvTimingResp(PacketPtr pkt);
288
289        /**
290         * Required functionally but do nothing.
291         *
292         * @param pkt Pointer to packet received
293         */
294        void recvTimingSnoopReq(PacketPtr pkt)
295        { }
296
297        /**
298         * Required functionally but do nothing.
299         *
300         * @param pkt Pointer to packet received
301         */
302        void recvFunctionalSnoop(PacketPtr pkt)
303        { }
304
305        /**
306         * Handle a retry signalled by the cache if data access failed in the
307         * first attempt.
308         */
309        void recvReqRetry();
310
311        /**
312         * Required functionally.
313         *
314         * @return true since we have to snoop
315         */
316        bool isSnooping() const { return true; }
317
318      private:
        /** The TraceCPU that owns this port. */
319        TraceCPU* owner;
320    };
321
322    /** Port to connect to L1 instruction cache. */
323    IcachePort icachePort;
324
325    /** Port to connect to L1 data cache. */
326    DcachePort dcachePort;
327
328    /** Master id for instruction read requests. */
329    const MasterID instMasterID;
330
331    /** Master id for data read and write requests. */
332    const MasterID dataMasterID;
333
334    /**
     * File names for input instruction and data traces.
     *
     * NOTE(review): std::string is used here but <string> is not among the
     * standard headers this file includes directly; it presumably arrives
     * transitively -- consider including it explicitly.
     */
335    std::string instTraceFile, dataTraceFile;
336
337    /**
338     * Generator to read protobuf trace containing memory requests at fixed
339     * timestamps, perform flow control and issue memory requests. If L1 cache
340     * port sends packet successfully, determine the tick to send the next
341     * packet else wait for retry from cache.
342     */
343    class FixedRetryGen
344    {
345
346      private:
347
348        /**
349         * This struct stores a line in the trace file.
350         */
351        struct TraceElement {
352
353            /** Specifies if the request is to be a read or a write */
354            MemCmd cmd;
355
356            /** The address for the request */
357            Addr addr;
358
359            /** The size of the access for the request */
360            Addr blocksize;
361
362            /** The time at which the request should be sent */
363            Tick tick;
364
365            /** Potential request flags to use */
366            Request::FlagsType flags;
367
368            /** Instruction PC */
369            Addr pc;
370
371            /**
372             * Check validity of this element. An element is valid as long
             * as its command is not MemCmd::InvalidCmd.
373             *
374             * @return if this element is valid
375             */
376            bool isValid() const {
377                return cmd != MemCmd::InvalidCmd;
378            }
379
380            /**
381             * Make this element invalid by setting its command to
             * MemCmd::InvalidCmd. The other fields are left untouched.
382             */
383            void clear() {
384                cmd = MemCmd::InvalidCmd;
385            }
386        };
387
388        /**
389         * The InputStream encapsulates a trace file and the
390         * internal buffers and populates TraceElements based on
391         * the input.
392         */
393        class InputStream
394        {
395
396          private:
397
398            // Input file stream for the protobuf trace
399            ProtoInputStream trace;
400
401          public:
402
403            /**
404             * Create a trace input stream for a given file name.
405             *
406             * @param filename Path to the file to read from
407             */
408            InputStream(const std::string& filename);
409
410            /**
411             * Reset the stream such that it can be played once
412             * again.
413             */
414            void reset();
415
416            /**
417             * Attempt to read a trace element from the stream,
418             * and also notify the caller if the end of the file
419             * was reached.
420             *
421             * @param element Trace element to populate
422             * @return True if an element could be read successfully
423             */
424            bool read(TraceElement* element);
425        };
426
427        public:
428        /**
         * Constructor. Stores the owner, port and master id, opens the trace
         * file and builds the generator name as
         * "<owner name>.fixedretry<_name>". The retry packet starts out
         * null, delta at 0 and traceComplete false.
         *
         * NOTE(review): currElement is not initialised here; presumably it
         * is filled by init()/nextExecute() before first use -- confirm.
         *
         * @param _owner TraceCPU that owns this generator
         * @param _name suffix appended to the generator name
         * @param _port port used to issue the memory requests
         * @param master_id master id used for the requests
         * @param trace_file path of the input trace file
         */
429        FixedRetryGen(TraceCPU& _owner, const std::string& _name,
430                   MasterPort& _port, MasterID master_id,
431                   const std::string& trace_file)
432            : owner(_owner),
433              port(_port),
434              masterID(master_id),
435              trace(trace_file),
436              genName(owner.name() + ".fixedretry" + _name),
437              retryPkt(nullptr),
438              delta(0),
439              traceComplete(false)
440        {
441        }
442
443        /**
444         * Called from TraceCPU init(). Reads the first message from the
445         * input trace file and returns the send tick.
446         *
447         * @return Tick when first packet must be sent
448         */
449        Tick init();
450
451        /**
452         * This tries to send current or retry packet and returns true if
453         * successful. It calls nextExecute() to read next message.
454         *
455         * @return bool true if packet is sent successfully
456         */
457        bool tryNext();
458
459        /** Returns name of the FixedRetryGen instance. */
460        const std::string& name() const { return genName; }
461
462        /**
463         * Creates a new request assigning the request parameters passed by the
464         * arguments. Calls the port's sendTimingReq() and returns true if
465         * the packet was sent successfully. It is called by tryNext()
466         *
467         * @param addr address of request
468         * @param size size of request
469         * @param cmd if it is a read or write request
470         * @param flags associated request flags
471         * @param pc instruction PC that generated the request
472         *
473         * @return true if packet was sent successfully
474         */
475        bool send(Addr addr, unsigned size, const MemCmd& cmd,
476              Request::FlagsType flags, Addr pc);
477
478        /** Exit the FixedRetryGen. */
479        void exit();
480
481        /**
482         * Reads a line of the trace file. The tick at which the next
483         * request should be generated is not returned directly (the return
484         * type is bool); presumably it is kept in currElement -- confirm.
485         *
486         * @return bool false if end of file has been reached
487         */
488        bool nextExecute();
489
490        /**
491         * Returns the traceComplete variable which is set when end of the
492         * input trace file is reached. Const-qualified because it only
         * reads state.
493         *
494         * @return bool true if traceComplete is set, false otherwise.
495         */
496        bool isTraceComplete() const { return traceComplete; }
497
        /**
         * Returns the stored difference in send ticks between the current
         * and the last packet. Const-qualified because it only reads state.
         *
         * @return current value of delta
         */
498        int64_t tickDelta() const { return delta; }
499
        /** Register the statistics of this generator. */
500        void regStats();
501
502      private:
503
504        /** Reference of the TraceCPU. */
505        TraceCPU& owner;
506
507        /** Reference of the port to be used to issue memory requests. */
508        MasterPort& port;
509
510        /** MasterID used for the requests being sent. */
511        const MasterID masterID;
512
513        /** Input stream used for reading the input trace file. */
514        InputStream trace;
515
516        /** String to store the name of the FixedRetryGen. */
517        std::string genName;
518
519        /** PacketPtr used to store the packet to retry. */
520        PacketPtr retryPkt;
521
522        /**
523         * Stores the difference in the send ticks of the current and last
524         * packets. Keeping this signed to check overflow to a negative value
525         * which will be caught by assert(delta > 0)
526         */
527        int64_t delta;
528
529        /**
530         * Set to true when end of trace is reached.
531         */
532        bool traceComplete;
533
534        /** Store an element read from the trace to send as the next packet. */
535        TraceElement currElement;
536
537        /** Stats for instruction accesses replayed. */
538        Stats::Scalar numSendAttempted;
539        Stats::Scalar numSendSucceeded;
540        Stats::Scalar numSendFailed;
541        Stats::Scalar numRetrySucceeded;
542        /** Last simulated tick by the FixedRetryGen */
543        Stats::Scalar instLastTick;
544
545    };
546
547    /**
548     * The elastic data memory request generator to read protobuf trace
549     * containing execution trace annotated with data and ordering
550     * dependencies. It deduces the time at which to send a load/store request
551     * by tracking the dependencies. It attempts to send a memory request for a
552     * load/store without performing real execution of micro-ops. If L1 cache
553     * port sends packet successfully, the generator checks which instructions
554     * became dependency free as a result of this and schedules an event
555     * accordingly. If it fails to send the packet, it waits for a retry from
556     * the cache.
557     */
558    class ElasticDataGen
559    {
560
561      private:
562
563        /** Node sequence number type. */
564        typedef uint64_t NodeSeqNum;
565
566        /** Node ROB number type. */
567        typedef uint64_t NodeRobNum;
568
        /** Record type enumeration taken from the protobuf InstDepRecord. */
569        typedef ProtoMessage::InstDepRecord::RecordType RecordType;
        /** Short alias for the protobuf InstDepRecord message type. */
570        typedef ProtoMessage::InstDepRecord Record;
571
572        /**
573         * The class GraphNode stores an instruction in the trace file. The
574         * format of the trace file favours constructing a dependency graph of
575         * the execution and this class is used to encapsulate the request
576         * data as well as pointers to its dependent GraphNodes.
577         */
578        class GraphNode {
579
580          public:
581            /**
582             * The maximum no. of ROB dependencies. There can be at most 2
583             * order dependencies which could exist for a store. For a load
584             * and comp node there can be at most one order dependency.
585             */
586            static const uint8_t maxRobDep = 2;
587
588            /** Typedef for the array containing the ROB dependencies */
589            typedef std::array<NodeSeqNum, maxRobDep> RobDepArray;
590
591            /** Typedef for the array containing the register dependencies */
592            typedef std::array<NodeSeqNum, TheISA::MaxInstSrcRegs> RegDepArray;
593
594            /** Instruction sequence number */
595            NodeSeqNum seqNum;
596
597            /** ROB occupancy number */
598            NodeRobNum robNum;
599
600            /** Type of the node corresponding to the instruction modelled by it */
601            RecordType type;
602
603            /** The address for the request if any */
604            Addr physAddr;
605
606            /** The virtual address for the request if any */
607            Addr virtAddr;
608
609            /** The address space id which is set if the virtual address is set */
610            uint32_t asid;
611
612            /** Size of request if any */
613            uint32_t size;
614
615            /** Request flags if any */
616            Request::Flags flags;
617
618            /** Instruction PC */
619            Addr pc;
620
621            /** Array of order dependencies. */
622            RobDepArray robDep;
623
624            /** Number of order dependencies */
625            uint8_t numRobDep;
626
627            /** Computational delay */
628            uint64_t compDelay;
629
630            /**
631             * Array of register dependencies (incoming) if any. Maximum number
632             * of source registers used to set maximum size of the array
633             */
634            RegDepArray regDep;
635
636            /** Number of register dependencies */
637            uint8_t numRegDep;
638
639            /**
640             * A vector of nodes dependent (outgoing) on this node. A
641             * sequential container is chosen because when dependents become
642             * free, they attempt to issue in program order.
             *
             * NOTE(review): std::vector is used here but <vector> is not
             * among the standard headers this file includes directly --
             * consider including it explicitly.
643             */
644            std::vector<GraphNode *> dependents;
645
646            /** Is the node a load */
647            bool isLoad() const { return (type == Record::LOAD); }
648
649            /** Is the node a store */
650            bool isStore() const { return (type == Record::STORE); }
651
652            /** Is the node a compute (non load/store) node */
653            bool isComp() const { return (type == Record::COMP); }
654
655            /** Initialize register dependency array to all zeroes */
656            void clearRegDep();
657
658            /**
             * Initialize ROB (order) dependency array; presumably to all
             * zeroes like clearRegDep() -- confirm against the definition
             */
659            void clearRobDep();
660
661            /**
             * Remove completed instruction from register dependency array
             *
             * @param reg_dep sequence number of the completed instruction
             * @return presumably true if the dependency was found and
             *         removed -- TODO confirm against the definition
             */
662            bool removeRegDep(NodeSeqNum reg_dep);
663
664            /**
             * Remove completed instruction from order dependency array
             *
             * @param rob_dep sequence number of the completed instruction
             * @return presumably true if the dependency was found and
             *         removed -- TODO confirm against the definition
             */
665            bool removeRobDep(NodeSeqNum rob_dep);
666
667            /**
             * Check for all dependencies on completed inst
             *
             * @param done_seq_num sequence number of the completed instruction
             * @return bool; meaning not visible here -- TODO confirm against
             *         the definition
             */
668            bool removeDepOnInst(NodeSeqNum done_seq_num);
669
670            /** Return true if node has a request which is strictly ordered */
671            bool isStrictlyOrdered() const {
672                return (flags.isSet(Request::STRICT_ORDER));
673            }
674            /**
675             * Write out element in trace-compatible format using debug flag
676             * TraceCPUData.
677             */
678            void writeElementAsTrace() const;
679
680            /** Return string specifying the type of the node */
681            std::string typeToStr() const;
682        };
683
684        /**
         * Struct to store a ready-to-execute node and its execution tick.
         * Only the sequence number is kept, not a pointer to the node.
         */
685        struct ReadyNode
686        {
687            /** The sequence number of the ready node */
688            NodeSeqNum seqNum;
689
690            /** The tick at which the ready node must be executed */
691            Tick execTick;
692        };
693
694        /**
695         * The HardwareResource class models structures that hold the in-flight
696         * nodes. When a node becomes dependency free, first check if resources
697         * are available to issue it.
698         */
699        class HardwareResource
700        {
701          public:
702            /**
703             * Constructor that initializes the sizes of the structures.
704             *
705             * @param max_rob size of the Reorder Buffer
706             * @param max_stores size of Store Buffer
707             * @param max_loads size of Load Buffer
708             */
709            HardwareResource(uint16_t max_rob, uint16_t max_stores,
710                                uint16_t max_loads);
711
712            /**
713             * Occupy appropriate structures for an issued node.
714             *
715             * @param new_node pointer to the issued node
716             */
717            void occupy(const GraphNode* new_node);
718
719            /**
720             * Release appropriate structures for a completed node.
721             *
722             * @param done_node pointer to the completed node
723             */
724            void release(const GraphNode* done_node);
725
726            /** Release store buffer entry for a completed store */
727            void releaseStoreBuffer();
728
729            /**
730             * Check if structures required to issue a node are free.
731             *
732             * @param new_node pointer to the node ready to issue
733             * @return true if resources are available
734             */
735            bool isAvailable(const GraphNode* new_node) const;
736
737            /**
738             * Check if there are any outstanding requests, i.e. requests for
739             * which we are yet to receive a response.
740             *
741             * @return true if there is at least one read or write request
742             *      outstanding
743             */
744            bool awaitingResponse() const;
745
746            /**
             * Print resource occupancy for debugging
             *
             * NOTE(review): not const-qualified although the name suggests
             * a read-only operation; the definition is not visible here.
             */
747            void printOccupancy();
748
749          private:
750            /**
751             * The size of the ROB used to throttle the max. number of in-flight
752             * nodes.
753             */
754            const uint16_t sizeROB;
755
756            /**
757             * The size of store buffer. This is used to throttle the max. number
758             * of in-flight stores.
759             */
760            const uint16_t sizeStoreBuffer;
761
762            /**
763             * The size of load buffer. This is used to throttle the max. number
764             * of in-flight loads.
765             */
766            const uint16_t sizeLoadBuffer;
767
768            /**
769             * A map from the sequence number to the ROB number of the in-
770             * flight nodes. This includes all nodes that are in the readyList
771             * plus the loads for which a request has been sent which are not
772             * present in the readyList. But such loads are not yet complete
773             * and thus occupy resources. We need to query the oldest in-flight
774             * node and since a map container keeps all its keys sorted using
775             * the less than criterion, the first element is the in-flight node
776             * with the least sequence number, i.e. the oldest in-flight node.
             *
             * NOTE(review): std::map is used here but <map> is not among the
             * standard headers this file includes directly -- consider
             * including it explicitly.
777             */
778            std::map<NodeSeqNum, NodeRobNum> inFlightNodes;
779
780            /** The ROB number of the oldest in-flight node */
781            NodeRobNum oldestInFlightRobNum;
782
783            /** Number of ready loads for which request may or may not be sent */
784            uint16_t numInFlightLoads;
785
786            /** Number of ready stores for which request may or may not be sent */
787            uint16_t numInFlightStores;
788        };
789
790        /**
791         * The InputStream encapsulates a trace file and the
792         * internal buffers and populates GraphNodes based on
793         * the input.
794         */
795        class InputStream
796        {
797
798          private:
799
800            /** Input file stream for the protobuf trace */
801            ProtoInputStream trace;
802
803            /**
804             * A multiplier for the compute delays in the trace to modulate
805             * the Trace CPU frequency either up or down. The Trace CPU's
806             * clock domain frequency must also be set to match the expected
807             * result of frequency scaling.
808             */
809            const double timeMultiplier;
810
811            /** Count of committed ops read from trace plus the filtered ops */
812            uint64_t microOpCount;
813
814            /**
815             * The window size that is read from the header of the protobuf
816             * trace and used to process the dependency trace
817             */
818            uint32_t windowSize;
819          public:
820
821            /**
822             * Create a trace input stream for a given file name.
823             *
824             * @param filename Path to the file to read from
825             * @param time_multiplier used to scale the compute delays
826             */
827            InputStream(const std::string& filename,
828                        const double time_multiplier);
829
830            /**
831             * Reset the stream such that it can be played once
832             * again.
833             */
834            void reset();
835
836            /**
837             * Attempt to read a trace element from the stream,
838             * and also notify the caller if the end of the file
839             * was reached.
840             *
841             * @param element Graph node to populate with the record read
842             *        from the trace
843             * @return True if an element could be read successfully
844             */
845            bool read(GraphNode* element);
846
847            /** Get window size from trace */
848            uint32_t getWindowSize() const { return windowSize; }
849
850            /** Get number of micro-ops modelled in the TraceCPU replay */
851            uint64_t getMicroOpCount() const { return microOpCount; }
852        };
853
854        public:
855        /** Constructor. Compute delays are scaled by 1 / freqMultiplier. */
856        ElasticDataGen(TraceCPU& _owner, const std::string& _name,
857                   MasterPort& _port, MasterID master_id,
858                   const std::string& trace_file, TraceCPUParams *params)
859            : owner(_owner),
860              port(_port),
861              masterID(master_id),
862              trace(trace_file, 1.0 / params->freqMultiplier),
863              genName(owner.name() + ".elastic" + _name),
864              retryPkt(nullptr),
865              traceComplete(false),
866              nextRead(false),
867              execComplete(false),
868              windowSize(trace.getWindowSize()), // read from the trace header
869              hwResource(params->sizeROB, params->sizeStoreBuffer,
870                         params->sizeLoadBuffer)
871        {
872            DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
873                    windowSize);
874        }
875
876        /**
877         * Called from TraceCPU init(). Reads the first message from the
878         * input trace file and returns the send tick.
879         *
880         * @return Tick when first packet must be sent
881         */
882        Tick init();
883
884        /**
885         * Adjust traceOffset based on what TraceCPU init() determines on
886         * comparing the offsets in the fetch request and elastic traces.
887         *
888         * @param offset trace offset set by comparing both traces
889         */
890        void adjustInitTraceOffset(Tick& offset);
891
892        /** Returns the name of this ElasticDataGen instance. */
893        const std::string& name() const { return genName; }
894
895        /** Exit the ElasticDataGen. */
896        void exit();
897
898        /**
899         * Reads the next window of nodes from the trace file. Note that
900         * no tick is returned: the result only reports whether the end of
901         * the file has been reached.
902         *
903         * @return bool false if end of file has been reached else true
904         */
905        bool readNextWindow();
906
907        /**
908         * Iterate over the dependencies of a new node and add the new node
909         * to the list of dependents of the parent node.
910         *
911         * @param   new_node    new node to add to the graph
912         * @param   dep_array   the dependency array (type T: rob or register),
913         *                      that is to be iterated, and may get modified
914         * @param   num_dep     the number of dependencies set in the array
915         *                      which may get modified during iteration
916         */
917        template<typename T> void addDepsOnParent(GraphNode *new_node,
918                                                    T& dep_array,
919                                                    uint8_t& num_dep);
920
921        /**
922         * This is the main execute function which consumes nodes from the
923         * sorted readyList. First attempt to issue the pending dependency-free
924         * nodes held in the depFreeQueue. Insert the ready-to-issue nodes into
925         * the readyList. Then iterate through the readyList and when a node
926         * has its execute tick equal to curTick(), execute it. If the node is
927         * a load or a store call executeMemReq() and if it is neither, simply
928         * mark it complete.
929         */
930        void execute();
931
932        /**
933         * Creates a new request for a load or store assigning the request
934         * parameters. Calls the port's sendTimingReq() and returns a packet
935         * if the send failed so that it can be saved for a retry.
936         *
937         * @param node_ptr pointer to the load or store node to be executed
938         *
939         * @return packet pointer if the request failed and nullptr if it was
940         *          sent successfully
941         */
942        PacketPtr executeMemReq(GraphNode* node_ptr);
943
944        /**
945         * Add a ready node to the readyList. When inserting, ensure the nodes
946         * are sorted in ascending order of their execute ticks.
947         *
948         * @param seq_num sequence number of the ready node
949         * @param exec_tick the execute tick of the ready node
950         */
951        void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);
952
953        /** Print readyList for debugging using debug flag TraceCPUData. */
954        void printReadyList();
955
956        /**
957         * When a load writeback is received, that is when the load completes,
958         * release the dependents on it. This is called from the dcache port
959         * recvTimingResp(); pkt is presumably the response packet received.
960         */
961        void completeMemAccess(PacketPtr pkt);
962
963        /**
964         * Returns the execComplete variable which is set when the last
965         * node is executed.
966         *
967         * @return bool true if execComplete is set, false otherwise.
968         */
969        bool isExecComplete() const { return execComplete; }
970
971        /**
972         * Attempts to issue a node once the node's source dependencies are
973         * complete. If resources are available then add it to the readyList,
974         * otherwise the node is not issued and is stored in depFreeQueue
975         * until resources become available.
976         *
977         * @param node_ptr pointer to node to be issued
978         * @param first true if this is the first attempt to issue this node
979         * @return true if node was added to readyList
980         */
981        bool checkAndIssue(const GraphNode* node_ptr, bool first = true);
982
983        /** Get number of micro-ops modelled, as counted by the trace stream */
984        uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }
985
986        void regStats();
987
988      private:
989
990        /** Reference of the TraceCPU. */
991        TraceCPU& owner;
992
993        /** Reference of the port to be used to issue memory requests. */
994        MasterPort& port;
995
996        /** MasterID used for the requests being sent. */
997        const MasterID masterID;
998
999        /** Input stream used for reading the input trace file. */
1000        InputStream trace;
1001
1002        /** String to store the name of the ElasticDataGen. */
1003        std::string genName;
1004
1005        /** PacketPtr used to store the packet to retry. */
1006        PacketPtr retryPkt;
1007
1008        /** Set to true when end of trace is reached. */
1009        bool traceComplete;
1010
1011        /** Set to true when the next window of instructions needs to be read */
1012        bool nextRead;
1013
1014        /** Set true when execution of trace is complete */
1015        bool execComplete;
1016
1017        /**
1018         * Window size within which to check for dependencies. Its value is
1019         * made equal to the window size used to generate the trace which is
1020         * recorded in the trace header. The dependency graph must be
1021         * populated enough such that when a node completes, its potential
1022         * child node must be found and the dependency removed before the
1023         * completed node itself is removed. Thus as soon as the graph shrinks
1024         * to become smaller than this window, we read in the next window.
1025         */
1026        const uint32_t windowSize;
1027
1028        /**
1029         * Hardware resources required to contain in-flight nodes and to
1030         * throttle issuing of new nodes when resources are not available.
1031         */
1032        HardwareResource hwResource;
1033
1034        /** Dependency graph of GraphNodes, keyed by node sequence number */
1035        std::unordered_map<NodeSeqNum, GraphNode*> depGraph;
1036
1037        /**
1038         * Queue of dependency-free nodes that are pending issue because
1039         * resources are not available. This is chosen to be FIFO so that
1040         * dependent nodes which become free in program order get pushed
1041         * into the queue in that order. Thus nodes are more likely to
1042         * issue in program order.
1043         */
1044        std::queue<const GraphNode*> depFreeQueue;
1045
1046        /** List of ready-to-execute nodes, see addToSortedReadyList() */
1047        std::list<ReadyNode> readyList;
1048
1049        /** Stats for data memory accesses replayed. */
1050        Stats::Scalar maxDependents;
1051        Stats::Scalar maxReadyListSize;
1052        Stats::Scalar numSendAttempted;
1053        Stats::Scalar numSendSucceeded;
1054        Stats::Scalar numSendFailed;
1055        Stats::Scalar numRetrySucceeded;
1056        Stats::Scalar numSplitReqs;
1057        Stats::Scalar numSOLoads;
1058        Stats::Scalar numSOStores;
1059        /** Tick when ElasticDataGen completes execution */
1060        Stats::Scalar dataLastTick;
1061    };
1062
1063    /** Instance of FixedRetryGen to replay instruction read requests. */
1064    FixedRetryGen icacheGen;
1065
1066    /** Instance of ElasticDataGen to replay data read and write requests. */
1067    ElasticDataGen dcacheGen;
1068
1069    /**
1070     * This is the control flow that uses the functionality of the icacheGen to
1071     * replay the trace. It calls tryNext(). If it returns true then next event
1072     * is scheduled at curTick() plus delta. If it returns false then delta is
1073     * ignored and control is brought back via recvRetry().
1074     */
1075    void schedIcacheNext();
1076
1077    /**
1078     * This is the control flow that uses the functionality of the dcacheGen to
1079     * replay the trace. It calls execute(). It checks if execution is complete
1080     * and schedules an event to exit simulation accordingly.
1081     */
1082    void schedDcacheNext();
1083
1084    /** Event for the control flow method schedIcacheNext() */
1085    EventFunctionWrapper icacheNextEvent;
1086
1087    /** Event for the control flow method schedDcacheNext() */
1088    EventFunctionWrapper dcacheNextEvent;
1089
1090    /** Called when either generator finishes executing from the trace */
1091    void checkAndSchedExitEvent();
1092
1093    /** Set to true when one of the generators finishes replaying its trace. */
1094    bool oneTraceComplete;
1095
1096    /**
1097     * This stores the time offset in the trace, which is taken away from
1098     * the ready times of requests. This is especially useful because the time
1099     * offset can be very large if the traces are generated from the middle of
1100     * a program.
1101     */
1102    Tick traceOffset;
1103
1104    /**
1105     * Number of Trace CPUs in the system used as a shared variable and passed
1106     * to the CountedExitEvent event used for counting down exit events.  It is
1107     * incremented in the constructor call so that the total is arrived at
1108     * automatically.
1109     */
1110    static int numTraceCPUs;
1111
1112    /**
1113     * A CountedExitEvent which when serviced decrements the counter. A sim
1114     * exit event is scheduled when the counter equals zero, that is all
1115     * instances of Trace CPU have had their execCompleteEvent serviced.
1116     */
1117    CountedExitEvent *execCompleteEvent;
1118
1119    /**
1120     * Exit when any one Trace CPU completes its execution. If this is
1121     * configured true then the execCompleteEvent is not scheduled.
1122     */
1123    const bool enableEarlyExit;
1124
1125    /**
1126     * Interval of committed instructions specified by the user at which a
1127     * progress info message is printed
1128     */
1129    const uint64_t progressMsgInterval;
1130
1131    /**
1132     * The progress msg threshold is kept updated to the next multiple of the
1133     * progress msg interval. As soon as the threshold is reached, an info
1134     * message is printed.
1135     */
1136    uint64_t progressMsgThreshold;
1137
1138    Stats::Scalar numSchedDcacheEvent;
1139    Stats::Scalar numSchedIcacheEvent;
1140
1141    /** Stat for number of simulated micro-ops. */
1142    Stats::Scalar numOps;
1143    /** Stat for the CPI. This is really cycles per micro-op and not inst. */
1144    Stats::Formula cpi;
1145
1146  public:
1147
1148    /** Returns a reference to the icache port (icachePort). */
1149    Port &getInstPort() { return icachePort; }
1150
1151    /** Returns a reference to the dcache port (dcachePort). */
1152    Port &getDataPort() { return dcachePort; }
1153
1154    void regStats();
1155};
1156#endif // __CPU_TRACE_TRACE_CPU_HH__
1157