trace_cpu.hh revision 11631:6d147afa8fc6
1/*
2 * Copyright (c) 2013 - 2016 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder.  You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 *
37 * Authors: Radhika Jagtap
38 *          Andreas Hansson
39 *          Thomas Grass
40 */
41
42#ifndef __CPU_TRACE_TRACE_CPU_HH__
43#define __CPU_TRACE_TRACE_CPU_HH__
44
45#include <array>
46#include <cstdint>
47#include <queue>
48#include <set>
49#include <unordered_map>
50
51#include "arch/registers.hh"
52#include "base/statistics.hh"
53#include "cpu/base.hh"
54#include "debug/TraceCPUData.hh"
55#include "debug/TraceCPUInst.hh"
56#include "params/TraceCPU.hh"
57#include "proto/inst_dep_record.pb.h"
58#include "proto/packet.pb.h"
59#include "proto/protoio.hh"
60#include "sim/sim_events.hh"
61
62/**
63 * The trace cpu replays traces generated using the elastic trace probe
64 * attached to the O3 CPU model. The elastic trace is an execution trace with
65 * register data dependencies and ordering dependencies annotated to it. The
66 * trace cpu also replays a fixed timestamp fetch trace that is also generated
67 * by the elastic trace probe. This trace cpu model aims at achieving faster
68 * simulation compared to the detailed cpu model and good correlation when the
69 * same trace is used for playback on different memory sub-systems.
70 *
71 * The TraceCPU inherits from BaseCPU so some virtual methods need to be
72 * defined. It has two port subclasses inherited from MasterPort for
73 * instruction and data ports. It issues the memory requests deducing the
74 * timing from the trace and without performing real execution of micro-ops. As
75 * soon as the last dependency for an instruction is complete, its
76 * computational delay, also provided in the input trace is added. The
77 * dependency-free nodes are maintained in a list, called 'ReadyList', ordered
78 * by ready time. Instructions which depend on load stall until the responses
79 * for read requests are received thus achieving elastic replay. If the
80 * dependency is not found when adding a new node, it is assumed complete.
81 * Thus, if this node is found to be completely dependency-free its issue time
82 * is calculated and it is added to the ready list immediately. This is
83 * encapsulated in the subclass ElasticDataGen.
84 *
85 * If ready nodes are issued in an unconstrained way there can be more nodes
86 * outstanding which results in divergence in timing compared to the O3CPU.
87 * Therefore, the Trace CPU also models hardware resources. A sub-class to
88 * model hardware resources contains the maximum sizes of load buffer, store
89 * buffer and ROB. If resources are not available, the node is not issued. Such
90 * nodes that are pending issue are held in the 'depFreeQueue' structure.
91 *
92 * Modeling the ROB size in the Trace CPU as a resource limitation is arguably
93 * the most important parameter of all resources. The ROB occupancy is
94 * estimated using the newly added field 'robNum'. We need to use ROB number as
95 * sequence number is at times much higher due to squashing and trace replay is
96 * focused on correct path modeling.
97 *
98 * A map called 'inFlightNodes' is added to track nodes that are not only in
99 * the readyList but also load nodes that are executed (and thus removed from
100 * readyList) but are not complete. ReadyList handles what and when to execute
101 * next node while the inFlightNodes is used for resource modelling. The oldest
102 * ROB number is updated when any node occupies the ROB or when an entry in the
103 * ROB is released. The ROB occupancy is equal to the difference in the ROB
104 * number of the newly dependency-free node and the oldest ROB number in
105 * flight.
106 *
107 * If no node depends on a non load/store node then there is no reason to
108 * track it in the dependency graph. We filter out such nodes but count them
109 * and add a weight field to the subsequent node that we do include in the
110 * trace. The weight field is used to model ROB occupancy during replay.
111 *
112 * The depFreeQueue is chosen to be FIFO so that child nodes which are in
113 * program order get pushed into it in that order and thus issued in program
114 * order, like in the O3CPU. This is also why the dependents is made a
115 * sequential container, std::set to std::vector. We only check head of the
116 * depFreeQueue as nodes are issued in order and blocking on head models that
117 * better than looping the entire queue. An alternative choice would be to
118 * inspect top N pending nodes where N is the issue-width. This is left for
119 * future as the timing correlation looks good as it is.
120 *
121 * At the start of an execution event, first we attempt to issue such pending
122 * nodes by checking if appropriate resources have become available. If yes, we
123 * compute the execute tick with respect to the time then. Then we proceed to
124 * complete nodes from the readyList.
125 *
126 * When a read response is received, sometimes a dependency on it that was
127 * supposed to be released when it was issued is still not released. This
128 * occurs because the dependent gets added to the graph after the read was
129 * sent. So the check is made less strict and the dependency is marked complete
130 * on read response instead of insisting that it should have been removed on
131 * read sent.
132 *
133 * There is a check for requests spanning two cache lines as this condition
134 * triggers an assert fail in the L1 cache. If it does then truncate the size
135 * to access only until the end of that line and ignore the remainder.
136 * Strictly-ordered requests are skipped and the dependencies on such requests
137 * are handled by simply marking them complete immediately.
138 *
139 * The simulated seconds can be calculated as the difference between the
140 * final_tick stat and the tickOffset stat. A CountedExitEvent that contains a
141 * static int belonging to the Trace CPU class as a down counter is used to
142 * implement multi Trace CPU simulation exit.
143 */
144
145class TraceCPU : public BaseCPU
146{
147
148  public:
149    TraceCPU(TraceCPUParams *params);
150    ~TraceCPU();
151
152    void init();
153
    /**
     * This is a pure virtual function in BaseCPU. As we do not know how many
     * insts are in the trace, but only how many micro-ops there are, we
     * cannot count this stat.
     *
     * @return 0
     */
161    Counter totalInsts() const
162    {
163        return 0;
164    }
165
166    /**
167     * Return totalOps as the number of committed micro-ops plus the
168     * speculatively issued loads that are modelled in the TraceCPU replay.
169     *
170     * @return number of micro-ops i.e. nodes in the elastic data generator
171     */
172    Counter totalOps() const
173    {
174        return dcacheGen.getMicroOpCount();
175    }
176
177    /* Pure virtual function in BaseCPU. Do nothing. */
178    void wakeup(ThreadID tid = 0)
179    {
180        return;
181    }
182
183    /*
184     * When resuming from checkpoint in FS mode, the TraceCPU takes over from
185     * the old cpu. This function overrides the takeOverFrom() function in the
186     * BaseCPU. It unbinds the ports of the old CPU and binds the ports of the
187     * TraceCPU.
188     */
189    void takeOverFrom(BaseCPU *oldCPU);
190
191    /**
192     * When instruction cache port receives a retry, schedule event
193     * icacheNextEvent.
194     */
195    void icacheRetryRecvd();
196
197    /**
198     * When data cache port receives a retry, schedule event
199     * dcacheNextEvent.
200     */
201    void dcacheRetryRecvd();
202
203    /**
204     * When data cache port receives a response, this calls the dcache
205     * generator method handle to complete the load writeback.
206     *
207     * @param pkt Pointer to packet received
208     */
209    void dcacheRecvTimingResp(PacketPtr pkt);
210
211    /**
212     * Schedule event dcacheNextEvent at the given tick
213     *
214     * @param when Tick at which to schedule event
215     */
216    void schedDcacheNextEvent(Tick when);
217
218  protected:
219
220    /**
221     * IcachePort class that interfaces with L1 Instruction Cache.
222     */
223    class IcachePort : public MasterPort
224    {
225      public:
226        /** Default constructor. */
227        IcachePort(TraceCPU* _cpu)
228            : MasterPort(_cpu->name() + ".icache_port", _cpu),
229                         owner(_cpu)
230        { }
231
232      public:
233        /**
234         * Receive the timing reponse and simply delete the packet since
235         * instruction fetch requests are issued as per the timing in the trace
236         * and responses are ignored.
237         *
238         * @param pkt Pointer to packet received
239         * @return true
240         */
241        bool recvTimingResp(PacketPtr pkt);
242
243        /**
244         * Required functionally but do nothing.
245         *
246         * @param pkt Pointer to packet received
247         */
248        void recvTimingSnoopReq(PacketPtr pkt) { }
249
250        /**
251         * Handle a retry signalled by the cache if instruction read failed in
252         * the first attempt.
253         */
254        void recvReqRetry();
255
256      private:
257        TraceCPU* owner;
258    };
259
260    /**
261     * DcachePort class that interfaces with L1 Data Cache.
262     */
263    class DcachePort : public MasterPort
264    {
265
266      public:
267        /** Default constructor. */
268        DcachePort(TraceCPU* _cpu)
269            : MasterPort(_cpu->name() + ".dcache_port", _cpu),
270                         owner(_cpu)
271        { }
272
273      public:
274
275        /**
276         * Receive the timing reponse and call dcacheRecvTimingResp() method
277         * of the dcacheGen to handle completing the load
278         *
279         * @param pkt Pointer to packet received
280         * @return true
281         */
282        bool recvTimingResp(PacketPtr pkt);
283
284        /**
285         * Required functionally but do nothing.
286         *
287         * @param pkt Pointer to packet received
288         */
289        void recvTimingSnoopReq(PacketPtr pkt)
290        { }
291
292        /**
293         * Required functionally but do nothing.
294         *
295         * @param pkt Pointer to packet received
296         */
297        void recvFunctionalSnoop(PacketPtr pkt)
298        { }
299
300        /**
301         * Handle a retry signalled by the cache if data access failed in the
302         * first attempt.
303         */
304        void recvReqRetry();
305
306        /**
307         * Required functionally.
308         *
309         * @return true since we have to snoop
310         */
311        bool isSnooping() const { return true; }
312
313      private:
314        TraceCPU* owner;
315    };
316
317    /** Port to connect to L1 instruction cache. */
318    IcachePort icachePort;
319
320    /** Port to connect to L1 data cache. */
321    DcachePort dcachePort;
322
323    /** Master id for instruction read requests. */
324    const MasterID instMasterID;
325
326    /** Master id for data read and write requests. */
327    const MasterID dataMasterID;
328
329    /** File names for input instruction and data traces. */
330    std::string instTraceFile, dataTraceFile;
331
332    /**
333     * Generator to read protobuf trace containing memory requests at fixed
334     * timestamps, perform flow control and issue memory requests. If L1 cache
335     * port sends packet succesfully, determine the tick to send the next
336     * packet else wait for retry from cache.
337     */
338    class FixedRetryGen
339    {
340
341      private:
342
343        /**
344         * This struct stores a line in the trace file.
345         */
346        struct TraceElement {
347
348            /** Specifies if the request is to be a read or a write */
349            MemCmd cmd;
350
351            /** The address for the request */
352            Addr addr;
353
354            /** The size of the access for the request */
355            Addr blocksize;
356
357            /** The time at which the request should be sent */
358            Tick tick;
359
360            /** Potential request flags to use */
361            Request::FlagsType flags;
362
363            /** Instruction PC */
364            Addr pc;
365
366            /**
367             * Check validity of this element.
368             *
369             * @return if this element is valid
370             */
371            bool isValid() const {
372                return cmd != MemCmd::InvalidCmd;
373            }
374
375            /**
376             * Make this element invalid.
377             */
378            void clear() {
379                cmd = MemCmd::InvalidCmd;
380            }
381        };
382
383        /**
384         * The InputStream encapsulates a trace file and the
385         * internal buffers and populates TraceElements based on
386         * the input.
387         */
388        class InputStream
389        {
390
391          private:
392
393            // Input file stream for the protobuf trace
394            ProtoInputStream trace;
395
396          public:
397
398            /**
399             * Create a trace input stream for a given file name.
400             *
401             * @param filename Path to the file to read from
402             */
403            InputStream(const std::string& filename);
404
405            /**
406             * Reset the stream such that it can be played once
407             * again.
408             */
409            void reset();
410
411            /**
412             * Attempt to read a trace element from the stream,
413             * and also notify the caller if the end of the file
414             * was reached.
415             *
416             * @param element Trace element to populate
417             * @return True if an element could be read successfully
418             */
419            bool read(TraceElement* element);
420        };
421
422        public:
423        /* Constructor */
424        FixedRetryGen(TraceCPU& _owner, const std::string& _name,
425                   MasterPort& _port, MasterID master_id,
426                   const std::string& trace_file)
427            : owner(_owner),
428              port(_port),
429              masterID(master_id),
430              trace(trace_file),
431              genName(owner.name() + ".fixedretry" + _name),
432              retryPkt(nullptr),
433              delta(0),
434              traceComplete(false)
435        {
436        }
437
438        /**
439         * Called from TraceCPU init(). Reads the first message from the
440         * input trace file and returns the send tick.
441         *
442         * @return Tick when first packet must be sent
443         */
444        Tick init();
445
446        /**
447         * This tries to send current or retry packet and returns true if
448         * successfull. It calls nextExecute() to read next message.
449         *
450         * @return bool true if packet is sent successfully
451         */
452        bool tryNext();
453
454        /** Returns name of the FixedRetryGen instance. */
455        const std::string& name() const { return genName; }
456
457        /**
458         * Creates a new request assigning the request parameters passed by the
459         * arguments. Calls the port's sendTimingReq() and returns true if
460         * the packet was sent succesfully. It is called by tryNext()
461         *
462         * @param addr address of request
463         * @param size size of request
464         * @param cmd if it is a read or write request
465         * @param flags associated request flags
466         * @param pc instruction PC that generated the request
467         *
468         * @return true if packet was sent successfully
469         */
470        bool send(Addr addr, unsigned size, const MemCmd& cmd,
471              Request::FlagsType flags, Addr pc);
472
473        /** Exit the FixedRetryGen. */
474        void exit();
475
476        /**
477         * Reads a line of the trace file. Returns the tick
478         * when the next request should be generated. If the end
479         * of the file has been reached, it returns false.
480         *
481         * @return bool false id end of file has been reached
482         */
483        bool nextExecute();
484
485        /**
486         * Returns the traceComplete variable which is set when end of the
487         * input trace file is reached.
488         *
489         * @return bool true if traceComplete is set, false otherwise.
490         */
491        bool isTraceComplete() { return traceComplete; }
492
493        int64_t tickDelta() { return delta; }
494
495        void regStats();
496
497      private:
498
499        /** Reference of the TraceCPU. */
500        TraceCPU& owner;
501
502        /** Reference of the port to be used to issue memory requests. */
503        MasterPort& port;
504
505        /** MasterID used for the requests being sent. */
506        const MasterID masterID;
507
508        /** Input stream used for reading the input trace file. */
509        InputStream trace;
510
511        /** String to store the name of the FixedRetryGen. */
512        std::string genName;
513
514        /** PacketPtr used to store the packet to retry. */
515        PacketPtr retryPkt;
516
517        /**
518         * Stores the difference in the send ticks of the current and last
519         * packets. Keeping this signed to check overflow to a negative value
520         * which will be caught by assert(delta > 0)
521         */
522        int64_t delta;
523
524        /**
525         * Set to true when end of trace is reached.
526         */
527        bool traceComplete;
528
529        /** Store an element read from the trace to send as the next packet. */
530        TraceElement currElement;
531
532        /** Stats for instruction accesses replayed. */
533        Stats::Scalar numSendAttempted;
534        Stats::Scalar numSendSucceeded;
535        Stats::Scalar numSendFailed;
536        Stats::Scalar numRetrySucceeded;
537        /** Last simulated tick by the FixedRetryGen */
538        Stats::Scalar instLastTick;
539
540    };
541
542    /**
543     * The elastic data memory request generator to read protobuf trace
544     * containing execution trace annotated with data and ordering
545     * dependencies. It deduces the time at which to send a load/store request
546     * by tracking the dependencies. It attempts to send a memory request for a
547     * load/store without performing real execution of micro-ops. If L1 cache
548     * port sends packet succesfully, the generator checks which instructions
549     * became dependency free as a result of this and schedules an event
550     * accordingly. If it fails to send the packet, it waits for a retry from
551     * the cache.
552     */
553    class ElasticDataGen
554    {
555
556      private:
557
558        /** Node sequence number type. */
559        typedef uint64_t NodeSeqNum;
560
561        /** Node ROB number type. */
562        typedef uint64_t NodeRobNum;
563
564        typedef ProtoMessage::InstDepRecord::RecordType RecordType;
565        typedef ProtoMessage::InstDepRecord Record;
566
567        /**
568         * The struct GraphNode stores an instruction in the trace file. The
569         * format of the trace file favours constructing a dependency graph of
570         * the execution and this struct is used to encapsulate the request
571         * data as well as pointers to its dependent GraphNodes.
572         */
573        class GraphNode {
574
575          public:
576            /**
577             * The maximum no. of ROB dependencies. There can be at most 2
578             * order dependencies which could exist for a store. For a load
579             * and comp node there can be at most one order dependency.
580             */
581            static const uint8_t maxRobDep = 2;
582
583            /** Typedef for the array containing the ROB dependencies */
584            typedef std::array<NodeSeqNum, maxRobDep> RobDepArray;
585
586            /** Typedef for the array containing the register dependencies */
587            typedef std::array<NodeSeqNum, TheISA::MaxInstSrcRegs> RegDepArray;
588
589            /** Instruction sequence number */
590            NodeSeqNum seqNum;
591
592            /** ROB occupancy number */
593            NodeRobNum robNum;
594
595           /** Type of the node corresponding to the instruction modelled by it */
596            RecordType type;
597
598            /** The address for the request if any */
599            Addr physAddr;
600
601            /** The virtual address for the request if any */
602            Addr virtAddr;
603
604            /** The address space id which is set if the virtual address is set */
605            uint32_t asid;
606
607            /** Size of request if any */
608            uint32_t size;
609
610            /** Request flags if any */
611            Request::Flags flags;
612
613            /** Instruction PC */
614            Addr pc;
615
616            /** Array of order dependencies. */
617            RobDepArray robDep;
618
619            /** Number of order dependencies */
620            uint8_t numRobDep;
621
622            /** Computational delay */
623            uint64_t compDelay;
624
625            /**
626             * Array of register dependencies (incoming) if any. Maximum number
627             * of source registers used to set maximum size of the array
628             */
629            RegDepArray regDep;
630
631            /** Number of register dependencies */
632            uint8_t numRegDep;
633
634            /**
635             * A vector of nodes dependent (outgoing) on this node. A
636             * sequential container is chosen because when dependents become
637             * free, they attempt to issue in program order.
638             */
639            std::vector<GraphNode *> dependents;
640
641            /** Is the node a load */
642            bool isLoad() const { return (type == Record::LOAD); }
643
644            /** Is the node a store */
645            bool isStore() const { return (type == Record::STORE); }
646
647            /** Is the node a compute (non load/store) node */
648            bool isComp() const { return (type == Record::COMP); }
649
650            /** Initialize register dependency array to all zeroes */
651            void clearRegDep();
652
653            /** Initialize register dependency array to all zeroes */
654            void clearRobDep();
655
656            /** Remove completed instruction from register dependency array */
657            bool removeRegDep(NodeSeqNum reg_dep);
658
659            /** Remove completed instruction from order dependency array */
660            bool removeRobDep(NodeSeqNum rob_dep);
661
662            /** Check for all dependencies on completed inst */
663            bool removeDepOnInst(NodeSeqNum done_seq_num);
664
665            /** Return true if node has a request which is strictly ordered */
666            bool isStrictlyOrdered() const {
667                return (flags.isSet(Request::STRICT_ORDER));
668            }
669            /**
670             * Write out element in trace-compatible format using debug flag
671             * TraceCPUData.
672             */
673            void writeElementAsTrace() const;
674
675            /** Return string specifying the type of the node */
676            std::string typeToStr() const;
677        };
678
679        /** Struct to store a ready-to-execute node and its execution tick. */
680        struct ReadyNode
681        {
682            /** The sequence number of the ready node */
683            NodeSeqNum seqNum;
684
685            /** The tick at which the ready node must be executed */
686            Tick execTick;
687        };
688
689        /**
690         * The HardwareResource class models structures that hold the in-flight
691         * nodes. When a node becomes dependency free, first check if resources
692         * are available to issue it.
693         */
694        class HardwareResource
695        {
696          public:
697            /**
698             * Constructor that initializes the sizes of the structures.
699             *
700             * @param max_rob size of the Reorder Buffer
701             * @param max_stores size of Store Buffer
702             * @param max_loads size of Load Buffer
703             */
704            HardwareResource(uint16_t max_rob, uint16_t max_stores,
705                                uint16_t max_loads);
706
707            /**
708             * Occupy appropriate structures for an issued node.
709             *
710             * @param node_ptr pointer to the issued node
711             */
712            void occupy(const GraphNode* new_node);
713
714            /**
715             * Release appropriate structures for a completed node.
716             *
717             * @param node_ptr pointer to the completed node
718             */
719            void release(const GraphNode* done_node);
720
721            /** Release store buffer entry for a completed store */
722            void releaseStoreBuffer();
723
724            /**
725             * Check if structures required to issue a node are free.
726             *
727             * @param node_ptr pointer to the node ready to issue
728             * @return true if resources are available
729             */
730            bool isAvailable(const GraphNode* new_node) const;
731
732            /**
733             * Check if there are any outstanding requests, i.e. requests for
734             * which we are yet to receive a response.
735             *
736             * @return true if there is at least one read or write request
737             *      outstanding
738             */
739            bool awaitingResponse() const;
740
741            /** Print resource occupancy for debugging */
742            void printOccupancy();
743
744          private:
745            /**
746             * The size of the ROB used to throttle the max. number of in-flight
747             * nodes.
748             */
749            const uint16_t sizeROB;
750
751            /**
752             * The size of store buffer. This is used to throttle the max. number
753             * of in-flight stores.
754             */
755            const uint16_t sizeStoreBuffer;
756
757            /**
758             * The size of load buffer. This is used to throttle the max. number
759             * of in-flight loads.
760             */
761            const uint16_t sizeLoadBuffer;
762
763            /**
764             * A map from the sequence number to the ROB number of the in-
765             * flight nodes. This includes all nodes that are in the readyList
766             * plus the loads for which a request has been sent which are not
767             * present in the readyList. But such loads are not yet complete
768             * and thus occupy resources. We need to query the oldest in-flight
769             * node and since a map container keeps all its keys sorted using
770             * the less than criterion, the first element is the in-flight node
771             * with the least sequence number, i.e. the oldest in-flight node.
772             */
773            std::map<NodeSeqNum, NodeRobNum> inFlightNodes;
774
775            /** The ROB number of the oldest in-flight node */
776            NodeRobNum oldestInFlightRobNum;
777
778            /** Number of ready loads for which request may or may not be sent */
779            uint16_t numInFlightLoads;
780
781            /** Number of ready stores for which request may or may not be sent */
782            uint16_t numInFlightStores;
783        };
784
785        /**
786         * The InputStream encapsulates a trace file and the
787         * internal buffers and populates GraphNodes based on
788         * the input.
789         */
790        class InputStream
791        {
792
793          private:
794
795            /** Input file stream for the protobuf trace */
796            ProtoInputStream trace;
797
798            /**
799             * A multiplier for the compute delays in the trace to modulate
800             * the Trace CPU frequency either up or down. The Trace CPU's
801             * clock domain frequency must also be set to match the expected
802             * result of frequency scaling.
803             */
804            const double timeMultiplier;
805
806            /** Count of committed ops read from trace plus the filtered ops */
807            uint64_t microOpCount;
808
809            /**
810             * The window size that is read from the header of the protobuf
811             * trace and used to process the dependency trace.
812             */
813            uint32_t windowSize;
814          public:
815
816            /**
817             * Create a trace input stream for a given file name.
818             *
819             * @param filename Path to the file to read from
820             * @param time_multiplier Factor used to scale the compute delays
821             */
822            InputStream(const std::string& filename,
823                        const double time_multiplier);
824
825            /**
826             * Reset the stream such that it can be played once
827             * again.
828             */
829            void reset();
830
831            /**
832             * Attempt to read a trace element from the stream,
833             * and also notify the caller if the end of the file
834             * was reached.
835             *
836             * @param element Trace element to populate
837             * @return True if an element could be read successfully;
838             *         presumably false once the end of the file is reached
839             */
840            bool read(GraphNode* element);
841
842            /** Get the window size that was read from the trace header */
843            uint32_t getWindowSize() const { return windowSize; }
844
845            /** Get number of micro-ops modelled in the TraceCPU replay */
846            uint64_t getMicroOpCount() const { return microOpCount; }
847        };
848
849        public:
850        /**
            * Constructor. Binds the generator to its owning TraceCPU and port,
            * builds the trace InputStream from trace_file and takes windowSize
            * from that stream via trace.getWindowSize(). The stream's time
            * multiplier is 1.0 / params->freqMultiplier. hwResource is built
            * from params->sizeROB, sizeStoreBuffer and sizeLoadBuffer. The body
            * only prints the window size under the TraceCPUData debug flag.
            *
            * windowSize is initialised from trace, so trace must remain
            * declared before windowSize in the class (members are initialised
            * in declaration order, not in the order written here).
            *
            * @param _owner     TraceCPU whose name() prefixes genName
            * @param _name      suffix appended after ".elastic" in genName
            * @param _port      port stored in the port member
            * @param master_id  value stored in masterID
            * @param trace_file file name handed to the InputStream constructor
            * @param params     supplies freqMultiplier, sizeROB,
            *                   sizeStoreBuffer and sizeLoadBuffer
            */
851        ElasticDataGen(TraceCPU& _owner, const std::string& _name,
852                   MasterPort& _port, MasterID master_id,
853                   const std::string& trace_file, TraceCPUParams *params)
854            : owner(_owner),
855              port(_port),
856              masterID(master_id),
857              trace(trace_file, 1.0 / params->freqMultiplier),
858              genName(owner.name() + ".elastic" + _name),
859              retryPkt(nullptr),
860              traceComplete(false),
861              nextRead(false),
862              execComplete(false),
863              windowSize(trace.getWindowSize()),
864              hwResource(params->sizeROB, params->sizeStoreBuffer,
865                         params->sizeLoadBuffer)
866        {
867            DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
868                    windowSize);
869        }
870
871        /**
872         * Called from TraceCPU init(). Reads the first message from the
873         * input trace file and returns the send tick.
874         *
875         * @return Tick when first packet must be sent
876         */
877        Tick init();
878
879        /** Returns the name of this ElasticDataGen instance (genName). */
880        const std::string& name() const { return genName; }
881
882        /** Exit the ElasticDataGen. */
883        void exit();
884
885        /**
886         * Reads the next window of instructions from the trace file. No
887         * tick is returned; the only result is the bool described below,
888         * which tells the caller whether the end of the file was reached.
889         *
890         * @return bool false if end of file has been reached else true
891         */
892        bool readNextWindow();
893
894        /**
895         * Iterate over the dependencies of a new node and add the new node
896         * to the list of dependents of the parent node.
897         *
898         * @tparam  T           type of the dependency array, rob or register
899         * @param   new_node    new node to add to the graph
900         * @param   dep_array   dependency array to iterate; may get modified
901         * @param   num_dep     the number of dependencies set in the array
902         *                      which may get modified during iteration
903         */
904        template<typename T> void addDepsOnParent(GraphNode *new_node,
905                                                    T& dep_array,
906                                                    uint8_t& num_dep);
907
908        /**
909         * This is the main execute function which consumes nodes from the
910         * sorted readyList. First attempt to issue the pending dependency-free
911         * nodes held in the depFreeQueue. Insert the ready-to-issue nodes into
912         * the readyList. Then iterate through the readyList and when a node
913         * has its execute tick equal to curTick(), execute it. If the node is
914         * a load or a store, call executeMemReq(); if it is neither, simply
915         * mark it complete.
916         */
917        void execute();
918
919        /**
920         * Creates a new request for a load or store assigning the request
921         * parameters. Calls the port's sendTimingReq() and returns a packet
922         * if the send failed so that it can be saved for a retry.
923         *
924         * @param node_ptr pointer to the load or store node to be executed
925         *
926         * @return packet pointer if the request failed and nullptr if it was
927         *          sent successfully
928         */
929        PacketPtr executeMemReq(GraphNode* node_ptr);
930
931        /**
932         * Add a ready node to the readyList. When inserting, ensure the nodes
933         * are sorted in ascending order of their execute ticks.
934         *
935         * @param seq_num sequence number of the ready node
936         * @param exec_tick the execute tick of the ready node
937         */
938        void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);
939
940        /** Print readyList for debugging using debug flag TraceCPUData. */
941        void printReadyList();
942
943        /**
944         * When a load writeback is received, that is when the load completes,
945         * release the dependents on it. This is called from the dcache port
946         * recvTimingResp().
            *
            * @param pkt packet handed over by the caller; presumably the
            *            response for the completed access (TODO confirm)
947         */
948        void completeMemAccess(PacketPtr pkt);
949
950        /**
951         * Returns the execComplete variable which is set when the last
952         * node is executed.
953         *
954         * @return bool true if execComplete is set, false otherwise.
955         */
956        bool isExecComplete() const { return execComplete; }
957
958        /**
959         * Attempts to issue a node once the node's source dependencies are
960         * complete. If resources are available then add it to the readyList,
961         * otherwise the node is not issued and is stored in depFreeQueue
962         * until resources become available.
963         *
964         * @param node_ptr pointer to node to be issued
965         * @param first true if this is the first attempt to issue this node
966         * @return true if node was added to readyList
967         */
968        bool checkAndIssue(const GraphNode* node_ptr, bool first = true);
969
970        /** Number of micro-ops modelled in the replay, as counted by trace. */
971        uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }
972
           /** Presumably registers the Stats members below (TODO confirm). */
973        void regStats();
974
975      private:
976
977        /** Reference of the TraceCPU. */
978        TraceCPU& owner;
979
980        /** Reference of the port to be used to issue memory requests. */
981        MasterPort& port;
982
983        /** MasterID used for the requests being sent. */
984        const MasterID masterID;
985
986        /** Input stream used for reading the input trace file. */
987        InputStream trace;
988
989        /** String to store the name of the ElasticDataGen. */
990        std::string genName;
991
992        /** PacketPtr used to store the packet to retry. */
993        PacketPtr retryPkt;
994
995        /** Set to true when end of trace is reached. */
996        bool traceComplete;
997
998        /** Set to true when the next window of instructions must be read. */
999        bool nextRead;
1000
1001        /** Set to true when execution of trace is complete. */
1002        bool execComplete;
1003
1004        /**
1005         * Window size within which to check for dependencies. Its value is
1006         * made equal to the window size used to generate the trace which is
1007         * recorded in the trace header. The dependency graph must be
1008         * populated enough such that when a node completes, its potential
1009         * child node must be found and the dependency removed before the
1010         * completed node itself is removed. Thus as soon as the graph shrinks
1011         * to become smaller than this window, we read in the next window.
             *
             * The constructor initialises this from trace.getWindowSize(), so
             * it must stay declared after trace.
1012         */
1013        const uint32_t windowSize;
1014
1015        /**
1016         * Hardware resources required to contain in-flight nodes and to
1017         * throttle issuing of new nodes when resources are not available.
1018         */
1019        HardwareResource hwResource;
1020
1021        /** Dependency graph mapping node sequence numbers to GraphNodes */
1022        std::unordered_map<NodeSeqNum, GraphNode*> depGraph;
1023
1024        /**
1025         * Queue of dependency-free nodes that are pending issue because
1026         * resources are not available. This is chosen to be FIFO so that
1027         * dependent nodes which become free in program order get pushed
1028         * into the queue in that order. Thus nodes are more likely to
1029         * issue in program order.
1030         */
1031        std::queue<const GraphNode*> depFreeQueue;
1032
1033        /** List of nodes that are ready to execute */
1034        std::list<ReadyNode> readyList;
1035
1036        /** Stats for data memory accesses replayed. */
1037        Stats::Scalar maxDependents;
1038        Stats::Scalar maxReadyListSize;
1039        Stats::Scalar numSendAttempted;
1040        Stats::Scalar numSendSucceeded;
1041        Stats::Scalar numSendFailed;
1042        Stats::Scalar numRetrySucceeded;
1043        Stats::Scalar numSplitReqs;
1044        Stats::Scalar numSOLoads;
1045        Stats::Scalar numSOStores;
1046        /** Tick when ElasticDataGen completes execution */
1047        Stats::Scalar dataLastTick;
1048    };
1049
1050    /** Instance of FixedRetryGen to replay instruction read requests. */
1051    FixedRetryGen icacheGen;
1052
1053    /** Instance of ElasticDataGen to replay data read and write requests. */
1054    ElasticDataGen dcacheGen;
1055
1056    /**
1057     * This is the control flow that uses the functionality of the icacheGen to
1058     * replay the trace. It calls tryNext(). If it returns true then the next
1059     * event is scheduled at curTick() plus delta. If it returns false then
1060     * delta is ignored and control is brought back via recvRetry().
1061     */
1062    void schedIcacheNext();
1063
1064    /**
1065     * This is the control flow that uses the functionality of the dcacheGen to
1066     * replay the trace. It calls execute(). It checks if execution is complete
1067     * and schedules an event to exit simulation accordingly.
1068     */
1069    void schedDcacheNext();
1070
1071    /** Event for the control flow method schedIcacheNext() */
1072    EventWrapper<TraceCPU, &TraceCPU::schedIcacheNext> icacheNextEvent;
1073
1074    /** Event for the control flow method schedDcacheNext() */
1075    EventWrapper<TraceCPU, &TraceCPU::schedDcacheNext> dcacheNextEvent;
1076
1077    /** Called when either generator finishes executing from the trace. */
1078    void checkAndSchedExitEvent();
1079
1080    /** Set to true when one of the generators finishes replaying its trace. */
1081    bool oneTraceComplete;
1082
1083    /**
1084     * This stores the tick of the first instruction fetch request
1085     * which is later used for dumping the tickOffset stat.
1086     */
1087    Tick firstFetchTick;
1088
1089    /**
1090     * Number of Trace CPUs in the system used as a shared variable and passed
1091     * to the CountedExitEvent event used for counting down exit events.  It is
1092     * incremented in the constructor call so that the total is arrived at
1093     * automatically.
1094     */
1095    static int numTraceCPUs;
1096
1097    /**
1098     * A CountedExitEvent which when serviced decrements the counter. A sim
1099     * exit event is scheduled when the counter equals zero, that is all
1100     * instances of Trace CPU have had their execCompleteEvent serviced.
1101     */
1102    CountedExitEvent *execCompleteEvent;
1103
        /**
         * NOTE(review): presumably the number of times dcacheNextEvent and
         * icacheNextEvent, respectively, were scheduled; confirm in regStats().
         */
1104    Stats::Scalar numSchedDcacheEvent;
1105    Stats::Scalar numSchedIcacheEvent;
1106
1107    /** Stat for number of simulated micro-ops. */
1108    Stats::Scalar numOps;
1109    /** Stat for the CPI. This is really cycles per micro-op and not inst. */
1110    Stats::Formula cpi;
1111
1112    /**
1113     * The first execution tick is dumped as a stat so that the simulated
1114     * seconds for a trace replay can be calculated as a difference between the
1115     * final_tick stat and the tickOffset stat.
1116     */
1117    Stats::Scalar tickOffset;
1118
1119  public:
1120
1121    /** Returns a reference to the icache port (icachePort). */
1122    MasterPort &getInstPort() { return icachePort; }
1123
1124    /** Returns a reference to the dcache port (dcachePort). */
1125    MasterPort &getDataPort() { return dcachePort; }
1126
        /** Presumably registers the Stats members above (TODO confirm). */
1127    void regStats();
1128};
1129#endif // __CPU_TRACE_TRACE_CPU_HH__
1130