// trace_cpu.cc (gem5, revision 11252)
/*
 * Copyright (c) 2013 - 2015 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Radhika Jagtap
 *          Andreas Hansson
 *          Thomas Grass
 */

#include "cpu/trace/trace_cpu.hh"

#include "sim/sim_exit.hh"

// Declare and initialize the static counter for the number of Trace CPUs.
int TraceCPU::numTraceCPUs = 0;

TraceCPU::TraceCPU(TraceCPUParams *params)
    : BaseCPU(params),
      icachePort(this),
      dcachePort(this),
      instMasterID(params->system->getMasterId(name() + ".inst")),
      dataMasterID(params->system->getMasterId(name() + ".data")),
      instTraceFile(params->instTraceFile),
      dataTraceFile(params->dataTraceFile),
      icacheGen(*this, ".iside", icachePort, instMasterID, instTraceFile),
      dcacheGen(*this, ".dside", dcachePort, dataMasterID, dataTraceFile,
                params->sizeROB, params->sizeStoreBuffer,
                params->sizeLoadBuffer),
      icacheNextEvent(this),
      dcacheNextEvent(this),
      oneTraceComplete(false),
      firstFetchTick(0),
      execCompleteEvent(nullptr)
{
    // Increment the static counter for the number of Trace CPUs.
    ++TraceCPU::numTraceCPUs;

    // Check that the python parameters for the sizes of ROB, store buffer
    // and load buffer do not overflow the corresponding C++ variables.
    fatal_if(params->sizeROB > UINT16_MAX, "ROB size set to %d exceeds the "
             "max. value of %d.\n", params->sizeROB, UINT16_MAX);
    fatal_if(params->sizeStoreBuffer > UINT16_MAX, "Store buffer size set "
             "to %d exceeds the max. value of %d.\n",
             params->sizeStoreBuffer, UINT16_MAX);
    fatal_if(params->sizeLoadBuffer > UINT16_MAX, "Load buffer size set to"
             " %d exceeds the max. value of %d.\n",
             params->sizeLoadBuffer, UINT16_MAX);
}

TraceCPU::~TraceCPU()
{

}

TraceCPU*
TraceCPUParams::create()
{
    return new TraceCPU(this);
}

void
TraceCPU::takeOverFrom(BaseCPU *oldCPU)
{
    // Unbind the ports of the old CPU and bind the ports of the TraceCPU.
    assert(!getInstPort().isConnected());
    assert(oldCPU->getInstPort().isConnected());
    BaseSlavePort &inst_peer_port = oldCPU->getInstPort().getSlavePort();
    oldCPU->getInstPort().unbind();
    getInstPort().bind(inst_peer_port);

    assert(!getDataPort().isConnected());
    assert(oldCPU->getDataPort().isConnected());
    BaseSlavePort &data_peer_port = oldCPU->getDataPort().getSlavePort();
    oldCPU->getDataPort().unbind();
    getDataPort().bind(data_peer_port);
}

void
TraceCPU::init()
{
    DPRINTF(TraceCPUInst, "Instruction fetch request trace file is \"%s\"."
            "\n", instTraceFile);
    DPRINTF(TraceCPUData, "Data memory request trace file is \"%s\".\n",
            dataTraceFile);

    BaseCPU::init();

    // Get the send tick of the first instruction read request and schedule
    // icacheNextEvent at that tick.
    Tick first_icache_tick = icacheGen.init();
    schedule(icacheNextEvent, first_icache_tick);

    // Get the send tick of the first data read/write request and schedule
    // dcacheNextEvent at that tick.
    Tick first_dcache_tick = dcacheGen.init();
    schedule(dcacheNextEvent, first_dcache_tick);

    // The static counter for the number of Trace CPUs is correctly set at
    // this point so create an event and pass it.
    execCompleteEvent = new CountedExitEvent("end of all traces reached.",
                                             numTraceCPUs);
    // Save the first fetch request tick to dump it as tickOffset.
    firstFetchTick = first_icache_tick;
}
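// Illustrative note (added, not in the original source): execCompleteEvent
// implements the shared exit logic. With, say, numTraceCPUs == 2, each Trace
// CPU schedules its CountedExitEvent once when both of its traces finish;
// the event decrements the shared counter, so the simulation only exits
// after the second (last) Trace CPU has scheduled it.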
void
TraceCPU::schedIcacheNext()
{
    DPRINTF(TraceCPUInst, "IcacheGen event.\n");

    // Try to send the current packet or a retry packet if there is one
    bool sched_next = icacheGen.tryNext();
    // If the packet was sent successfully, schedule the next event
    if (sched_next) {
        DPRINTF(TraceCPUInst, "Scheduling next icacheGen event "
                "at %d.\n", curTick() + icacheGen.tickDelta());
        schedule(icacheNextEvent, curTick() + icacheGen.tickDelta());
        ++numSchedIcacheEvent;
    } else {
        // Check if the trace is complete. If not, do nothing because
        // sending failed and the next event will be scheduled via
        // recvReqRetry().
        if (icacheGen.isTraceComplete()) {
            // If this is the first trace to complete, set the variable. If
            // it is already set then both traces are complete and we can
            // exit the simulation.
            checkAndSchedExitEvent();
        }
    }
}

void
TraceCPU::schedDcacheNext()
{
    DPRINTF(TraceCPUData, "DcacheGen event.\n");

    dcacheGen.execute();
    if (dcacheGen.isExecComplete()) {
        checkAndSchedExitEvent();
    }
}

void
TraceCPU::checkAndSchedExitEvent()
{
    if (!oneTraceComplete) {
        oneTraceComplete = true;
    } else {
        // Schedule event to indicate execution is complete as both
        // instruction and data access traces have been played back.
        inform("%s: Execution complete.\n", name());

        // Record stats which are computed at the end of simulation
        tickOffset = firstFetchTick;
        numCycles = (clockEdge() - firstFetchTick) / clockPeriod();
        numOps = dcacheGen.getMicroOpCount();
        schedule(*execCompleteEvent, curTick());
    }
}

void
TraceCPU::regStats()
{

    BaseCPU::regStats();

    numSchedDcacheEvent
        .name(name() + ".numSchedDcacheEvent")
        .desc("Number of events scheduled to trigger data request generator")
        ;

    numSchedIcacheEvent
        .name(name() + ".numSchedIcacheEvent")
        .desc("Number of events scheduled to trigger instruction request "
              "generator")
        ;

    numOps
        .name(name() + ".numOps")
        .desc("Number of micro-ops simulated by the Trace CPU")
        ;

    cpi
        .name(name() + ".cpi")
        .desc("Cycles per micro-op used as a proxy for CPI")
        .precision(6)
        ;
    cpi = numCycles / numOps;

    tickOffset
        .name(name() + ".tickOffset")
        .desc("The first execution tick for the root node of elastic traces")
        ;

    icacheGen.regStats();
    dcacheGen.regStats();
}

void
TraceCPU::ElasticDataGen::regStats()
{
    using namespace Stats;

    maxDependents
        .name(name() + ".maxDependents")
        .desc("Max number of dependents observed on a node")
        ;

    maxReadyListSize
        .name(name() + ".maxReadyListSize")
        .desc("Max size of the ready list observed")
        ;

    numSendAttempted
        .name(name() + ".numSendAttempted")
        .desc("Number of first attempts to send a request")
        ;

    numSendSucceeded
        .name(name() + ".numSendSucceeded")
        .desc("Number of successful first attempts")
        ;

    numSendFailed
        .name(name() + ".numSendFailed")
        .desc("Number of failed first attempts")
        ;

    numRetrySucceeded
        .name(name() + ".numRetrySucceeded")
        .desc("Number of successful retries")
        ;

    numSplitReqs
        .name(name() + ".numSplitReqs")
        .desc("Number of split requests")
        ;

    numSOLoads
        .name(name() + ".numSOLoads")
        .desc("Number of strictly ordered loads")
        ;

    numSOStores
        .name(name() + ".numSOStores")
        .desc("Number of strictly ordered stores")
        ;

    dataLastTick
        .name(name() + ".dataLastTick")
        .desc("Last tick simulated from the elastic data trace")
        ;
}
Tick
TraceCPU::ElasticDataGen::init()
{
    DPRINTF(TraceCPUData, "Initializing data memory request generator "
            "DcacheGen: elastic issue with retry.\n");

    if (!readNextWindow())
        panic("Trace has %d elements. It must have at least %d elements.\n",
              depGraph.size(), 2 * windowSize);
    DPRINTF(TraceCPUData, "After 1st read, depGraph size:%d.\n",
            depGraph.size());

    if (!readNextWindow())
        panic("Trace has %d elements. It must have at least %d elements.\n",
              depGraph.size(), 2 * windowSize);
    DPRINTF(TraceCPUData, "After 2nd read, depGraph size:%d.\n",
            depGraph.size());

    // Print readyList
    if (DTRACE(TraceCPUData)) {
        printReadyList();
    }
    auto free_itr = readyList.begin();
    DPRINTF(TraceCPUData, "Execute tick of the first dependency free node "
            "%lli is %d.\n", free_itr->seqNum, free_itr->execTick);
    // Return the execute tick of the earliest ready node so that an event
    // can be scheduled to call execute().
    return (free_itr->execTick);
}

void
TraceCPU::ElasticDataGen::exit()
{
    trace.reset();
}

bool
TraceCPU::ElasticDataGen::readNextWindow()
{

    // Read and add the next window of nodes
    DPRINTF(TraceCPUData, "Reading next window from file.\n");

    if (traceComplete) {
        // We are at the end of the file, thus we have no more records.
        // Return false.
        return false;
    }

    DPRINTF(TraceCPUData, "Start read: Size of depGraph is %d.\n",
            depGraph.size());

    uint32_t num_read = 0;
    while (num_read != windowSize) {

        // Create a new graph node
        GraphNode* new_node = new GraphNode;

        // Read the next line to get the next record. If that fails then the
        // end of trace has been reached and traceComplete needs to be set
        // in addition to returning false.
        if (!trace.read(new_node)) {
            DPRINTF(TraceCPUData, "\tTrace complete!\n");
            traceComplete = true;
            delete new_node;
            return false;
        }

        // Annotate the ROB dependencies of the new node onto the parent
        // nodes.
        addDepsOnParent(new_node, new_node->robDep, new_node->numRobDep);
        // Annotate the register dependencies of the new node onto the
        // parent nodes.
        addDepsOnParent(new_node, new_node->regDep, new_node->numRegDep);

        num_read++;
        // Add to map
        depGraph[new_node->seqNum] = new_node;
        if (new_node->numRobDep == 0 && new_node->numRegDep == 0) {
            // Source dependencies are already complete, check if resources
            // are available and issue. The execution time is approximated
            // to current time plus the computational delay.
            checkAndIssue(new_node);
        }
    }

    DPRINTF(TraceCPUData, "End read: Size of depGraph is %d.\n",
            depGraph.size());
    return true;
}
template<typename T> void
TraceCPU::ElasticDataGen::addDepsOnParent(GraphNode *new_node,
                                          T& dep_array, uint8_t& num_dep)
{
    for (auto& a_dep : dep_array) {
        // The convention is to set the dependencies starting with the first
        // index in the ROB and register dependency arrays. Thus, when we
        // reach a dependency equal to the initialisation value of zero, we
        // know we have iterated over all dependencies and can break.
        if (a_dep == 0)
            break;
        // We look up the valid dependency, i.e. the parent of this node
        auto parent_itr = depGraph.find(a_dep);
        if (parent_itr != depGraph.end()) {
            // If the parent is found, it is yet to be executed. Append a
            // pointer to the new node to the dependents list of the parent
            // node.
            parent_itr->second->dependents.push_back(new_node);
            auto num_depts = parent_itr->second->dependents.size();
            maxDependents = std::max<double>(num_depts,
                                             maxDependents.value());
        } else {
            // The dependency is not found in the graph. So consider the
            // execution of the parent complete, i.e. remove this
            // dependency.
            a_dep = 0;
            num_dep--;
        }
    }
}
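// Illustrative example (added, not in the original source): suppose a newly
// read node has robDep = {42, 57, 0, ...}. The loop above links it as a
// dependent of nodes 42 and 57 if they are still in depGraph. If node 42
// has already completed and left the graph, its entry is zeroed and num_dep
// drops, so the new node becomes dependency-free as soon as 57 completes.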
void
TraceCPU::ElasticDataGen::execute()
{
    DPRINTF(TraceCPUData, "Execute start occupancy:\n");
    DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, "
             "depFreeQueue = %d ,", depGraph.size(), readyList.size(),
             depFreeQueue.size());
    hwResource.printOccupancy();

    // Read the next window to make sure that dependents of all dep-free
    // nodes are in the depGraph
    if (nextRead) {
        readNextWindow();
        nextRead = false;
    }

    // First attempt to issue the pending dependency-free nodes held
    // in depFreeQueue. If resources have become available for a node,
    // then issue it, i.e. add the node to readyList.
    while (!depFreeQueue.empty()) {
        if (checkAndIssue(depFreeQueue.front(), false)) {
            DPRINTF(TraceCPUData, "Removing from depFreeQueue: seq. num "
                    "%lli.\n", (depFreeQueue.front())->seqNum);
            depFreeQueue.pop();
        } else {
            break;
        }
    }
    // Proceed to execute from readyList
    auto graph_itr = depGraph.begin();
    auto free_itr = readyList.begin();
    // Iterate through readyList until the end of readyList is reached or
    // the next free node has its execute tick later than curTick. Note that
    // the end-of-list check must come first to avoid dereferencing the end
    // iterator.
    while (free_itr != readyList.end() && free_itr->execTick <= curTick()) {

        // Get pointer to the node to be executed
        graph_itr = depGraph.find(free_itr->seqNum);
        assert(graph_itr != depGraph.end());
        GraphNode* node_ptr = graph_itr->second;

        // If there is a retryPkt send that, else execute the load
        if (retryPkt) {
            // The retryPkt must be the request that was created by the
            // first node in the readyList.
            if (retryPkt->req->getReqInstSeqNum() != node_ptr->seqNum) {
                panic("Retry packet's sequence number does not match "
                      "the first node in the readyList.\n");
            }
            if (port.sendTimingReq(retryPkt)) {
                ++numRetrySucceeded;
                retryPkt = nullptr;
            }
        } else if (node_ptr->isLoad() || node_ptr->isStore()) {
            // If there is no retryPkt, attempt to send a memory request in
            // case of a load or store node. If the send fails,
            // executeMemReq() returns a packet pointer, which we save in
            // retryPkt. In case of a comp node we don't do anything and
            // simply continue as if the execution of the comp node
            // succeeded.
            retryPkt = executeMemReq(node_ptr);
        }
        // If the retryPkt or a new load/store node failed, we exit from
        // here as a retry from cache will bring the control back to
        // execute(). The first node in readyList will then be the failed
        // node.
        if (retryPkt) {
            break;
        }

        // Proceed to remove dependencies for the successfully executed
        // node. If it is a load which is not strictly ordered and we sent a
        // request for it successfully, we do not yet mark any register
        // dependencies complete. But as per dependency modelling we need
        // to mark ROB dependencies of load and non load/store nodes which
        // are based on successful sending of the load as complete.
        if (node_ptr->isLoad() && !node_ptr->isStrictlyOrdered()) {
            // If execute succeeded mark its dependents as complete
            DPRINTF(TraceCPUData, "Node seq. num %lli sent. Waking up "
                    "dependents..\n", node_ptr->seqNum);

            auto child_itr = (node_ptr->dependents).begin();
            while (child_itr != (node_ptr->dependents).end()) {
                // ROB dependency of a store on a load must not be removed
                // after the load is sent but after its response is received
                if (!(*child_itr)->isStore() &&
                    (*child_itr)->removeRobDep(node_ptr->seqNum)) {

                    // Check if the child node has become dependency free
                    if ((*child_itr)->numRobDep == 0 &&
                        (*child_itr)->numRegDep == 0) {

                        // Source dependencies are complete, check if
                        // resources are available and issue
                        checkAndIssue(*child_itr);
                    }
                    // Remove this child for the sent load and point to new
                    // location of the element following the erased element
                    child_itr = node_ptr->dependents.erase(child_itr);
                } else {
                    // This child is not dependency-free, point to the next
                    // child
                    child_itr++;
                }
            }
        } else {
            // If it is a strictly ordered load mark its dependents as
            // complete as we do not send a request for this case. If it is
            // a store or a comp node we also mark all its dependents
            // complete.
            DPRINTF(TraceCPUData, "Node seq. num %lli done. Waking"
                    " up dependents..\n", node_ptr->seqNum);

            for (auto child : node_ptr->dependents) {
                // If the child node is dependency free removeDepOnInst()
                // returns true.
                if (child->removeDepOnInst(node_ptr->seqNum)) {
                    // Source dependencies are complete, check if resources
                    // are available and issue
                    checkAndIssue(child);
                }
            }
        }

        // After executing the node, remove from readyList and delete node.
        readyList.erase(free_itr);
        // If it is a cacheable load which was sent, don't delete
        // just yet. Delete it in completeMemAccess() after the
        // response is received. If it is a strictly ordered
        // load, it was not sent and all dependencies were simply
        // marked complete. Thus it is safe to delete it. For
        // stores and non load/store nodes all dependencies were
        // marked complete so it is safe to delete it.
        if (!node_ptr->isLoad() || node_ptr->isStrictlyOrdered()) {
            // Release all resources occupied by the completed node
            hwResource.release(node_ptr);
            // clear the dynamically allocated set of dependents
            (node_ptr->dependents).clear();
            // delete node
            delete node_ptr;
            // remove from graph
            depGraph.erase(graph_itr);
        }
        // Point to first node to continue to next iteration of while loop
        free_itr = readyList.begin();
    } // end of while loop

    // Print readyList, sizes of queues and resource status after updating
    if (DTRACE(TraceCPUData)) {
        printReadyList();
        DPRINTF(TraceCPUData, "Execute end occupancy:\n");
        DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, "
                 "depFreeQueue = %d ,", depGraph.size(), readyList.size(),
                 depFreeQueue.size());
        hwResource.printOccupancy();
    }

    if (retryPkt) {
        DPRINTF(TraceCPUData, "Not scheduling an event as expecting a retry "
                "event from the cache for seq. num %lli.\n",
                retryPkt->req->getReqInstSeqNum());
        return;
    }
    // If the size of the dependency graph is less than the dependency
    // window then read from the trace file to populate the graph next time
    // we are in execute.
    if (depGraph.size() < windowSize && !traceComplete)
        nextRead = true;

    // If the cache is not blocked, schedule an event for the first execTick
    // in readyList else a retry from cache will schedule the event. If the
    // readyList is empty then check if the next pending node has resources
    // available to issue. If yes, then schedule an event for the next
    // cycle.
    if (!readyList.empty()) {
        Tick next_event_tick = std::max(readyList.begin()->execTick,
                                        curTick());
        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
                next_event_tick);
        owner.schedDcacheNextEvent(next_event_tick);
    } else if (readyList.empty() && !depFreeQueue.empty() &&
               hwResource.isAvailable(depFreeQueue.front())) {
        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
                owner.clockEdge(Cycles(1)));
        owner.schedDcacheNextEvent(owner.clockEdge(Cycles(1)));
    }

    // If the trace is completely read, readyList is empty and depGraph is
    // empty, set execComplete to true
    if (depGraph.empty() && readyList.empty() && traceComplete &&
        !hwResource.awaitingResponse()) {
        DPRINTF(TraceCPUData, "\tExecution Complete!\n");
        execComplete = true;
        dataLastTick = curTick();
    }
}
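// Illustrative retry flow (added note, sequence number hypothetical): if the
// request for node 88 fails in executeMemReq(), the packet is kept in
// retryPkt and node 88 stays at the head of readyList; no new event is
// scheduled. When the cache later calls DcachePort::recvReqRetry(),
// dcacheRetryRecvd() schedules dcacheNextEvent at curTick(), execute() runs
// again, and the retryPkt branch above resends the same packet before any
// younger node is allowed to issue.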
PacketPtr
TraceCPU::ElasticDataGen::executeMemReq(GraphNode* node_ptr)
{

    DPRINTF(TraceCPUData, "Executing memory request %lli (addr %d, pc %#x, "
            "size %d, flags %d).\n", node_ptr->seqNum, node_ptr->addr,
            node_ptr->pc, node_ptr->size, node_ptr->flags);

    // If the request is strictly ordered, do not send it. Just return
    // nullptr as if it was successfully sent.
    if (node_ptr->isStrictlyOrdered()) {
        node_ptr->isLoad() ? ++numSOLoads : ++numSOStores;
        DPRINTF(TraceCPUData, "Skipping strictly ordered request %lli.\n",
                node_ptr->seqNum);
        return nullptr;
    }

    // Check if the request spans two cache lines as this condition triggers
    // an assert fail in the L1 cache. If it does then truncate the size to
    // access only until the end of that line and ignore the remainder. The
    // stat counting this is useful to keep a check on how frequently this
    // happens. If required the code could be revised to mimic splitting
    // such a request into two.
    unsigned blk_size = owner.cacheLineSize();
    Addr blk_offset = (node_ptr->addr & (Addr)(blk_size - 1));
    if (!(blk_offset + node_ptr->size <= blk_size)) {
        node_ptr->size = blk_size - blk_offset;
        ++numSplitReqs;
    }

    // Create a request and the packet containing the request
    Request* req = new Request(node_ptr->addr, node_ptr->size,
                               node_ptr->flags, masterID, node_ptr->seqNum,
                               ContextID(0), ThreadID(0));
    req->setPC(node_ptr->pc);
    PacketPtr pkt;
    uint8_t* pkt_data = new uint8_t[req->getSize()];
    if (node_ptr->isLoad()) {
        pkt = Packet::createRead(req);
    } else {
        pkt = Packet::createWrite(req);
        memset(pkt_data, 0xA, req->getSize());
    }
    pkt->dataDynamic(pkt_data);

    // Call the MasterPort method to send a timing request for this packet
    bool success = port.sendTimingReq(pkt);
    ++numSendAttempted;

    if (!success) {
        // If it fails, return the packet to retry when a retry is signalled
        // by the cache
        ++numSendFailed;
        DPRINTF(TraceCPUData, "Send failed. Saving packet for retry.\n");
        return pkt;
    } else {
        // If it succeeds, return nullptr
        ++numSendSucceeded;
        return nullptr;
    }
}
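// Worked example of the truncation above (added note, illustrative values):
// with a 64-byte cache line, an 8-byte access at addr 0x107C has
// blk_offset = 0x107C & 0x3F = 60, and 60 + 8 > 64, so the access is
// truncated to 64 - 60 = 4 bytes and numSplitReqs is incremented; the
// remaining 4 bytes on the next line are simply not requested.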
bool
TraceCPU::ElasticDataGen::checkAndIssue(const GraphNode* node_ptr,
                                        bool first)
{
    // Assert the node is dependency-free
    assert(node_ptr->numRobDep == 0 && node_ptr->numRegDep == 0);

    // If this is the first attempt, print a debug message to indicate this.
    if (first) {
        DPRINTFR(TraceCPUData, "\t\tseq. num %lli(%s) with rob num %lli is "
                 "now dependency free.\n", node_ptr->seqNum,
                 node_ptr->typeToStr(), node_ptr->robNum);
    }

    // Check if resources are available to issue the specific node
    if (hwResource.isAvailable(node_ptr)) {
        // If resources are free only then add to readyList
        DPRINTFR(TraceCPUData, "\t\tResources available for seq. num %lli. "
                 "Adding to readyList, occupying resources.\n",
                 node_ptr->seqNum);
        // Compute the execute tick by adding the compute delay for the node
        // and add the ready node to the ready list
        addToSortedReadyList(node_ptr->seqNum,
                             owner.clockEdge() + node_ptr->compDelay);
        // Account for the resources taken up by this issued node.
        hwResource.occupy(node_ptr);
        return true;

    } else {
        if (first) {
            // Although dependencies are complete, resources are not
            // available.
            DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num "
                     "%lli. Adding to depFreeQueue.\n", node_ptr->seqNum);
            depFreeQueue.push(node_ptr);
        } else {
            DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num "
                     "%lli. Still pending issue.\n", node_ptr->seqNum);
        }
        return false;
    }
}

void
TraceCPU::ElasticDataGen::completeMemAccess(PacketPtr pkt)
{
    // Release the resources for this completed node.
    if (pkt->isWrite()) {
        // Consider store complete.
        hwResource.releaseStoreBuffer();
        // If it is a store response then do nothing since we do not model
        // dependencies on store completion in the trace. But if we were
        // blocking execution due to store buffer fullness, we need to
        // schedule an event and attempt to progress.
    } else {
        // If it is a load response then release the dependents waiting on
        // it. Get pointer to the completed load.
        auto graph_itr = depGraph.find(pkt->req->getReqInstSeqNum());
        assert(graph_itr != depGraph.end());
        GraphNode* node_ptr = graph_itr->second;

        // Release resources occupied by the load
        hwResource.release(node_ptr);

        DPRINTF(TraceCPUData, "Load seq. num %lli response received. Waking"
                " up dependents..\n", node_ptr->seqNum);

        for (auto child : node_ptr->dependents) {
            if (child->removeDepOnInst(node_ptr->seqNum)) {
                checkAndIssue(child);
            }
        }

        // clear the dynamically allocated set of dependents
        (node_ptr->dependents).clear();
        // delete node
        delete node_ptr;
        // remove from graph
        depGraph.erase(graph_itr);
    }

    if (DTRACE(TraceCPUData)) {
        printReadyList();
    }

    // If the size of the dependency graph is less than the dependency
    // window then read from the trace file to populate the graph next time
    // we are in execute.
    if (depGraph.size() < windowSize && !traceComplete)
        nextRead = true;

    // If not waiting for retry, attempt to schedule next event
    if (!retryPkt) {
        // We might have new dep-free nodes in the list which will have
        // execute tick greater than or equal to curTick. But a new dep-free
        // node might have its execute tick earlier. Therefore, attempt to
        // reschedule. It could happen that the readyList is empty and we
        // got here via a last remaining response. So, either the trace is
        // complete or there are pending nodes in the depFreeQueue. The
        // checking is done in the execute() control flow, so schedule an
        // event to go via that flow.
        Tick next_event_tick = readyList.empty() ?
            owner.clockEdge(Cycles(1)) :
            std::max(readyList.begin()->execTick,
                     owner.clockEdge(Cycles(1)));
        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
                next_event_tick);
        owner.schedDcacheNextEvent(next_event_tick);
    }
}
void
TraceCPU::ElasticDataGen::addToSortedReadyList(NodeSeqNum seq_num,
                                               Tick exec_tick)
{
    ReadyNode ready_node;
    ready_node.seqNum = seq_num;
    ready_node.execTick = exec_tick;

    // Iterator to readyList
    auto itr = readyList.begin();

    // If the readyList is empty, simply insert the new node at the
    // beginning and return
    if (itr == readyList.end()) {
        readyList.insert(itr, ready_node);
        maxReadyListSize = std::max<double>(readyList.size(),
                                            maxReadyListSize.value());
        return;
    }

    // If the first node in the list is the one that failed to execute, i.e.
    // it created the pending retryPkt, skip past it so that its position at
    // the head of the list is maintained.
    if (retryPkt)
        if (retryPkt->req->getReqInstSeqNum() == itr->seqNum)
            itr++;

    // Increment the iterator and compare the node pointed to by it to the
    // new node till the position to insert the new node is found.
    bool found = false;
    while (!found && itr != readyList.end()) {
        // If the execution tick of the new node is less than the node then
        // this is the position to insert
        if (exec_tick < itr->execTick)
            found = true;
        // If the execution tick of the new node is equal to the node then
        // sort in ascending order of sequence numbers
        else if (exec_tick == itr->execTick) {
            // If the sequence number of the new node is less than the node
            // then this is the position to insert
            if (seq_num < itr->seqNum)
                found = true;
            // Else go to next node
            else
                itr++;
        }
        // If the execution tick of the new node is greater than the node
        // then go to the next node
        else
            itr++;
    }
    readyList.insert(itr, ready_node);
    // Update the stat for max size reached of the readyList
    maxReadyListSize = std::max<double>(readyList.size(),
                                        maxReadyListSize.value());
}
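// Worked example (added note, illustrative values): inserting nodes with
// (seqNum, execTick) pairs (12, 1000), (10, 1500), (14, 1000) in that order
// yields readyList = [(12, 1000), (14, 1000), (10, 1500)]: the primary sort
// key is execTick, with ties broken by ascending seqNum. If node 12 had a
// pending retryPkt, even a new node with an earlier tick would be inserted
// behind it, keeping the failed node at the head for resending.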
void
TraceCPU::ElasticDataGen::printReadyList()
{
    auto itr = readyList.begin();
    if (itr == readyList.end()) {
        DPRINTF(TraceCPUData, "readyList is empty.\n");
        return;
    }
    DPRINTF(TraceCPUData, "Printing readyList:\n");
    while (itr != readyList.end()) {
        auto graph_itr = depGraph.find(itr->seqNum);
        GraphNode* node_ptr M5_VAR_USED = graph_itr->second;
        DPRINTFR(TraceCPUData, "\t%lld(%s), %lld\n", itr->seqNum,
                 node_ptr->typeToStr(), itr->execTick);
        itr++;
    }
}

TraceCPU::ElasticDataGen::HardwareResource::HardwareResource(
    uint16_t max_rob, uint16_t max_stores, uint16_t max_loads)
    : sizeROB(max_rob),
      sizeStoreBuffer(max_stores),
      sizeLoadBuffer(max_loads),
      oldestInFlightRobNum(UINT64_MAX),
      numInFlightLoads(0),
      numInFlightStores(0)
{}

void
TraceCPU::ElasticDataGen::HardwareResource::occupy(const GraphNode* new_node)
{
    // Occupy the ROB entry for the issued node. Merely maintain the oldest
    // node, i.e. the numerically least robNum, by saving it in the variable
    // oldestInFlightRobNum.
    inFlightNodes[new_node->seqNum] = new_node->robNum;
    oldestInFlightRobNum = inFlightNodes.begin()->second;

    // Occupy a Load/Store Buffer entry for the issued node if applicable
    if (new_node->isLoad()) {
        ++numInFlightLoads;
    } else if (new_node->isStore()) {
        ++numInFlightStores;
    } // else if it is a non load/store node, no buffer entry is occupied

    printOccupancy();
}

void
TraceCPU::ElasticDataGen::HardwareResource::release(
    const GraphNode* done_node)
{
    assert(!inFlightNodes.empty());
    DPRINTFR(TraceCPUData, "\tClearing done seq. num %d from "
             "inFlightNodes..\n", done_node->seqNum);

    assert(inFlightNodes.find(done_node->seqNum) != inFlightNodes.end());
    inFlightNodes.erase(done_node->seqNum);

    if (inFlightNodes.empty()) {
        // If we delete the only in-flight node then oldestInFlightRobNum is
        // set back to its initialized (max) value.
        oldestInFlightRobNum = UINT64_MAX;
    } else {
        // Set the oldest in-flight node rob number equal to the first node
        // in the inFlightNodes since that will have the numerically least
        // value.
        oldestInFlightRobNum = inFlightNodes.begin()->second;
    }

    DPRINTFR(TraceCPUData, "\tCleared. inFlightNodes.size() = %d, "
             "oldestInFlightRobNum = %d\n", inFlightNodes.size(),
             oldestInFlightRobNum);

    // A store is considered complete when a request is sent, thus the ROB
    // entry is freed. But it occupies an entry in the Store Buffer until
    // its response is received. A load is considered complete when a
    // response is received, thus both ROB and Load Buffer entries can be
    // released.
    if (done_node->isLoad()) {
        assert(numInFlightLoads != 0);
        --numInFlightLoads;
    }
    // For normal writes, we send the requests out and clear a store buffer
    // entry on response. For writes which are strictly ordered, e.g. writes
    // to device registers, we do that within release() which is called when
    // the node is executed and taken off the readyList.
    if (done_node->isStore() && done_node->isStrictlyOrdered()) {
        releaseStoreBuffer();
    }
}

void
TraceCPU::ElasticDataGen::HardwareResource::releaseStoreBuffer()
{
    assert(numInFlightStores != 0);
    --numInFlightStores;
}
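// Timeline sketch (added note): for a normal cacheable store, occupy()
// takes a ROB entry and a store buffer entry at issue; release() frees the
// ROB entry once the request is sent, while releaseStoreBuffer() frees the
// store buffer entry only when the write response returns via
// completeMemAccess(). For a load, both the ROB and load buffer entries are
// held until the read response is received.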
bool
TraceCPU::ElasticDataGen::HardwareResource::isAvailable(
    const GraphNode* new_node) const
{
    uint16_t num_in_flight_nodes;
    if (inFlightNodes.empty()) {
        num_in_flight_nodes = 0;
        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num "
                 "%lli: #in-flight nodes = 0", new_node->seqNum);
    } else if (new_node->robNum > oldestInFlightRobNum) {
        // This is the intuitive case where the new dep-free node is a
        // younger instruction than the oldest instruction in-flight. Thus
        // we make sure num_in_flight_nodes does not overflow.
        num_in_flight_nodes = new_node->robNum - oldestInFlightRobNum;
        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num "
                 "%lli: #in-flight nodes = %d - %d = %d", new_node->seqNum,
                 new_node->robNum, oldestInFlightRobNum,
                 num_in_flight_nodes);
    } else {
        // This is the case where an instruction older than the oldest in-
        // flight instruction becomes dep-free. Thus we must have already
        // accounted for the entry in the ROB for this new dep-free node.
        // Immediately after this check returns true, oldestInFlightRobNum
        // will be updated in occupy(). We simply let this node issue now.
        num_in_flight_nodes = 0;
        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num "
                 "%lli: new oldestInFlightRobNum = %d, #in-flight nodes "
                 "ignored", new_node->seqNum, new_node->robNum);
    }
    DPRINTFR(TraceCPUData, ", LQ = %d/%d, SQ = %d/%d.\n",
             numInFlightLoads, sizeLoadBuffer,
             numInFlightStores, sizeStoreBuffer);
    // Check if resources are available to issue the specific node
    if (num_in_flight_nodes >= sizeROB) {
        return false;
    }
    if (new_node->isLoad() && numInFlightLoads >= sizeLoadBuffer) {
        return false;
    }
    if (new_node->isStore() && numInFlightStores >= sizeStoreBuffer) {
        return false;
    }
    return true;
}
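// Worked example (added note, illustrative values): with sizeROB = 40,
// oldestInFlightRobNum = 100 and a new dep-free node with robNum = 130, the
// ROB holds 130 - 100 = 30 in-flight entries, so the check passes. A node
// with robNum = 145 would see 45 >= 40 and be held back in depFreeQueue
// until the oldest in-flight node retires and oldestInFlightRobNum moves
// up.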
bool
TraceCPU::ElasticDataGen::HardwareResource::awaitingResponse() const
{
    // Return true if there is at least one read or write request in flight
    return (numInFlightStores != 0 || numInFlightLoads != 0);
}

void
TraceCPU::ElasticDataGen::HardwareResource::printOccupancy()
{
    DPRINTFR(TraceCPUData, "oldestInFlightRobNum = %d, "
             "LQ = %d/%d, SQ = %d/%d.\n",
             oldestInFlightRobNum,
             numInFlightLoads, sizeLoadBuffer,
             numInFlightStores, sizeStoreBuffer);
}

void
TraceCPU::FixedRetryGen::regStats()
{
    using namespace Stats;

    numSendAttempted
        .name(name() + ".numSendAttempted")
        .desc("Number of first attempts to send a request")
        ;

    numSendSucceeded
        .name(name() + ".numSendSucceeded")
        .desc("Number of successful first attempts")
        ;

    numSendFailed
        .name(name() + ".numSendFailed")
        .desc("Number of failed first attempts")
        ;

    numRetrySucceeded
        .name(name() + ".numRetrySucceeded")
        .desc("Number of successful retries")
        ;

    instLastTick
        .name(name() + ".instLastTick")
        .desc("Last tick simulated from the fixed inst trace")
        ;
}

Tick
TraceCPU::FixedRetryGen::init()
{
    DPRINTF(TraceCPUInst, "Initializing instruction fetch request generator"
            " IcacheGen: fixed issue with retry.\n");

    if (nextExecute()) {
        DPRINTF(TraceCPUInst, "\tFirst tick = %d.\n", currElement.tick);
        return currElement.tick;
    } else {
        panic("Read of first message in the trace failed.\n");
        return MaxTick;
    }
}

bool
TraceCPU::FixedRetryGen::tryNext()
{
    // If there is a retry packet, try to send it
    if (retryPkt) {

        DPRINTF(TraceCPUInst, "Trying to send retry packet.\n");

        if (!port.sendTimingReq(retryPkt)) {
            // Still blocked! This should never occur.
            DPRINTF(TraceCPUInst, "Retry packet sending failed.\n");
            return false;
        }
        ++numRetrySucceeded;
    } else {

        DPRINTF(TraceCPUInst, "Trying to send packet for currElement.\n");

        // try sending current element
        assert(currElement.isValid());

        ++numSendAttempted;

        if (!send(currElement.addr, currElement.blocksize,
                  currElement.cmd, currElement.flags, currElement.pc)) {
            DPRINTF(TraceCPUInst, "currElement sending failed.\n");
            ++numSendFailed;
            // return false to indicate not to schedule next event
            return false;
        } else {
            ++numSendSucceeded;
        }
    }
    // If the packet, either retryPkt or currElement, was sent successfully,
    // return true to indicate that an event should be scheduled at the
    // current tick plus delta. If it was sent successfully but there is no
    // next packet to send, return false.
    DPRINTF(TraceCPUInst, "Packet sent successfully, trying to read next "
            "element.\n");
    retryPkt = nullptr;
    // Read the next element into currElement; currElement gets cleared so
    // save the tick to calculate the delta.
    Tick last_tick = currElement.tick;
    if (nextExecute()) {
        assert(currElement.tick >= last_tick);
        delta = currElement.tick - last_tick;
    }
    return !traceComplete;
}
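// Timing sketch (added note, illustrative values): if the element just sent
// was recorded at tick 1000 and the next trace element at tick 1500, delta
// becomes 500; schedIcacheNext() then schedules the next icacheNextEvent at
// curTick() + tickDelta(), replaying fetches with the fixed inter-request
// gaps captured in the trace.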
void
TraceCPU::FixedRetryGen::exit()
{
    trace.reset();
}

bool
TraceCPU::FixedRetryGen::nextExecute()
{
    if (traceComplete)
        // We are at the end of the file, thus we have no more messages.
        // Return false.
        return false;

    // Reset the currElement to the default values
    currElement.clear();

    // Read the next line to get the next message. If that fails then the
    // end of trace has been reached and traceComplete needs to be set in
    // addition to returning false. If successful then the next message is
    // in currElement.
    if (!trace.read(&currElement)) {
        traceComplete = true;
        instLastTick = curTick();
        return false;
    }

    DPRINTF(TraceCPUInst, "inst fetch: %c addr %d pc %#x size %d tick %d\n",
            currElement.cmd.isRead() ? 'r' : 'w',
            currElement.addr,
            currElement.pc,
            currElement.blocksize,
            currElement.tick);

    return true;
}

bool
TraceCPU::FixedRetryGen::send(Addr addr, unsigned size, const MemCmd& cmd,
                              Request::FlagsType flags, Addr pc)
{

    // Create new request
    Request* req = new Request(addr, size, flags, masterID);
    req->setPC(pc);

    // If this is not done it triggers an assert in the L1 cache for an
    // invalid contextId
    req->setThreadContext(ContextID(0), ThreadID(0));

    // Embed it in a packet
    PacketPtr pkt = new Packet(req, cmd);

    uint8_t* pkt_data = new uint8_t[req->getSize()];
    pkt->dataDynamic(pkt_data);

    if (cmd.isWrite()) {
        memset(pkt_data, 0xA, req->getSize());
    }

    // Call the MasterPort method to send a timing request for this packet
    bool success = port.sendTimingReq(pkt);
    if (!success) {
        // If it fails, save the packet to retry when a retry is signalled
        // by the cache
        retryPkt = pkt;
    }
    return success;
}

void
TraceCPU::icacheRetryRecvd()
{
    // Schedule an event to go through the control flow in the same tick as
    // the retry is received
    DPRINTF(TraceCPUInst, "Icache retry received. Scheduling next IcacheGen"
            " event @%lli.\n", curTick());
    schedule(icacheNextEvent, curTick());
}

void
TraceCPU::dcacheRetryRecvd()
{
    // Schedule an event to go through the execute flow in the same tick as
    // the retry is received
    DPRINTF(TraceCPUData, "Dcache retry received. Scheduling next DcacheGen"
            " event @%lli.\n", curTick());
    schedule(dcacheNextEvent, curTick());
}

void
TraceCPU::schedDcacheNextEvent(Tick when)
{
    if (!dcacheNextEvent.scheduled()) {
        DPRINTF(TraceCPUData, "Scheduling next DcacheGen event at %lli.\n",
                when);
        schedule(dcacheNextEvent, when);
        ++numSchedDcacheEvent;
    } else if (when < dcacheNextEvent.when()) {
        DPRINTF(TraceCPUData, "Re-scheduling next dcache event from %lli"
                " to %lli.\n", dcacheNextEvent.when(), when);
        reschedule(dcacheNextEvent, when);
    }
}

bool
TraceCPU::IcachePort::recvTimingResp(PacketPtr pkt)
{
    // All responses on the instruction fetch side are ignored. Simply
    // delete the request and packet to free the allocated memory.
    delete pkt->req;
    delete pkt;

    return true;
}

void
TraceCPU::IcachePort::recvReqRetry()
{
    owner->icacheRetryRecvd();
}

void
TraceCPU::dcacheRecvTimingResp(PacketPtr pkt)
{
    DPRINTF(TraceCPUData, "Received timing response from Dcache.\n");
    dcacheGen.completeMemAccess(pkt);
}

bool
TraceCPU::DcachePort::recvTimingResp(PacketPtr pkt)
{
    // Handle the responses for data memory requests which is done inside
    // the elastic data generator
    owner->dcacheRecvTimingResp(pkt);
    // After processing the response delete the request and packet to free
    // memory
    delete pkt->req;
    delete pkt;

    return true;
}

void
TraceCPU::DcachePort::recvReqRetry()
{
    owner->dcacheRetryRecvd();
}

TraceCPU::ElasticDataGen::InputStream::InputStream(
    const std::string& filename)
    : trace(filename),
      microOpCount(0)
{
    // Create a protobuf message for the header and read it from the stream.
    ProtoMessage::InstDepRecordHeader header_msg;
    if (!trace.read(header_msg)) {
        panic("Failed to read packet header from %s\n", filename);
    }

    // The tick frequency check was previously unreachable as it sat after
    // the panic in the read-failure branch; it belongs on the success path.
    if (header_msg.tick_freq() != SimClock::Frequency) {
        panic("Trace %s was recorded with a different tick frequency %d\n",
              filename, header_msg.tick_freq());
    }

    // Assign the window size equal to the field in the trace that was
    // recorded when the data dependency trace was captured in the o3cpu
    // model.
    windowSize = header_msg.window_size();
}

void
TraceCPU::ElasticDataGen::InputStream::reset()
{
    trace.reset();
}
bool
TraceCPU::ElasticDataGen::InputStream::read(GraphNode* element)
{
    ProtoMessage::InstDepRecord pkt_msg;
    if (trace.read(pkt_msg)) {
        // Required fields
        element->seqNum = pkt_msg.seq_num();
        element->type = pkt_msg.type();
        element->compDelay = pkt_msg.comp_delay();

        // Repeated field robDepList
        element->clearRobDep();
        assert((pkt_msg.rob_dep()).size() <= element->maxRobDep);
        for (int i = 0; i < (pkt_msg.rob_dep()).size(); i++) {
            element->robDep[element->numRobDep] = pkt_msg.rob_dep(i);
            element->numRobDep += 1;
        }

        // Repeated field regDepList
        element->clearRegDep();
        assert((pkt_msg.reg_dep()).size() <= TheISA::MaxInstSrcRegs);
        for (int i = 0; i < (pkt_msg.reg_dep()).size(); i++) {
            // There is a possibility that an instruction has both a
            // register and an order dependency on another instruction. In
            // such a case, the register dependency is omitted.
            bool duplicate = false;
            for (int j = 0; j < element->numRobDep; j++) {
                duplicate |= (pkt_msg.reg_dep(i) == element->robDep[j]);
            }
            if (!duplicate) {
                element->regDep[element->numRegDep] = pkt_msg.reg_dep(i);
                element->numRegDep += 1;
            }
        }

        // Optional fields
        if (pkt_msg.has_addr())
            element->addr = pkt_msg.addr();
        else
            element->addr = 0;

        if (pkt_msg.has_size())
            element->size = pkt_msg.size();
        else
            element->size = 0;

        if (pkt_msg.has_flags())
            element->flags = pkt_msg.flags();
        else
            element->flags = 0;

        if (pkt_msg.has_pc())
            element->pc = pkt_msg.pc();
        else
            element->pc = 0;

        // ROB occupancy number
        ++microOpCount;
        if (pkt_msg.has_weight()) {
            microOpCount += pkt_msg.weight();
        }
        element->robNum = microOpCount;
        return true;
    }

    // We have reached the end of the file
    return false;
}
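// Worked example (added note, illustrative values): if a record carries
// rob_dep = {21} and reg_dep = {21, 24}, the loop above drops the duplicate
// register dependency on 21, leaving robDep = {21} and regDep = {24}. This
// avoids counting the same parent twice, which would otherwise leave the
// node waiting on a dependency that removeDepOnInst() can only clear once.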
bool
TraceCPU::ElasticDataGen::GraphNode::removeRegDep(NodeSeqNum reg_dep)
{
    for (auto& own_reg_dep : regDep) {
        if (own_reg_dep == reg_dep) {
            // If the register dependency is found, make it zero and return
            // true
            own_reg_dep = 0;
            --numRegDep;
            assert(numRegDep >= 0);
            DPRINTFR(TraceCPUData, "\tFor %lli: Marking register dependency "
                     "%lli done.\n", seqNum, reg_dep);
            return true;
        }
    }

    // Return false if the dependency is not found
    return false;
}

bool
TraceCPU::ElasticDataGen::GraphNode::removeRobDep(NodeSeqNum rob_dep)
{
    for (auto& own_rob_dep : robDep) {
        if (own_rob_dep == rob_dep) {
            // If the ROB dependency is found, make it zero and return true
            own_rob_dep = 0;
            --numRobDep;
            assert(numRobDep >= 0);
            DPRINTFR(TraceCPUData, "\tFor %lli: Marking ROB dependency %lli "
                     "done.\n", seqNum, rob_dep);
            return true;
        }
    }
    return false;
}

void
TraceCPU::ElasticDataGen::GraphNode::clearRegDep()
{
    for (auto& own_reg_dep : regDep) {
        own_reg_dep = 0;
    }
    numRegDep = 0;
}

void
TraceCPU::ElasticDataGen::GraphNode::clearRobDep()
{
    for (auto& own_rob_dep : robDep) {
        own_rob_dep = 0;
    }
    numRobDep = 0;
}

bool
TraceCPU::ElasticDataGen::GraphNode::removeDepOnInst(NodeSeqNum done_seq_num)
{
    // If it is a ROB dependency then remove it
    if (!removeRobDep(done_seq_num)) {
        // If it is not a ROB dependency then it must be a register
        // dependency. If the register dependency is not found, it violates
        // an assumption and must be caught by the assert.
        bool regdep_found M5_VAR_USED = removeRegDep(done_seq_num);
        assert(regdep_found);
    }
    // Return true if the node is dependency free
    return (numRobDep == 0 && numRegDep == 0);
}

void
TraceCPU::ElasticDataGen::GraphNode::writeElementAsTrace() const
{
    DPRINTFR(TraceCPUData, "%lli", seqNum);
    DPRINTFR(TraceCPUData, ",%s", typeToStr());
    if (isLoad() || isStore()) {
        DPRINTFR(TraceCPUData, ",%i", addr);
        DPRINTFR(TraceCPUData, ",%i", size);
        DPRINTFR(TraceCPUData, ",%i", flags);
    }
    DPRINTFR(TraceCPUData, ",%lli", compDelay);
    int i = 0;
    DPRINTFR(TraceCPUData, "robDep:");
    while (robDep[i] != 0) {
        DPRINTFR(TraceCPUData, ",%lli", robDep[i]);
        i++;
    }
    i = 0;
    DPRINTFR(TraceCPUData, "regDep:");
    while (regDep[i] != 0) {
        DPRINTFR(TraceCPUData, ",%lli", regDep[i]);
        i++;
    }
    auto child_itr = dependents.begin();
    DPRINTFR(TraceCPUData, "dependents:");
    while (child_itr != dependents.end()) {
        DPRINTFR(TraceCPUData, ":%lli", (*child_itr)->seqNum);
        child_itr++;
    }

    DPRINTFR(TraceCPUData, "\n");
}

std::string
TraceCPU::ElasticDataGen::GraphNode::typeToStr() const
{
    return Record::RecordType_Name(type);
}

TraceCPU::FixedRetryGen::InputStream::InputStream(
    const std::string& filename)
    : trace(filename)
{
    // Create a protobuf message for the header and read it from the stream.
    ProtoMessage::PacketHeader header_msg;
    if (!trace.read(header_msg)) {
        panic("Failed to read packet header from %s\n", filename);
    }

    // As in the elastic input stream above, the tick frequency check was
    // previously unreachable inside the read-failure branch.
    if (header_msg.tick_freq() != SimClock::Frequency) {
        panic("Trace %s was recorded with a different tick frequency %d\n",
              filename, header_msg.tick_freq());
    }
}

void
TraceCPU::FixedRetryGen::InputStream::reset()
{
    trace.reset();
}

bool
TraceCPU::FixedRetryGen::InputStream::read(TraceElement* element)
{
    ProtoMessage::Packet pkt_msg;
    if (trace.read(pkt_msg)) {
        element->cmd = pkt_msg.cmd();
        element->addr = pkt_msg.addr();
        element->blocksize = pkt_msg.size();
        element->tick = pkt_msg.tick();
        element->flags = pkt_msg.has_flags() ? pkt_msg.flags() : 0;
        element->pc = pkt_msg.has_pc() ? pkt_msg.pc() : 0;
        return true;
    }

    // We have reached the end of the file
    return false;
}