GPUCoalescer.cc revision 11689:9d19bb965564
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Sooraj Puthoor
34 */
35
36#include "base/misc.hh"
37#include "base/str.hh"
38#include "config/the_isa.hh"
39
40#if THE_ISA == X86_ISA
41#include "arch/x86/insts/microldstop.hh"
42
43#endif // X86_ISA
44#include "mem/ruby/system/GPUCoalescer.hh"
45
46#include "cpu/testers/rubytest/RubyTester.hh"
47#include "debug/GPUCoalescer.hh"
48#include "debug/MemoryAccess.hh"
49#include "debug/ProtocolTrace.hh"
50#include "debug/RubyPort.hh"
51#include "debug/RubyStats.hh"
52#include "gpu-compute/shader.hh"
53#include "mem/packet.hh"
54#include "mem/ruby/common/DataBlock.hh"
55#include "mem/ruby/common/SubBlock.hh"
56#include "mem/ruby/network/MessageBuffer.hh"
57#include "mem/ruby/profiler/Profiler.hh"
58#include "mem/ruby/slicc_interface/AbstractController.hh"
59#include "mem/ruby/slicc_interface/RubyRequest.hh"
60#include "mem/ruby/structures/CacheMemory.hh"
61#include "mem/ruby/system/RubySystem.hh"
62#include "params/RubyGPUCoalescer.hh"
63
64using namespace std;
65
66GPUCoalescer *
67RubyGPUCoalescerParams::create()
68{
69    return new GPUCoalescer(this);
70}
71
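// Map the synchronization scope flags on a request to the HSA scope
// used by the Ruby protocol; unscoped requests map to
// HSAScope_UNSPECIFIED.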
72HSAScope
73reqScopeToHSAScope(Request* req)
74{
75    HSAScope accessScope = HSAScope_UNSPECIFIED;
76    if (req->isScoped()) {
77        if (req->isWavefrontScope()) {
78            accessScope = HSAScope_WAVEFRONT;
79        } else if (req->isWorkgroupScope()) {
80            accessScope = HSAScope_WORKGROUP;
81        } else if (req->isDeviceScope()) {
82            accessScope = HSAScope_DEVICE;
83        } else if (req->isSystemScope()) {
84            accessScope = HSAScope_SYSTEM;
85        } else {
86            fatal("Bad scope type");
87        }
88    }
89    return accessScope;
90}
91
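// Map the memory segment flags on a request to the corresponding HSA
// segment; a request with an unrecognized segment is a fatal error.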
92HSASegment
93reqSegmentToHSASegment(Request* req)
94{
95    HSASegment accessSegment = HSASegment_GLOBAL;
96
97    if (req->isGlobalSegment()) {
98        accessSegment = HSASegment_GLOBAL;
99    } else if (req->isGroupSegment()) {
100        accessSegment = HSASegment_GROUP;
101    } else if (req->isPrivateSegment()) {
102        accessSegment = HSASegment_PRIVATE;
103    } else if (req->isKernargSegment()) {
104        accessSegment = HSASegment_KERNARG;
105    } else if (req->isReadonlySegment()) {
106        accessSegment = HSASegment_READONLY;
107    } else if (req->isSpillSegment()) {
108        accessSegment = HSASegment_SPILL;
109    } else if (req->isArgSegment()) {
110        accessSegment = HSASegment_ARG;
111    } else {
112        fatal("Bad segment type");
113    }
114
115    return accessSegment;
116}
117
118GPUCoalescer::GPUCoalescer(const Params *p)
119    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
120{
121    m_store_waiting_on_load_cycles = 0;
122    m_store_waiting_on_store_cycles = 0;
123    m_load_waiting_on_store_cycles = 0;
124    m_load_waiting_on_load_cycles = 0;
125
126    m_outstanding_count = 0;
127
128    m_max_outstanding_requests = 0;
129    m_deadlock_threshold = 0;
130    m_instCache_ptr = nullptr;
131    m_dataCache_ptr = nullptr;
132
133    m_instCache_ptr = p->icache;
134    m_dataCache_ptr = p->dcache;
135    m_max_outstanding_requests = p->max_outstanding_requests;
136    m_deadlock_threshold = p->deadlock_threshold;
137
138    assert(m_max_outstanding_requests > 0);
139    assert(m_deadlock_threshold > 0);
140    assert(m_instCache_ptr);
141    assert(m_dataCache_ptr);
142
143    m_data_cache_hit_latency = p->dcache_hit_latency;
144
145    m_runningGarnetStandalone = p->garnet_standalone;
146    assumingRfOCoherence = p->assume_rfo;
147}
148
149GPUCoalescer::~GPUCoalescer()
150{
151}
152
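// Deadlock check: panic if any outstanding read or write request has
// been pending for at least m_deadlock_threshold cycles; otherwise,
// keep re-scheduling the check while requests remain outstanding.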
153void
154GPUCoalescer::wakeup()
155{
156    // Check for deadlock of any of the requests
157    Cycles current_time = curCycle();
158
159    // Check across all outstanding requests
160    int total_outstanding = 0;
161
162    RequestTable::iterator read = m_readRequestTable.begin();
163    RequestTable::iterator read_end = m_readRequestTable.end();
164    for (; read != read_end; ++read) {
165        GPUCoalescerRequest* request = read->second;
166        if (current_time - request->issue_time < m_deadlock_threshold)
167            continue;
168
169        panic("Possible Deadlock detected. Aborting!\n"
170             "version: %d request.paddr: 0x%x m_readRequestTable: %d "
171             "current time: %u issue_time: %d difference: %d\n", m_version,
172              request->pkt->getAddr(), m_readRequestTable.size(),
173              current_time * clockPeriod(), request->issue_time * clockPeriod(),
174              (current_time - request->issue_time)*clockPeriod());
175    }
176
177    RequestTable::iterator write = m_writeRequestTable.begin();
178    RequestTable::iterator write_end = m_writeRequestTable.end();
179    for (; write != write_end; ++write) {
180        GPUCoalescerRequest* request = write->second;
181        if (current_time - request->issue_time < m_deadlock_threshold)
182            continue;
183
184        panic("Possible Deadlock detected. Aborting!\n"
185             "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
186             "current time: %u issue_time: %d difference: %d\n", m_version,
187              request->pkt->getAddr(), m_writeRequestTable.size(),
188              current_time * clockPeriod(), request->issue_time * clockPeriod(),
189              (current_time - request->issue_time) * clockPeriod());
190    }
191
192    total_outstanding += m_writeRequestTable.size();
193    total_outstanding += m_readRequestTable.size();
194
195    assert(m_outstanding_count == total_outstanding);
196
197    if (m_outstanding_count > 0) {
198        // If there are still outstanding requests, keep checking
199        schedule(deadlockCheckEvent,
200                 m_deadlock_threshold * clockPeriod() +
201                 curTick());
202    }
203}
204
205void
206GPUCoalescer::resetStats()
207{
208    m_latencyHist.reset();
209    m_missLatencyHist.reset();
210    for (int i = 0; i < RubyRequestType_NUM; i++) {
211        m_typeLatencyHist[i]->reset();
212        m_missTypeLatencyHist[i]->reset();
213        for (int j = 0; j < MachineType_NUM; j++) {
214            m_missTypeMachLatencyHist[i][j]->reset();
215        }
216    }
217
218    for (int i = 0; i < MachineType_NUM; i++) {
219        m_missMachLatencyHist[i]->reset();
220
221        m_IssueToInitialDelayHist[i]->reset();
222        m_InitialToForwardDelayHist[i]->reset();
223        m_ForwardToFirstResponseDelayHist[i]->reset();
224        m_FirstResponseToCompletionDelayHist[i]->reset();
225    }
226}
227
228void
229GPUCoalescer::printProgress(ostream& out) const
230{
231}
232
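// Determine whether a request can be accepted right now. Returns
// BufferFull if the mandatory queue has no free slot, Aliased if the
// target line is blocked or already has a conflicting outstanding
// request, and Ready otherwise.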
233RequestStatus
234GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
235{
236    Addr line_addr = makeLineAddress(pkt->getAddr());
237
238    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
239        return RequestStatus_BufferFull;
240    }
241
242    if (m_controller->isBlocked(line_addr) &&
243       request_type != RubyRequestType_Locked_RMW_Write) {
244        return RequestStatus_Aliased;
245    }
246
247    if ((request_type == RubyRequestType_ST) ||
248        (request_type == RubyRequestType_ATOMIC) ||
249        (request_type == RubyRequestType_ATOMIC_RETURN) ||
250        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
251        (request_type == RubyRequestType_RMW_Read) ||
252        (request_type == RubyRequestType_RMW_Write) ||
253        (request_type == RubyRequestType_Load_Linked) ||
254        (request_type == RubyRequestType_Store_Conditional) ||
255        (request_type == RubyRequestType_Locked_RMW_Read) ||
256        (request_type == RubyRequestType_Locked_RMW_Write) ||
257        (request_type == RubyRequestType_FLUSH)) {
258
259        // Check if there is any outstanding read request for the same
260        // cache line.
261        if (m_readRequestTable.count(line_addr) > 0) {
262            m_store_waiting_on_load_cycles++;
263            return RequestStatus_Aliased;
264        }
265
266        if (m_writeRequestTable.count(line_addr) > 0) {
267            // There is an outstanding write request for the cache line
268            m_store_waiting_on_store_cycles++;
269            return RequestStatus_Aliased;
270        }
271    } else {
272        // Check if there is any outstanding write request for the same
273        // cache line.
274        if (m_writeRequestTable.count(line_addr) > 0) {
275            m_load_waiting_on_store_cycles++;
276            return RequestStatus_Aliased;
277        }
278
279        if (m_readRequestTable.count(line_addr) > 0) {
280            // There is an outstanding read request for the cache line
281            m_load_waiting_on_load_cycles++;
282            return RequestStatus_Aliased;
283        }
284    }
285
286    return RequestStatus_Ready;
287
288}
289
290
291
292// Record a kernel-end packet for the given wavefront in kernelEndList.
293void
294GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
295{
296    // A duplicate entry for this wavefront is not expected, but
297    // assert on it here rather than risk a hard-to-debug simulator
298    // hang later.
299    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
300    assert(kernelEndList.count(wavefront_id) == 0);
301
302    kernelEndList[wavefront_id] = pkt;
303    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
304            kernelEndList.size());
305}
306
307
308// Insert the request on the correct request table.  Return true if
309// the entry was already present.
310bool
311GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
312{
313    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
314           pkt->req->isLockedRMW() ||
315           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
316
317    int total_outstanding M5_VAR_USED =
318        m_writeRequestTable.size() + m_readRequestTable.size();
319
320    assert(m_outstanding_count == total_outstanding);
321
322    // See if we should schedule a deadlock check
323    if (!deadlockCheckEvent.scheduled()) {
324        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() + curTick());
325    }
326
327    Addr line_addr = makeLineAddress(pkt->getAddr());
328    if ((request_type == RubyRequestType_ST) ||
329        (request_type == RubyRequestType_ATOMIC) ||
330        (request_type == RubyRequestType_ATOMIC_RETURN) ||
331        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
332        (request_type == RubyRequestType_RMW_Read) ||
333        (request_type == RubyRequestType_RMW_Write) ||
334        (request_type == RubyRequestType_Load_Linked) ||
335        (request_type == RubyRequestType_Store_Conditional) ||
336        (request_type == RubyRequestType_Locked_RMW_Read) ||
337        (request_type == RubyRequestType_Locked_RMW_Write) ||
338        (request_type == RubyRequestType_FLUSH)) {
339
340        pair<RequestTable::iterator, bool> r =
341          m_writeRequestTable.insert(RequestTable::value_type(line_addr,
342                                       (GPUCoalescerRequest*) NULL));
343        if (r.second) {
344            RequestTable::iterator i = r.first;
345            i->second = new GPUCoalescerRequest(pkt, request_type,
346                                                curCycle());
347            DPRINTF(GPUCoalescer,
348                    "Inserting write request for paddr %#x for type %d\n",
349                    pkt->req->getPaddr(), i->second->m_type);
350            m_outstanding_count++;
351        } else {
352            return true;
353        }
354    } else {
355        pair<RequestTable::iterator, bool> r =
356            m_readRequestTable.insert(RequestTable::value_type(line_addr,
357                                        (GPUCoalescerRequest*) NULL));
358
359        if (r.second) {
360            RequestTable::iterator i = r.first;
361            i->second = new GPUCoalescerRequest(pkt, request_type,
362                                             curCycle());
363            DPRINTF(GPUCoalescer,
364                    "Inserting read request for paddr %#x for type %d\n",
365                    pkt->req->getPaddr(), i->second->m_type);
366            m_outstanding_count++;
367        } else {
368            return true;
369        }
370    }
371
372    m_outstandReqHist.sample(m_outstanding_count);
373
374    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
375    assert(m_outstanding_count == total_outstanding);
376
377    return false;
378}
379
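// Account for an entry that has just been erased from one of the
// request tables.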
380void
381GPUCoalescer::markRemoved()
382{
383    m_outstanding_count--;
384    assert(m_outstanding_count ==
385           m_writeRequestTable.size() + m_readRequestTable.size());
386}
387
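// Erase a completed request from the read or write request table,
// depending on its type, and update the outstanding count.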
388void
389GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
390{
391    assert(m_outstanding_count ==
392           m_writeRequestTable.size() + m_readRequestTable.size());
393
394    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
395    if ((srequest->m_type == RubyRequestType_ST) ||
396        (srequest->m_type == RubyRequestType_RMW_Read) ||
397        (srequest->m_type == RubyRequestType_RMW_Write) ||
398        (srequest->m_type == RubyRequestType_Load_Linked) ||
399        (srequest->m_type == RubyRequestType_Store_Conditional) ||
400        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
401        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
402        m_writeRequestTable.erase(line_addr);
403    } else {
404        m_readRequestTable.erase(line_addr);
405    }
406
407    markRemoved();
408}
409
410bool
411GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
412{
413    //
414    // The success flag indicates whether the LLSC operation was successful.
415    // LL ops will always succeed, but SC may fail if the cache line is no
416    // longer locked.
417    //
418    bool success = true;
419    if (request->m_type == RubyRequestType_Store_Conditional) {
420        if (!m_dataCache_ptr->isLocked(address, m_version)) {
421            //
422            // For failed SC requests, indicate the failure to the cpu by
423            // setting the extra data to zero.
424            //
425            request->pkt->req->setExtraData(0);
426            success = false;
427        } else {
428            //
429            // For successful SC requests, indicate the success to the cpu by
430            // setting the extra data to one.
431            //
432            request->pkt->req->setExtraData(1);
433        }
434        //
435        // Independent of success, all SC operations must clear the lock
436        //
437        m_dataCache_ptr->clearLocked(address);
438    } else if (request->m_type == RubyRequestType_Load_Linked) {
439        //
440        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
441        // previously locked cache lines?
442        //
443        m_dataCache_ptr->setLocked(address, m_version);
444    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
445               (m_dataCache_ptr->isLocked(address, m_version))) {
446        //
447        // Normal writes should clear the locked address
448        //
449        m_dataCache_ptr->clearLocked(address);
450    }
451    return success;
452}
453
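// Write completion callbacks invoked by the cache controller. The
// shorter overloads fill in default machine type and timing values and
// funnel into the full version below.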
454void
455GPUCoalescer::writeCallback(Addr address, DataBlock& data)
456{
457    writeCallback(address, MachineType_NULL, data);
458}
459
460void
461GPUCoalescer::writeCallback(Addr address,
462                         MachineType mach,
463                         DataBlock& data)
464{
465    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
466}
467
468void
469GPUCoalescer::writeCallback(Addr address,
470                         MachineType mach,
471                         DataBlock& data,
472                         Cycles initialRequestTime,
473                         Cycles forwardRequestTime,
474                         Cycles firstResponseTime)
475{
476    writeCallback(address, mach, data,
477                  initialRequestTime, forwardRequestTime, firstResponseTime,
478                  false);
479}
480
481void
482GPUCoalescer::writeCallback(Addr address,
483                         MachineType mach,
484                         DataBlock& data,
485                         Cycles initialRequestTime,
486                         Cycles forwardRequestTime,
487                         Cycles firstResponseTime,
488                         bool isRegion)
489{
490    assert(address == makeLineAddress(address));
491
492    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
493    assert(m_writeRequestTable.count(makeLineAddress(address)));
494
495    RequestTable::iterator i = m_writeRequestTable.find(address);
496    assert(i != m_writeRequestTable.end());
497    GPUCoalescerRequest* request = i->second;
498
499    m_writeRequestTable.erase(i);
500    markRemoved();
501
502    assert((request->m_type == RubyRequestType_ST) ||
503           (request->m_type == RubyRequestType_ATOMIC) ||
504           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
505           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
506           (request->m_type == RubyRequestType_RMW_Read) ||
507           (request->m_type == RubyRequestType_RMW_Write) ||
508           (request->m_type == RubyRequestType_Load_Linked) ||
509           (request->m_type == RubyRequestType_Store_Conditional) ||
510           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
511           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
512           (request->m_type == RubyRequestType_FLUSH));
513
514
515    //
516    // For Alpha, properly handle LL, SC, and write requests with respect to
517    // locked cache blocks.
518    //
519    // Not valid for the Garnet_standalone protocol.
520    //
521    bool success = true;
522    if (!m_runningGarnetStandalone)
523        success = handleLlsc(address, request);
524
525    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
526        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
527    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
528        m_controller->unblock(address);
529    }
530
531    hitCallback(request, mach, data, success,
532                request->issue_time, forwardRequestTime, firstResponseTime,
533                isRegion);
534}
535
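// Read completion callbacks, mirroring the writeCallback overloads
// above.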
536void
537GPUCoalescer::readCallback(Addr address, DataBlock& data)
538{
539    readCallback(address, MachineType_NULL, data);
540}
541
542void
543GPUCoalescer::readCallback(Addr address,
544                        MachineType mach,
545                        DataBlock& data)
546{
547    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
548}
549
550void
551GPUCoalescer::readCallback(Addr address,
552                        MachineType mach,
553                        DataBlock& data,
554                        Cycles initialRequestTime,
555                        Cycles forwardRequestTime,
556                        Cycles firstResponseTime)
557{
558
559    readCallback(address, mach, data,
560                 initialRequestTime, forwardRequestTime, firstResponseTime,
561                 false);
562}
563
564void
565GPUCoalescer::readCallback(Addr address,
566                        MachineType mach,
567                        DataBlock& data,
568                        Cycles initialRequestTime,
569                        Cycles forwardRequestTime,
570                        Cycles firstResponseTime,
571                        bool isRegion)
572{
573    assert(address == makeLineAddress(address));
574    assert(m_readRequestTable.count(makeLineAddress(address)));
575
576    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
577    RequestTable::iterator i = m_readRequestTable.find(address);
578    assert(i != m_readRequestTable.end());
579    GPUCoalescerRequest* request = i->second;
580
581    m_readRequestTable.erase(i);
582    markRemoved();
583
584    assert((request->m_type == RubyRequestType_LD) ||
585           (request->m_type == RubyRequestType_IFETCH));
586
587    hitCallback(request, mach, data, true,
588                request->issue_time, forwardRequestTime, firstResponseTime,
589                isRegion);
590}
591
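// Common completion path for coalesced reads and writes: touch the
// cache MRU state, record latency statistics, copy data to or from
// every packet coalesced onto this line, and return the packets to
// their ports.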
592void
593GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
594                       MachineType mach,
595                       DataBlock& data,
596                       bool success,
597                       Cycles initialRequestTime,
598                       Cycles forwardRequestTime,
599                       Cycles firstResponseTime,
600                       bool isRegion)
601{
602    PacketPtr pkt = srequest->pkt;
603    Addr request_address = pkt->getAddr();
604    Addr request_line_address = makeLineAddress(request_address);
605
606    RubyRequestType type = srequest->m_type;
607
608    // Set this cache entry to the most recently used
609    if (type == RubyRequestType_IFETCH) {
610        if (m_instCache_ptr->isTagPresent(request_line_address))
611            m_instCache_ptr->setMRU(request_line_address);
612    } else {
613        if (m_dataCache_ptr->isTagPresent(request_line_address))
614            m_dataCache_ptr->setMRU(request_line_address);
615    }
616
617    recordMissLatency(srequest, mach,
618                      initialRequestTime,
619                      forwardRequestTime,
620                      firstResponseTime,
621                      success, isRegion);
622    // Update the data.
623    //
624    // This must be done for each request in the coalescer entry.
625    int len = reqCoalescer[request_line_address].size();
626    std::vector<PacketPtr> mylist;
627    for (int i = 0; i < len; ++i) {
628        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
629        assert(type == reqCoalescer[request_line_address][i].primaryType);
630        request_address = pkt->getAddr();
631        request_line_address = makeLineAddress(pkt->getAddr());
632        if (pkt->getPtr<uint8_t>()) {
633            if ((type == RubyRequestType_LD) ||
634                (type == RubyRequestType_ATOMIC) ||
635                (type == RubyRequestType_ATOMIC_RETURN) ||
636                (type == RubyRequestType_IFETCH) ||
637                (type == RubyRequestType_RMW_Read) ||
638                (type == RubyRequestType_Locked_RMW_Read) ||
639                (type == RubyRequestType_Load_Linked)) {
640                memcpy(pkt->getPtr<uint8_t>(),
641                       data.getData(getOffset(request_address),
642                                    pkt->getSize()),
643                       pkt->getSize());
644            } else {
645                data.setData(pkt->getPtr<uint8_t>(),
646                             getOffset(request_address), pkt->getSize());
647            }
648        } else {
649            DPRINTF(MemoryAccess,
650                    "WARNING.  Data not transferred from Ruby to M5 for type " \
651                    "%s\n",
652                    RubyRequestType_to_string(type));
653        }
654
655        // If using the RubyTester, update the RubyTester sender state's
656        // subBlock with the received data.  The tester will later access
657        // this state.
658        // Note: RubyPort will access its sender state before the
659        // RubyTester.
660        if (m_usingRubyTester) {
661            RubyPort::SenderState *requestSenderState =
662                safe_cast<RubyPort::SenderState*>(pkt->senderState);
663            RubyTester::SenderState* testerSenderState =
664                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
665            testerSenderState->subBlock.mergeFrom(data);
666        }
667
668        mylist.push_back(pkt);
669    }
670    delete srequest;
671    reqCoalescer.erase(request_line_address);
672    assert(!reqCoalescer.count(request_line_address));
673
674
675
676    completeHitCallback(mylist, len);
677}
678
679bool
680GPUCoalescer::empty() const
681{
682    return m_writeRequestTable.empty() && m_readRequestTable.empty();
683}
684
685// Analyzes the packet to see if this request can be coalesced.
686// If the request can be coalesced, it is added to the reqCoalescer table
687// and makeRequest returns RequestStatus_Issued.
688// If this is the first request to a cacheline, the request is added to
689// both the newRequests queue and the reqCoalescer table; makeRequest
690// returns RequestStatus_Issued.
691// If there is a pending request to this cacheline and this request
692// can't be coalesced, RequestStatus_Aliased is returned and
693// the packet needs to be reissued.
694RequestStatus
695GPUCoalescer::makeRequest(PacketPtr pkt)
696{
697    // Check for GPU Barrier Kernel End or Kernel Begin
698    // Leave these to be handled by the child class
699    // Kernel End/Barrier = isFlush + isRelease
700    // Kernel Begin = isFlush + isAcquire
701    if (pkt->req->isKernel()) {
702        if (pkt->req->isAcquire()) {
703            // This is a Kernel Begin; leave handling to the
704            // virtual xCoalescer::makeRequest.
705            return RequestStatus_Issued;
706        } else if (pkt->req->isRelease()) {
707            // This is a Kernel End; leave handling to the
708            // virtual xCoalescer::makeRequest.
709            // If we are here, we did not call a virtual
710            // version of this function, so we will also
711            // schedule the callback here.
712            int wf_id = 0;
713            if (pkt->req->hasContextId()) {
714                wf_id = pkt->req->contextId();
715            }
716            insertKernel(wf_id, pkt);
717            newKernelEnds.push_back(wf_id);
718            if (!issueEvent.scheduled()) {
719                schedule(issueEvent, curTick());
720            }
721            return RequestStatus_Issued;
722        }
723    }
724
725    // If the number of outstanding requests is greater than the max
726    // allowed, return RequestStatus_BufferFull. This logic can be
727    // extended to support proper backpressure.
728    if (m_outstanding_count >= m_max_outstanding_requests) {
729        return RequestStatus_BufferFull;
730    }
731
732    RubyRequestType primary_type = RubyRequestType_NULL;
733    RubyRequestType secondary_type = RubyRequestType_NULL;
734
735    if (pkt->isLLSC()) {
736        //
737        // Alpha LL/SC instructions need to be handled carefully by the cache
738        // coherence protocol to ensure they follow the proper semantics. In
739        // particular, by identifying the operations as atomic, the protocol
740        // should understand that migratory sharing optimizations should not
741        // be performed (i.e. a load between the LL and SC should not steal
742        // away exclusive permission).
743        //
744        if (pkt->isWrite()) {
745            primary_type = RubyRequestType_Store_Conditional;
746        } else {
747            assert(pkt->isRead());
748            primary_type = RubyRequestType_Load_Linked;
749        }
750        secondary_type = RubyRequestType_ATOMIC;
751    } else if (pkt->req->isLockedRMW()) {
752        //
753        // x86 locked instructions are translated to store cache coherence
754        // requests because these requests should always be treated as read
755        // exclusive operations and should leverage any migratory sharing
756        // optimization built into the protocol.
757        //
758        if (pkt->isWrite()) {
759            primary_type = RubyRequestType_Locked_RMW_Write;
760        } else {
761            assert(pkt->isRead());
762            primary_type = RubyRequestType_Locked_RMW_Read;
763        }
764        secondary_type = RubyRequestType_ST;
765    } else if (pkt->isAtomicOp()) {
766        //
767        // GPU Atomic Operation
768        //
769        primary_type = RubyRequestType_ATOMIC;
770        secondary_type = RubyRequestType_ATOMIC;
771    } else {
772        if (pkt->isRead()) {
773            if (pkt->req->isInstFetch()) {
774                primary_type = secondary_type = RubyRequestType_IFETCH;
775            } else {
776#if THE_ISA == X86_ISA
777                uint32_t flags = pkt->req->getFlags();
778                bool storeCheck = flags &
779                        (TheISA::StoreCheck << TheISA::FlagShift);
780#else
781                bool storeCheck = false;
782#endif // X86_ISA
783                if (storeCheck) {
784                    primary_type = RubyRequestType_RMW_Read;
785                    secondary_type = RubyRequestType_ST;
786                } else {
787                    primary_type = secondary_type = RubyRequestType_LD;
788                }
789            }
790        } else if (pkt->isWrite()) {
791            //
792            // Note: M5 packets do not differentiate ST from RMW_Write
793            //
794            primary_type = secondary_type = RubyRequestType_ST;
795        } else if (pkt->isFlush()) {
796            primary_type = secondary_type = RubyRequestType_FLUSH;
797        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
798            if (assumingRfOCoherence) {
799                // If we reached here, this request must be a memFence,
800                // and since the protocol implements RfO, the coalescer
801                // can assume sequential consistency and schedule the
802                // callback immediately.
803                // Currently the code implements fence callbacks
804                // by reusing the mechanism for kernel completions.
805                // This should be fixed.
806                int wf_id = 0;
807                if (pkt->req->hasContextId()) {
808                    wf_id = pkt->req->contextId();
809                }
810                insertKernel(wf_id, pkt);
811                newKernelEnds.push_back(wf_id);
812                if (!issueEvent.scheduled()) {
813                    schedule(issueEvent, curTick());
814                }
815                return RequestStatus_Issued;
816            } else {
817                // If not RfO, return issued here and let the child coalescer
818                // take care of it.
819                return RequestStatus_Issued;
820            }
821        } else {
822            panic("Unsupported ruby packet type\n");
823        }
824    }
825
826    // Check if there is any pending request to this cache line from
827    // previous cycles.
828    // If there is a pending request, return aliased. Since coalescing
829    // across time is not permitted, aliased requests are not coalesced.
830    // If a request for this address has already been issued, we must block
831    RequestStatus status = getRequestStatus(pkt, primary_type);
832    if (status != RequestStatus_Ready)
833        return status;
834
835    Addr line_addr = makeLineAddress(pkt->getAddr());
836
837    // Check if this request can be coalesced with previous
838    // requests from this cycle.
839    if (!reqCoalescer.count(line_addr)) {
840        // This is the first access to this cache line.
841        // A new request to the memory subsystem has to be
842        // made in the next cycle for this cache line, so
843        // add this line addr to the "newRequests" queue
844        newRequests.push_back(line_addr);
845
846    // There was a request to this cache line in this cycle,
847    // let us see if we can coalesce this request with the previous
848    // requests from this cycle
849    } else if (primary_type !=
850               reqCoalescer[line_addr][0].primaryType) {
851        // can't coalesce loads, stores and atomics!
852        return RequestStatus_Aliased;
853    } else if (pkt->req->isLockedRMW() ||
854               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
855        // can't coalesce locked accesses, but can coalesce atomics!
856        return RequestStatus_Aliased;
857    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
858               pkt->req->contextId() !=
859               reqCoalescer[line_addr][0].pkt->req->contextId()) {
860        // can't coalesce releases from different wavefronts
861        return RequestStatus_Aliased;
862    }
863
864    // in addition to the packet, we need to save both request types
865    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
866    if (!issueEvent.scheduled())
867        schedule(issueEvent, curTick());
868    // TODO: issue hardware prefetches here
869    return RequestStatus_Issued;
870}
871
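// Build a single RubyRequest for a coalesced cache line: collect the
// byte access mask, write data, and atomic operations of every packet
// coalesced onto the line, then enqueue the request on the mandatory
// queue with the data cache hit latency.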
872void
873GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
874{
875
876    int proc_id = -1;
877    if (pkt != NULL && pkt->req->hasContextId()) {
878        proc_id = pkt->req->contextId();
879    }
880
881    // If valid, copy the pc to the ruby request
882    Addr pc = 0;
883    if (pkt->req->hasPC()) {
884        pc = pkt->req->getPC();
885    }
886
887    // At the moment, setting scopes only counts for GPU
888    // spill space accesses, i.e., requests for which
889    // pkt->req->isStack() is true. That scope is
890    // REPLACE, since spill space does not need to be
891    // flushed at the end of a kernel; private and
892    // local segments may need to be visible at the
893    // end of the kernel.
894    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
895    HSAScope accessScope = reqScopeToHSAScope(pkt->req);
896
897    Addr line_addr = makeLineAddress(pkt->getAddr());
898
899    // Creating WriteMask that records written bytes
900    // and atomic operations. This enables partial writes
901    // and partial reads of those writes
902    DataBlock dataBlock;
903    dataBlock.clear();
904    uint32_t blockSize = RubySystem::getBlockSizeBytes();
905    std::vector<bool> accessMask(blockSize,false);
906    std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
907    uint32_t tableSize = reqCoalescer[line_addr].size();
908    for (int i = 0; i < tableSize; i++) {
909        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
910        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
911        uint32_t tmpSize = tmpPkt->getSize();
912        if (tmpPkt->isAtomicOp()) {
913            std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
914                                                        tmpPkt->getAtomicOp());
915            atomicOps.push_back(tmpAtomicOp);
916        } else if (tmpPkt->isWrite()) {
917            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
918                              tmpOffset, tmpSize);
919        }
920        for (int j = 0; j < tmpSize; j++) {
921            accessMask[tmpOffset + j] = true;
922        }
923    }
924    std::shared_ptr<RubyRequest> msg;
925    if (pkt->isAtomicOp()) {
926        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
927                              pkt->getPtr<uint8_t>(),
928                              pkt->getSize(), pc, secondary_type,
929                              RubyAccessMode_Supervisor, pkt,
930                              PrefetchBit_No, proc_id, 100,
931                              blockSize, accessMask,
932                              dataBlock, atomicOps,
933                              accessScope, accessSegment);
934    } else {
935        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
936                              pkt->getPtr<uint8_t>(),
937                              pkt->getSize(), pc, secondary_type,
938                              RubyAccessMode_Supervisor, pkt,
939                              PrefetchBit_No, proc_id, 100,
940                              blockSize, accessMask,
941                              dataBlock,
942                              accessScope, accessSegment);
943    }
944    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
945             curTick(), m_version, "Coal", "Begin", "", "",
946             printAddress(msg->getPhysicalAddress()),
947             RubyRequestType_to_string(secondary_type));
948
949    fatal_if(secondary_type == RubyRequestType_IFETCH,
950             "there should not be any I-Fetch requests in the GPU Coalescer");
951
952    // Send the message to the cache controller
953    fatal_if(m_data_cache_hit_latency == 0,
954             "should not have a latency of zero");
955
956    assert(m_mandatory_q_ptr);
957    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
958}
959
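// Debug helper used by GPUCoalescer::print to dump the request tables.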
960template <class KEY, class VALUE>
961std::ostream &
962operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
963{
964    out << "[";
965    for (auto i = map.begin(); i != map.end(); ++i)
966        out << " " << i->first << "=" << i->second;
967    out << " ]";
968
969    return out;
970}
971
972void
973GPUCoalescer::print(ostream& out) const
974{
975    out << "[GPUCoalescer: " << m_version
976        << ", outstanding requests: " << m_outstanding_count
977        << ", read request table: " << m_readRequestTable
978        << ", write request table: " << m_writeRequestTable
979        << "]";
980}
981
982// This can be called from setState whenever coherence permissions are
983// upgraded; when invoked, coherence violations will be checked for the
984// given block.
985void
986GPUCoalescer::checkCoherence(Addr addr)
987{
988#ifdef CHECK_COHERENCE
989    m_ruby_system->checkGlobalCoherenceInvariant(addr);
990#endif
991}
992
993void
994GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
995    DPRINTF(RubyStats, "Recorded statistic: %s\n",
996            SequencerRequestType_to_string(requestType));
997}
998
999GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
1000    : Event(Progress_Event_Pri), seq(_seq)
1001{
1002}
1003
1004
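// Triggered by issueEvent: issue one Ruby request per newly accessed
// cache line recorded in newRequests, then fire any pending kernel-end
// callbacks.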
1005void
1006GPUCoalescer::completeIssue()
1007{
1008    // newRequests has the cacheline addresses of all the
1009    // requests which need to be issued to the memory subsystem
1010    // in this cycle
1011    int len = newRequests.size();
1012    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1013    for (int i = 0; i < len; ++i) {
1014        // Get the requests from the reqCoalescer table. Get only the
1015        // first request for each cacheline; the remaining requests
1016        // can be coalesced with the first request. So, only
1017        // one request is issued per cacheline.
1018        RequestDesc info = reqCoalescer[newRequests[i]][0];
1019        PacketPtr pkt = info.pkt;
1020        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1021                i, pkt->req->getPaddr());
1022        // Insert this request to the read/writeRequestTables. These tables
1023        // are used to track aliased requests in makeRequest subroutine
1024        bool found = insertRequest(pkt, info.primaryType);
1025
1026        if (found) {
1027            panic("GPUCoalescer::makeRequest should never be called if the "
1028                  "request is already outstanding\n");
1029        }
1030
1031        // Issue request to ruby subsystem
1032        issueRequest(pkt, info.secondaryType);
1033    }
1034    newRequests.clear();
1035
1036    // Have any kernel-end releases been issued this cycle?
1037    len = newKernelEnds.size();
1038    for (int i = 0; i < len; i++) {
1039        kernelCallback(newKernelEnds[i]);
1040    }
1041    newKernelEnds.clear();
1042}
1043
1044void
1045GPUCoalescer::IssueEvent::process()
1046{
1047    seq->completeIssue();
1048}
1049
1050const char *
1051GPUCoalescer::IssueEvent::description() const
1052{
1053    return "Issue coalesced request";
1054}
1055
1056void
1057GPUCoalescer::evictionCallback(Addr address)
1058{
1059    ruby_eviction_callback(address);
1060}
1061
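// Complete the packet recorded for this wavefront's kernel end (or
// fence) and remove it from kernelEndList.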
1062void
1063GPUCoalescer::kernelCallback(int wavefront_id)
1064{
1065    assert(kernelEndList.count(wavefront_id));
1066
1067    ruby_hit_callback(kernelEndList[wavefront_id]);
1068
1069    kernelEndList.erase(wavefront_id);
1070}
1071
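// Completion callback for GPU atomics. Similar to hitCallback, except
// that data is only copied back for atomics that return a value and no
// cache MRU update is performed.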
1072void
1073GPUCoalescer::atomicCallback(Addr address,
1074                             MachineType mach,
1075                             const DataBlock& data)
1076{
1077    assert(address == makeLineAddress(address));
1078
1079    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1080    assert(m_writeRequestTable.count(makeLineAddress(address)));
1081
1082    RequestTable::iterator i = m_writeRequestTable.find(address);
1083    assert(i != m_writeRequestTable.end());
1084    GPUCoalescerRequest* srequest = i->second;
1085
1086    m_writeRequestTable.erase(i);
1087    markRemoved();
1088
1089    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1090           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1091           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1092
1093
1094    // Atomics don't write to cache, so there is no MRU update...
1095
1096    recordMissLatency(srequest, mach,
1097                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
1098
1099    PacketPtr pkt = srequest->pkt;
1100    Addr request_address = pkt->getAddr();
1101    Addr request_line_address = makeLineAddress(pkt->getAddr());
1102
1103    int len = reqCoalescer[request_line_address].size();
1104    std::vector<PacketPtr> mylist;
1105    for (int i = 0; i < len; ++i) {
1106        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1107        assert(srequest->m_type ==
1108               reqCoalescer[request_line_address][i].primaryType);
1109        request_address = (pkt->getAddr());
1110        request_line_address = makeLineAddress(request_address);
1111        if (pkt->getPtr<uint8_t>() &&
1112            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1113            /* atomics are done in memory, and return the data *before* the atomic op... */
1114            memcpy(pkt->getPtr<uint8_t>(),
1115                   data.getData(getOffset(request_address),
1116                                pkt->getSize()),
1117                   pkt->getSize());
1118        } else {
1119            DPRINTF(MemoryAccess,
1120                    "WARNING.  Data not transferred from Ruby to M5 for type " \
1121                    "%s\n",
1122                    RubyRequestType_to_string(srequest->m_type));
1123        }
1124
1125        // If using the RubyTester, update the RubyTester sender state's
1126        // subBlock with the received data.  The tester will later access
1127        // this state.
1128        // Note: RubyPort will access its sender state before the
1129        // RubyTester.
1130        if (m_usingRubyTester) {
1131            RubyPort::SenderState *requestSenderState =
1132                safe_cast<RubyPort::SenderState*>(pkt->senderState);
1133            RubyTester::SenderState* testerSenderState =
1134                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1135            testerSenderState->subBlock.mergeFrom(data);
1136        }
1137
1138        mylist.push_back(pkt);
1139    }
1140    delete srequest;
1141    reqCoalescer.erase(request_line_address);
1142    assert(!reqCoalescer.count(request_line_address));
1143
1144    completeHitCallback(mylist, len);
1145}
1146
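// Classify CP responses by where they were serviced (local TCP,
// another TCP, the TCC, or a miss); the read and write variants update
// the corresponding CP load/store statistics.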
1147void
1148GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1149{
1150    if (myMachID == senderMachID) {
1151        CP_TCPLdHits++;
1152    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1153        CP_TCPLdTransfers++;
1154    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1155        CP_TCCLdHits++;
1156    } else {
1157        CP_LdMiss++;
1158    }
1159}
1160
1161void
1162GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1163{
1164    if (myMachID == senderMachID) {
1165        CP_TCPStHits++;
1166    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1167        CP_TCPStTransfers++;
1168    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1169        CP_TCCStHits++;
1170    } else {
1171        CP_StMiss++;
1172    }
1173}
1174
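// Return each completed packet to the port it arrived on, restoring
// the sender state saved by RubyPort, and retry any stalled requests.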
1175void
1176GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1177{
1178    for (int i = 0; i < len; ++i) {
1179        RubyPort::SenderState *ss =
1180            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1181        MemSlavePort *port = ss->port;
1182        assert(port != NULL);
1183
1184        mylist[i]->senderState = ss->predecessor;
1185        delete ss;
1186        port->hitCallback(mylist[i]);
1187        trySendRetries();
1188    }
1189
1190    testDrainComplete();
1191}
1192
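// Look up the packet for an outstanding read request to the given line
// address; the entry must exist.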
1193PacketPtr
1194GPUCoalescer::mapAddrToPkt(Addr address)
1195{
1196    RequestTable::iterator i = m_readRequestTable.find(address);
1197    assert(i != m_readRequestTable.end());
1198    GPUCoalescerRequest* request = i->second;
1199    return request->pkt;
1200}
1201
1202void
1203GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1204                                MachineType mach,
1205                                Cycles initialRequestTime,
1206                                Cycles forwardRequestTime,
1207                                Cycles firstResponseTime,
1208                                bool success, bool isRegion)
1209{
1210    RubyRequestType type = srequest->m_type;
1211    Cycles issued_time = srequest->issue_time;
1212    Cycles completion_time = curCycle();
1213    assert(completion_time >= issued_time);
1214    Cycles total_lat = completion_time - issued_time;
1215
1216    // cache stats (valid for RfO protocol only)
1217    if (mach == MachineType_TCP) {
1218        if (type == RubyRequestType_LD) {
1219            GPU_TCPLdHits++;
1220        } else {
1221            GPU_TCPStHits++;
1222        }
1223    } else if (mach == MachineType_L1Cache_wCC) {
1224        if (type == RubyRequestType_LD) {
1225            GPU_TCPLdTransfers++;
1226        } else {
1227            GPU_TCPStTransfers++;
1228        }
1229    } else if (mach == MachineType_TCC) {
1230        if (type == RubyRequestType_LD) {
1231            GPU_TCCLdHits++;
1232        } else {
1233            GPU_TCCStHits++;
1234        }
1235    } else  {
1236        if (type == RubyRequestType_LD) {
1237            GPU_LdMiss++;
1238        } else {
1239            GPU_StMiss++;
1240        }
1241    }
1242
1243    // Profile all access latency, even zero latency accesses
1244    m_latencyHist.sample(total_lat);
1245    m_typeLatencyHist[type]->sample(total_lat);
1246
1247    // Profile the miss latency for all non-zero demand misses
1248    if (total_lat != Cycles(0)) {
1249        m_missLatencyHist.sample(total_lat);
1250        m_missTypeLatencyHist[type]->sample(total_lat);
1251
1252        if (mach != MachineType_NUM) {
1253            m_missMachLatencyHist[mach]->sample(total_lat);
1254            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1255
1256            if ((issued_time <= initialRequestTime) &&
1257                (initialRequestTime <= forwardRequestTime) &&
1258                (forwardRequestTime <= firstResponseTime) &&
1259                (firstResponseTime <= completion_time)) {
1260
1261                m_IssueToInitialDelayHist[mach]->sample(
1262                    initialRequestTime - issued_time);
1263                m_InitialToForwardDelayHist[mach]->sample(
1264                    forwardRequestTime - initialRequestTime);
1265                m_ForwardToFirstResponseDelayHist[mach]->sample(
1266                    firstResponseTime - forwardRequestTime);
1267                m_FirstResponseToCompletionDelayHist[mach]->sample(
1268                    completion_time - firstResponseTime);
1269            }
1270        }
1271
1272    }
1273
1274    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1275             curTick(), m_version, "Coal",
1276             success ? "Done" : "SC_Failed", "", "",
1277             printAddress(srequest->pkt->getAddr()), total_lat);
1278}
1279
1280void
1281GPUCoalescer::regStats()
1282{
1283    RubyPort::regStats();
1284
1285    // These statistical variables are not for display.
1286    // The profiler will collate these across different
1287    // coalescers and display those collated statistics.
1288    m_outstandReqHist.init(10);
1289    m_latencyHist.init(10);
1290    m_missLatencyHist.init(10);
1291
1292    for (int i = 0; i < RubyRequestType_NUM; i++) {
1293        m_typeLatencyHist.push_back(new Stats::Histogram());
1294        m_typeLatencyHist[i]->init(10);
1295
1296        m_missTypeLatencyHist.push_back(new Stats::Histogram());
1297        m_missTypeLatencyHist[i]->init(10);
1298    }
1299
1300    for (int i = 0; i < MachineType_NUM; i++) {
1301        m_missMachLatencyHist.push_back(new Stats::Histogram());
1302        m_missMachLatencyHist[i]->init(10);
1303
1304        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1305        m_IssueToInitialDelayHist[i]->init(10);
1306
1307        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1308        m_InitialToForwardDelayHist[i]->init(10);
1309
1310        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1311        m_ForwardToFirstResponseDelayHist[i]->init(10);
1312
1313        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1314        m_FirstResponseToCompletionDelayHist[i]->init(10);
1315    }
1316
1317    for (int i = 0; i < RubyRequestType_NUM; i++) {
1318        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1319
1320        for (int j = 0; j < MachineType_NUM; j++) {
1321            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1322            m_missTypeMachLatencyHist[i][j]->init(10);
1323        }
1324    }
1325
1326    // GPU cache stats
1327    GPU_TCPLdHits
1328        .name(name() + ".gpu_tcp_ld_hits")
1329        .desc("loads that hit in the TCP")
1330        ;
1331    GPU_TCPLdTransfers
1332        .name(name() + ".gpu_tcp_ld_transfers")
1333        .desc("TCP to TCP load transfers")
1334        ;
1335    GPU_TCCLdHits
1336        .name(name() + ".gpu_tcc_ld_hits")
1337        .desc("loads that hit in the TCC")
1338        ;
1339    GPU_LdMiss
1340        .name(name() + ".gpu_ld_misses")
1341        .desc("loads that miss in the GPU")
1342        ;
1343
1344    GPU_TCPStHits
1345        .name(name() + ".gpu_tcp_st_hits")
1346        .desc("stores that hit in the TCP")
1347        ;
1348    GPU_TCPStTransfers
1349        .name(name() + ".gpu_tcp_st_transfers")
1350        .desc("TCP to TCP store transfers")
1351        ;
1352    GPU_TCCStHits
1353        .name(name() + ".gpu_tcc_st_hits")
1354        .desc("stores that hit in the TCC")
1355        ;
1356    GPU_StMiss
1357        .name(name() + ".gpu_st_misses")
1358        .desc("stores that miss in the GPU")
1359        ;
1360
1361    // CP cache stats
1362    CP_TCPLdHits
1363        .name(name() + ".cp_tcp_ld_hits")
1364        .desc("loads that hit in the TCP")
1365        ;
1366    CP_TCPLdTransfers
1367        .name(name() + ".cp_tcp_ld_transfers")
1368        .desc("TCP to TCP load transfers")
1369        ;
1370    CP_TCCLdHits
1371        .name(name() + ".cp_tcc_ld_hits")
1372        .desc("loads that hit in the TCC")
1373        ;
1374    CP_LdMiss
1375        .name(name() + ".cp_ld_misses")
1376        .desc("loads that miss in the GPU")
1377        ;
1378
1379    CP_TCPStHits
1380        .name(name() + ".cp_tcp_st_hits")
1381        .desc("stores that hit in the TCP")
1382        ;
1383    CP_TCPStTransfers
1384        .name(name() + ".cp_tcp_st_transfers")
1385        .desc("TCP to TCP store transfers")
1386        ;
1387    CP_TCCStHits
1388        .name(name() + ".cp_tcc_st_hits")
1389        .desc("stores that hit in the TCC")
1390        ;
1391    CP_StMiss
1392        .name(name() + ".cp_st_misses")
1393        .desc("stores that miss in the GPU")
1394        ;
1395}
1396