GPUCoalescer.cc revision 12133:ca42be3276af
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Sooraj Puthoor
34 */
35
36#include "base/misc.hh"
37#include "base/str.hh"
38#include "config/the_isa.hh"
39
40#if THE_ISA == X86_ISA
41#include "arch/x86/insts/microldstop.hh"
42
43#endif // X86_ISA
44#include "mem/ruby/system/GPUCoalescer.hh"
45
46#include "cpu/testers/rubytest/RubyTester.hh"
47#include "debug/GPUCoalescer.hh"
48#include "debug/MemoryAccess.hh"
49#include "debug/ProtocolTrace.hh"
50#include "debug/RubyPort.hh"
51#include "debug/RubyStats.hh"
52#include "gpu-compute/shader.hh"
53#include "mem/packet.hh"
54#include "mem/ruby/common/DataBlock.hh"
55#include "mem/ruby/common/SubBlock.hh"
56#include "mem/ruby/network/MessageBuffer.hh"
57#include "mem/ruby/profiler/Profiler.hh"
58#include "mem/ruby/slicc_interface/AbstractController.hh"
59#include "mem/ruby/slicc_interface/RubyRequest.hh"
60#include "mem/ruby/structures/CacheMemory.hh"
61#include "mem/ruby/system/RubySystem.hh"
62#include "params/RubyGPUCoalescer.hh"
63
64using namespace std;
65
66GPUCoalescer *
67RubyGPUCoalescerParams::create()
68{
69    return new GPUCoalescer(this);
70}
71
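// Map the memory-scope flags carried on an M5 Request to the HSAScope enum
// used by the Ruby protocol; requests without a scope fall back to
// HSAScope_UNSPECIFIED. Illustrative use (flag setup elided, since the
// exact flag names depend on the Request interface):
//
//   // request marked scoped + device-visible by the GPU core
//   // => reqScopeToHSAScope(req) == HSAScope_DEVICE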
72HSAScope
73reqScopeToHSAScope(Request* req)
74{
75    HSAScope accessScope = HSAScope_UNSPECIFIED;
76    if (req->isScoped()) {
77        if (req->isWavefrontScope()) {
78            accessScope = HSAScope_WAVEFRONT;
79        } else if (req->isWorkgroupScope()) {
80            accessScope = HSAScope_WORKGROUP;
81        } else if (req->isDeviceScope()) {
82            accessScope = HSAScope_DEVICE;
83        } else if (req->isSystemScope()) {
84            accessScope = HSAScope_SYSTEM;
85        } else {
86            fatal("Bad scope type");
87        }
88    }
89    return accessScope;
90}
91
92HSASegment
93reqSegmentToHSASegment(Request* req)
94{
95    HSASegment accessSegment = HSASegment_GLOBAL;
96
97    if (req->isGlobalSegment()) {
98        accessSegment = HSASegment_GLOBAL;
99    } else if (req->isGroupSegment()) {
100        accessSegment = HSASegment_GROUP;
101    } else if (req->isPrivateSegment()) {
102        accessSegment = HSASegment_PRIVATE;
103    } else if (req->isKernargSegment()) {
104        accessSegment = HSASegment_KERNARG;
105    } else if (req->isReadonlySegment()) {
106        accessSegment = HSASegment_READONLY;
107    } else if (req->isSpillSegment()) {
108        accessSegment = HSASegment_SPILL;
109    } else if (req->isArgSegment()) {
110        accessSegment = HSASegment_ARG;
111    } else {
112        fatal("Bad segment type");
113    }
114
115    return accessSegment;
116}
117
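// The coalescer is configured entirely from its RubyGPUCoalescer params:
// cache pointers, the outstanding-request limit, the deadlock threshold,
// the data-cache hit latency and the Garnet-standalone/RfO switches all
// come from the Python SimObject. Note that issueEvent is created with
// Progress_Event_Pri, which is intended to make it run after the other
// events scheduled for the same tick, so every memory request made in a
// cycle can reach makeRequest() before completeIssue() coalesces them.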
118GPUCoalescer::GPUCoalescer(const Params *p)
119    : RubyPort(p),
120      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
121                 false, Event::Progress_Event_Pri),
122      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
123{
124    m_store_waiting_on_load_cycles = 0;
125    m_store_waiting_on_store_cycles = 0;
126    m_load_waiting_on_store_cycles = 0;
127    m_load_waiting_on_load_cycles = 0;
128
129    m_outstanding_count = 0;
130
131    m_max_outstanding_requests = 0;
132    m_deadlock_threshold = 0;
133    m_instCache_ptr = nullptr;
134    m_dataCache_ptr = nullptr;
135
136    m_instCache_ptr = p->icache;
137    m_dataCache_ptr = p->dcache;
138    m_max_outstanding_requests = p->max_outstanding_requests;
139    m_deadlock_threshold = p->deadlock_threshold;
140
141    assert(m_max_outstanding_requests > 0);
142    assert(m_deadlock_threshold > 0);
143    assert(m_instCache_ptr);
144    assert(m_dataCache_ptr);
145
146    m_data_cache_hit_latency = p->dcache_hit_latency;
147
148    m_runningGarnetStandalone = p->garnet_standalone;
149    assumingRfOCoherence = p->assume_rfo;
150}
151
152GPUCoalescer::~GPUCoalescer()
153{
154}
155
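// Deadlock check: walk both request tables and panic if any outstanding
// request has waited longer than m_deadlock_threshold cycles. Sketch of
// the condition that trips the panic (illustrative numbers only):
//
//   // issue_time == Cycles(100), m_deadlock_threshold == 500000
//   // => panic once curCycle() - issue_time >= 500000
//
// While requests remain outstanding, the event reschedules itself one
// threshold further into the future.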
156void
157GPUCoalescer::wakeup()
158{
159    // Check for deadlock of any of the requests
160    Cycles current_time = curCycle();
161
162    // Check across all outstanding requests
163    int total_outstanding = 0;
164
165    RequestTable::iterator read = m_readRequestTable.begin();
166    RequestTable::iterator read_end = m_readRequestTable.end();
167    for (; read != read_end; ++read) {
168        GPUCoalescerRequest* request = read->second;
169        if (current_time - request->issue_time < m_deadlock_threshold)
170            continue;
171
172        panic("Possible Deadlock detected. Aborting!\n"
173             "version: %d request.paddr: 0x%x m_readRequestTable: %d "
174             "current time: %u issue_time: %d difference: %d\n", m_version,
175              request->pkt->getAddr(), m_readRequestTable.size(),
176              current_time * clockPeriod(), request->issue_time * clockPeriod(),
177              (current_time - request->issue_time)*clockPeriod());
178    }
179
180    RequestTable::iterator write = m_writeRequestTable.begin();
181    RequestTable::iterator write_end = m_writeRequestTable.end();
182    for (; write != write_end; ++write) {
183        GPUCoalescerRequest* request = write->second;
184        if (current_time - request->issue_time < m_deadlock_threshold)
185            continue;
186
187        panic("Possible Deadlock detected. Aborting!\n"
188             "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
189             "current time: %u issue_time: %d difference: %d\n", m_version,
190              request->pkt->getAddr(), m_writeRequestTable.size(),
191              current_time * clockPeriod(), request->issue_time * clockPeriod(),
192              (current_time - request->issue_time) * clockPeriod());
193    }
194
195    total_outstanding += m_writeRequestTable.size();
196    total_outstanding += m_readRequestTable.size();
197
198    assert(m_outstanding_count == total_outstanding);
199
200    if (m_outstanding_count > 0) {
201        // If there are still outstanding requests, keep checking
202        schedule(deadlockCheckEvent,
203                 m_deadlock_threshold * clockPeriod() +
204                 curTick());
205    }
206}
207
208void
209GPUCoalescer::resetStats()
210{
211    m_latencyHist.reset();
212    m_missLatencyHist.reset();
213    for (int i = 0; i < RubyRequestType_NUM; i++) {
214        m_typeLatencyHist[i]->reset();
215        m_missTypeLatencyHist[i]->reset();
216        for (int j = 0; j < MachineType_NUM; j++) {
217            m_missTypeMachLatencyHist[i][j]->reset();
218        }
219    }
220
221    for (int i = 0; i < MachineType_NUM; i++) {
222        m_missMachLatencyHist[i]->reset();
223
224        m_IssueToInitialDelayHist[i]->reset();
225        m_InitialToForwardDelayHist[i]->reset();
226        m_ForwardToFirstResponseDelayHist[i]->reset();
227        m_FirstResponseToCompletionDelayHist[i]->reset();
228    }
229}
230
231void
232GPUCoalescer::printProgress(ostream& out) const
233{
234}
235
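// Classify whether a new request can be accepted this cycle: BufferFull
// when the mandatory queue has no free slot, Aliased when the controller
// has the line blocked or an outstanding access to the same line already
// exists, and Ready otherwise. Write-class requests (stores, atomics,
// RMWs, LL/SC, flushes) are checked against both tables, as are loads and
// ifetches; only a Ready status lets makeRequest() go on to coalesce.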
236RequestStatus
237GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
238{
239    Addr line_addr = makeLineAddress(pkt->getAddr());
240
241    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
242        return RequestStatus_BufferFull;
243    }
244
245    if (m_controller->isBlocked(line_addr) &&
246       request_type != RubyRequestType_Locked_RMW_Write) {
247        return RequestStatus_Aliased;
248    }
249
250    if ((request_type == RubyRequestType_ST) ||
251        (request_type == RubyRequestType_ATOMIC) ||
252        (request_type == RubyRequestType_ATOMIC_RETURN) ||
253        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
254        (request_type == RubyRequestType_RMW_Read) ||
255        (request_type == RubyRequestType_RMW_Write) ||
256        (request_type == RubyRequestType_Load_Linked) ||
257        (request_type == RubyRequestType_Store_Conditional) ||
258        (request_type == RubyRequestType_Locked_RMW_Read) ||
259        (request_type == RubyRequestType_Locked_RMW_Write) ||
260        (request_type == RubyRequestType_FLUSH)) {
261
262        // Check if there is any outstanding read request for the same
263        // cache line.
264        if (m_readRequestTable.count(line_addr) > 0) {
265            m_store_waiting_on_load_cycles++;
266            return RequestStatus_Aliased;
267        }
268
269        if (m_writeRequestTable.count(line_addr) > 0) {
270          // There is an outstanding write request for the cache line
271          m_store_waiting_on_store_cycles++;
272          return RequestStatus_Aliased;
273        }
274    } else {
275        // Check if there is any outstanding write request for the same
276        // cache line.
277        if (m_writeRequestTable.count(line_addr) > 0) {
278            m_load_waiting_on_store_cycles++;
279            return RequestStatus_Aliased;
280        }
281
282        if (m_readRequestTable.count(line_addr) > 0) {
283            // There is an outstanding read request for the cache line
284            m_load_waiting_on_load_cycles++;
285            return RequestStatus_Aliased;
286        }
287    }
288
289    return RequestStatus_Ready;
290
291}
292
293
294
295// sets the kernelEndList
296void
297GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
298{
299    // Don't know if this will happen or is even possible,
300    // but be careful here so that it does not turn into a
301    // simulator hang in the future.
302    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
303    assert(kernelEndList.count(wavefront_id) == 0);
304
305    kernelEndList[wavefront_id] = pkt;
306    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
307            kernelEndList.size());
308}
309
310
311// Insert the request on the correct request table.  Return true if
312// the entry was already present.
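// The tables are keyed by line address, so "already present" means another
// request to the same cache line is still outstanding. completeIssue()
// treats that as a fatal error, because aliased requests should have been
// filtered out by getRequestStatus() before reaching this point.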
313bool
314GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
315{
316    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
317           pkt->req->isLockedRMW() ||
318           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
319
320    int total_outstanding M5_VAR_USED =
321        m_writeRequestTable.size() + m_readRequestTable.size();
322
323    assert(m_outstanding_count == total_outstanding);
324
325    // See if we should schedule a deadlock check
326    if (!deadlockCheckEvent.scheduled()) {
327        schedule(deadlockCheckEvent, m_deadlock_threshold * clockPeriod() + curTick());
328    }
329
330    Addr line_addr = makeLineAddress(pkt->getAddr());
331    if ((request_type == RubyRequestType_ST) ||
332        (request_type == RubyRequestType_ATOMIC) ||
333        (request_type == RubyRequestType_ATOMIC_RETURN) ||
334        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
335        (request_type == RubyRequestType_RMW_Read) ||
336        (request_type == RubyRequestType_RMW_Write) ||
337        (request_type == RubyRequestType_Load_Linked) ||
338        (request_type == RubyRequestType_Store_Conditional) ||
339        (request_type == RubyRequestType_Locked_RMW_Read) ||
340        (request_type == RubyRequestType_Locked_RMW_Write) ||
341        (request_type == RubyRequestType_FLUSH)) {
342
343        pair<RequestTable::iterator, bool> r =
344          m_writeRequestTable.insert(RequestTable::value_type(line_addr,
345                                       (GPUCoalescerRequest*) NULL));
346        if (r.second) {
347            RequestTable::iterator i = r.first;
348            i->second = new GPUCoalescerRequest(pkt, request_type,
349                                                curCycle());
350            DPRINTF(GPUCoalescer,
351                    "Inserting write request for paddr %#x for type %d\n",
352                    pkt->req->getPaddr(), i->second->m_type);
353            m_outstanding_count++;
354        } else {
355            return true;
356        }
357    } else {
358        pair<RequestTable::iterator, bool> r =
359            m_readRequestTable.insert(RequestTable::value_type(line_addr,
360                                        (GPUCoalescerRequest*) NULL));
361
362        if (r.second) {
363            RequestTable::iterator i = r.first;
364            i->second = new GPUCoalescerRequest(pkt, request_type,
365                                             curCycle());
366            DPRINTF(GPUCoalescer,
367                    "Inserting read request for paddr %#x for type %d\n",
368                    pkt->req->getPaddr(), i->second->m_type);
369            m_outstanding_count++;
370        } else {
371            return true;
372        }
373    }
374
375    m_outstandReqHist.sample(m_outstanding_count);
376
377    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
378    assert(m_outstanding_count == total_outstanding);
379
380    return false;
381}
382
383void
384GPUCoalescer::markRemoved()
385{
386    m_outstanding_count--;
387    assert(m_outstanding_count ==
388           m_writeRequestTable.size() + m_readRequestTable.size());
389}
390
391void
392GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
393{
394    assert(m_outstanding_count ==
395           m_writeRequestTable.size() + m_readRequestTable.size());
396
397    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
398    if ((srequest->m_type == RubyRequestType_ST) ||
399        (srequest->m_type == RubyRequestType_RMW_Read) ||
400        (srequest->m_type == RubyRequestType_RMW_Write) ||
401        (srequest->m_type == RubyRequestType_Load_Linked) ||
402        (srequest->m_type == RubyRequestType_Store_Conditional) ||
403        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
404        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
405        m_writeRequestTable.erase(line_addr);
406    } else {
407        m_readRequestTable.erase(line_addr);
408    }
409
410    markRemoved();
411}
412
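// LL/SC bookkeeping against the data cache's per-line lock bits. A store
// conditional succeeds only if the line is still locked by this sequencer
// version, and the outcome is reported through the request's extra data;
// a rough sketch of how a caller could consume it (illustrative only):
//
//   // if (pkt->req->getExtraData() == 0) { /* SC failed, retry */ }
//
// Ordinary stores to a line that is locked simply clear the lock.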
413bool
414GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
415{
416    //
417    // The success flag indicates whether the LLSC operation was successful.
418    // LL ops will always succeed, but SC may fail if the cache line is no
419    // longer locked.
420    //
421    bool success = true;
422    if (request->m_type == RubyRequestType_Store_Conditional) {
423        if (!m_dataCache_ptr->isLocked(address, m_version)) {
424            //
425            // For failed SC requests, indicate the failure to the cpu by
426            // setting the extra data to zero.
427            //
428            request->pkt->req->setExtraData(0);
429            success = false;
430        } else {
431            //
432            // For successful SC requests, indicate the success to the cpu by
433            // setting the extra data to one.
434            //
435            request->pkt->req->setExtraData(1);
436        }
437        //
438        // Independent of success, all SC operations must clear the lock
439        //
440        m_dataCache_ptr->clearLocked(address);
441    } else if (request->m_type == RubyRequestType_Load_Linked) {
442        //
443        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
444        // previously locked cache lines?
445        //
446        m_dataCache_ptr->setLocked(address, m_version);
447    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
448               (m_dataCache_ptr->isLocked(address, m_version))) {
449        //
450        // Normal writes should clear the locked address
451        //
452        m_dataCache_ptr->clearLocked(address);
453    }
454    return success;
455}
456
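// The shorter writeCallback() overloads below just forward to the full
// version, filling in MachineType_NULL and zero-cycle timestamps when the
// protocol does not supply them.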
457void
458GPUCoalescer::writeCallback(Addr address, DataBlock& data)
459{
460    writeCallback(address, MachineType_NULL, data);
461}
462
463void
464GPUCoalescer::writeCallback(Addr address,
465                         MachineType mach,
466                         DataBlock& data)
467{
468    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
469}
470
471void
472GPUCoalescer::writeCallback(Addr address,
473                         MachineType mach,
474                         DataBlock& data,
475                         Cycles initialRequestTime,
476                         Cycles forwardRequestTime,
477                         Cycles firstResponseTime)
478{
479    writeCallback(address, mach, data,
480                  initialRequestTime, forwardRequestTime, firstResponseTime,
481                  false);
482}
483
484void
485GPUCoalescer::writeCallback(Addr address,
486                         MachineType mach,
487                         DataBlock& data,
488                         Cycles initialRequestTime,
489                         Cycles forwardRequestTime,
490                         Cycles firstResponseTime,
491                         bool isRegion)
492{
493    assert(address == makeLineAddress(address));
494
495    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
496    assert(m_writeRequestTable.count(makeLineAddress(address)));
497
498    RequestTable::iterator i = m_writeRequestTable.find(address);
499    assert(i != m_writeRequestTable.end());
500    GPUCoalescerRequest* request = i->second;
501
502    m_writeRequestTable.erase(i);
503    markRemoved();
504
505    assert((request->m_type == RubyRequestType_ST) ||
506           (request->m_type == RubyRequestType_ATOMIC) ||
507           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
508           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
509           (request->m_type == RubyRequestType_RMW_Read) ||
510           (request->m_type == RubyRequestType_RMW_Write) ||
511           (request->m_type == RubyRequestType_Load_Linked) ||
512           (request->m_type == RubyRequestType_Store_Conditional) ||
513           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
514           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
515           (request->m_type == RubyRequestType_FLUSH));
516
517
518    //
519    // For Alpha, properly handle LL, SC, and write requests with respect to
520    // locked cache blocks.
521    //
522    // Not valid for the Garnet_standalone protocol.
523    //
524    bool success = true;
525    if (!m_runningGarnetStandalone)
526        success = handleLlsc(address, request);
527
528    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
529        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
530    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
531        m_controller->unblock(address);
532    }
533
534    hitCallback(request, mach, data, success,
535                request->issue_time, forwardRequestTime, firstResponseTime,
536                isRegion);
537}
538
539void
540GPUCoalescer::readCallback(Addr address, DataBlock& data)
541{
542    readCallback(address, MachineType_NULL, data);
543}
544
545void
546GPUCoalescer::readCallback(Addr address,
547                        MachineType mach,
548                        DataBlock& data)
549{
550    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
551}
552
553void
554GPUCoalescer::readCallback(Addr address,
555                        MachineType mach,
556                        DataBlock& data,
557                        Cycles initialRequestTime,
558                        Cycles forwardRequestTime,
559                        Cycles firstResponseTime)
560{
561
562    readCallback(address, mach, data,
563                 initialRequestTime, forwardRequestTime, firstResponseTime,
564                 false);
565}
566
567void
568GPUCoalescer::readCallback(Addr address,
569                        MachineType mach,
570                        DataBlock& data,
571                        Cycles initialRequestTime,
572                        Cycles forwardRequestTime,
573                        Cycles firstResponseTime,
574                        bool isRegion)
575{
576    assert(address == makeLineAddress(address));
577    assert(m_readRequestTable.count(makeLineAddress(address)));
578
579    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
580    RequestTable::iterator i = m_readRequestTable.find(address);
581    assert(i != m_readRequestTable.end());
582    GPUCoalescerRequest* request = i->second;
583
584    m_readRequestTable.erase(i);
585    markRemoved();
586
587    assert((request->m_type == RubyRequestType_LD) ||
588           (request->m_type == RubyRequestType_IFETCH));
589
590    hitCallback(request, mach, data, true,
591                request->issue_time, forwardRequestTime, firstResponseTime,
592                isRegion);
593}
594
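// Completion path shared by reads and writes: touch the MRU state, record
// latency, then move data between the Ruby DataBlock and every packet that
// was coalesced onto this line. Illustrative data movement, assuming a
// 64-byte Ruby block and two coalesced 4-byte loads at offsets 0x0 and 0x4:
//
//   // memcpy(pkt0->getPtr<uint8_t>(), data.getData(0x0, 4), 4);
//   // memcpy(pkt1->getPtr<uint8_t>(), data.getData(0x4, 4), 4);
//
// Stores go the other way through data.setData().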
595void
596GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
597                       MachineType mach,
598                       DataBlock& data,
599                       bool success,
600                       Cycles initialRequestTime,
601                       Cycles forwardRequestTime,
602                       Cycles firstResponseTime,
603                       bool isRegion)
604{
605    PacketPtr pkt = srequest->pkt;
606    Addr request_address = pkt->getAddr();
607    Addr request_line_address = makeLineAddress(request_address);
608
609    RubyRequestType type = srequest->m_type;
610
611    // Set this cache entry to the most recently used
612    if (type == RubyRequestType_IFETCH) {
613        if (m_instCache_ptr->isTagPresent(request_line_address))
614            m_instCache_ptr->setMRU(request_line_address);
615    } else {
616        if (m_dataCache_ptr->isTagPresent(request_line_address))
617            m_dataCache_ptr->setMRU(request_line_address);
618    }
619
620    recordMissLatency(srequest, mach,
621                      initialRequestTime,
622                      forwardRequestTime,
623                      firstResponseTime,
624                      success, isRegion);
625    // update the data
626    //
627    // NOTE: this must be done for each request in the coalescer list
628    int len = reqCoalescer[request_line_address].size();
629    std::vector<PacketPtr> mylist;
630    for (int i = 0; i < len; ++i) {
631        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
632        assert(type == reqCoalescer[request_line_address][i].primaryType);
633        request_address = pkt->getAddr();
634        request_line_address = makeLineAddress(pkt->getAddr());
635        if (pkt->getPtr<uint8_t>()) {
636            if ((type == RubyRequestType_LD) ||
637                (type == RubyRequestType_ATOMIC) ||
638                (type == RubyRequestType_ATOMIC_RETURN) ||
639                (type == RubyRequestType_IFETCH) ||
640                (type == RubyRequestType_RMW_Read) ||
641                (type == RubyRequestType_Locked_RMW_Read) ||
642                (type == RubyRequestType_Load_Linked)) {
643                memcpy(pkt->getPtr<uint8_t>(),
644                       data.getData(getOffset(request_address),
645                                    pkt->getSize()),
646                       pkt->getSize());
647            } else {
648                data.setData(pkt->getPtr<uint8_t>(),
649                             getOffset(request_address), pkt->getSize());
650            }
651        } else {
652            DPRINTF(MemoryAccess,
653                    "WARNING.  Data not transferred from Ruby to M5 for type " \
654                    "%s\n",
655                    RubyRequestType_to_string(type));
656        }
657
658        // If using the RubyTester, update the RubyTester sender state's
659        // subBlock with the received data.  The tester will later access
660        // this state.
661        // Note: RubyPort will access its sender state before the
662        // RubyTester.
663        if (m_usingRubyTester) {
664            RubyPort::SenderState *requestSenderState =
665                safe_cast<RubyPort::SenderState*>(pkt->senderState);
666            RubyTester::SenderState* testerSenderState =
667                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
668            testerSenderState->subBlock.mergeFrom(data);
669        }
670
671        mylist.push_back(pkt);
672    }
673    delete srequest;
674    reqCoalescer.erase(request_line_address);
675    assert(!reqCoalescer.count(request_line_address));
676
677
678
679    completeHitCallback(mylist, len);
680}
681
682bool
683GPUCoalescer::empty() const
684{
685    return m_writeRequestTable.empty() && m_readRequestTable.empty();
686}
687
688// Analyzes the packet to see if this request can be coalesced.
689// If the request can be coalesced, it is added to the reqCoalescer table
690// and makeRequest returns RequestStatus_Issued.
691// If this is the first request to a cacheline, the request is added to both
692// the newRequests queue and the reqCoalescer table; makeRequest
693// returns RequestStatus_Issued.
694// If there is a pending request to this cacheline and this request
695// can't be coalesced, RequestStatus_Aliased is returned and
696// the packet needs to be reissued.
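// Illustrative coalescing sequence (addresses and the 64-byte line size
// are assumptions made for the example):
//
//   // cycle N: LD 0x1000 -> first access, added to newRequests+reqCoalescer
//   // cycle N: LD 0x1020 -> same line, appended to reqCoalescer[0x1000]
//   // cycle N: ST 0x1010 -> same line, different primary type -> Aliased
//   // end of cycle N: completeIssue() issues one RubyRequest for 0x1000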
697RequestStatus
698GPUCoalescer::makeRequest(PacketPtr pkt)
699{
700    // Check for GPU Barrier Kernel End or Kernel Begin
701    // Leave these to be handled by the child class
702    // Kernel End/Barrier = isFlush + isRelease
703    // Kernel Begin = isFlush + isAcquire
704    if (pkt->req->isKernel()) {
705        if (pkt->req->isAcquire()){
706            // This is a Kernel Begin leave handling to
707            // virtual xCoalescer::makeRequest
708            return RequestStatus_Issued;
709        }else if (pkt->req->isRelease()) {
710            // This is a Kernel End leave handling to
711            // virtual xCoalescer::makeRequest
712            // If we are here then we didn't call
713            // a virtual version of this function
714            // so we will also schedule the callback
715            int wf_id = 0;
716            if (pkt->req->hasContextId()) {
717                wf_id = pkt->req->contextId();
718            }
719            insertKernel(wf_id, pkt);
720            newKernelEnds.push_back(wf_id);
721            if (!issueEvent.scheduled()) {
722                schedule(issueEvent, curTick());
723            }
724            return RequestStatus_Issued;
725        }
726    }
727
728    // If the number of outstanding requests is greater than the max allowed,
729    // return RequestStatus_BufferFull. This logic can be extended to
730    // support proper backpressure.
731    if (m_outstanding_count >= m_max_outstanding_requests) {
732        return RequestStatus_BufferFull;
733    }
734
735    RubyRequestType primary_type = RubyRequestType_NULL;
736    RubyRequestType secondary_type = RubyRequestType_NULL;
737
738    if (pkt->isLLSC()) {
739        //
740        // Alpha LL/SC instructions need to be handled carefully by the cache
741        // coherence protocol to ensure they follow the proper semantics. In
742        // particular, by identifying the operations as atomic, the protocol
743        // should understand that migratory sharing optimizations should not
744        // be performed (i.e. a load between the LL and SC should not steal
745        // away exclusive permission).
746        //
747        if (pkt->isWrite()) {
748            primary_type = RubyRequestType_Store_Conditional;
749        } else {
750            assert(pkt->isRead());
751            primary_type = RubyRequestType_Load_Linked;
752        }
753        secondary_type = RubyRequestType_ATOMIC;
754    } else if (pkt->req->isLockedRMW()) {
755        //
756        // x86 locked instructions are translated to store cache coherence
757        // requests because these requests should always be treated as read
758        // exclusive operations and should leverage any migratory sharing
759        // optimization built into the protocol.
760        //
761        if (pkt->isWrite()) {
762            primary_type = RubyRequestType_Locked_RMW_Write;
763        } else {
764            assert(pkt->isRead());
765            primary_type = RubyRequestType_Locked_RMW_Read;
766        }
767        secondary_type = RubyRequestType_ST;
768    } else if (pkt->isAtomicOp()) {
769        //
770        // GPU Atomic Operation
771        //
772        primary_type = RubyRequestType_ATOMIC;
773        secondary_type = RubyRequestType_ATOMIC;
774    } else {
775        if (pkt->isRead()) {
776            if (pkt->req->isInstFetch()) {
777                primary_type = secondary_type = RubyRequestType_IFETCH;
778            } else {
779#if THE_ISA == X86_ISA
780                uint32_t flags = pkt->req->getFlags();
781                bool storeCheck = flags &
782                        (TheISA::StoreCheck << TheISA::FlagShift);
783#else
784                bool storeCheck = false;
785#endif // X86_ISA
786                if (storeCheck) {
787                    primary_type = RubyRequestType_RMW_Read;
788                    secondary_type = RubyRequestType_ST;
789                } else {
790                    primary_type = secondary_type = RubyRequestType_LD;
791                }
792            }
793        } else if (pkt->isWrite()) {
794            //
795            // Note: M5 packets do not differentiate ST from RMW_Write
796            //
797            primary_type = secondary_type = RubyRequestType_ST;
798        } else if (pkt->isFlush()) {
799            primary_type = secondary_type = RubyRequestType_FLUSH;
800        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
801            if (assumingRfOCoherence) {
802                // If we reached here, this request must be a memFence
803                // and the protocol implements RfO; the coalescer can
804                // assume sequential consistency and schedule the callback
805                // immediately.
806                // Currently the code implements fence callbacks
807                // by reusing the mechanism for kernel completions.
808                // This should be fixed.
809                int wf_id = 0;
810                if (pkt->req->hasContextId()) {
811                    wf_id = pkt->req->contextId();
812                }
813                insertKernel(wf_id, pkt);
814                newKernelEnds.push_back(wf_id);
815                if (!issueEvent.scheduled()) {
816                    schedule(issueEvent, curTick());
817                }
818                return RequestStatus_Issued;
819            } else {
820                // If not RfO, return issued here and let the child coalescer
821                // take care of it.
822                return RequestStatus_Issued;
823            }
824        } else {
825            panic("Unsupported ruby packet type\n");
826        }
827    }
828
829    // Check if there is any pending request to this cache line from
830    // previous cycles.
831    // If there is a pending request, return aliased. Since coalescing
832    // across time is not permitted, aliased requests are not coalesced.
833    // If a request for this address has already been issued, we must block
834    RequestStatus status = getRequestStatus(pkt, primary_type);
835    if (status != RequestStatus_Ready)
836        return status;
837
838    Addr line_addr = makeLineAddress(pkt->getAddr());
839
840    // Check if this request can be coalesced with previous
841    // requests from this cycle.
842    if (!reqCoalescer.count(line_addr)) {
843        // This is the first access to this cache line.
844        // A new request to the memory subsystem has to be
845        // made in the next cycle for this cache line, so
846        // add this line addr to the "newRequests" queue
847        newRequests.push_back(line_addr);
848
849    // There was a request to this cache line in this cycle,
850    // let us see if we can coalesce this request with the previous
851    // requests from this cycle
852    } else if (primary_type !=
853               reqCoalescer[line_addr][0].primaryType) {
854        // can't coalesce loads, stores and atomics!
855        return RequestStatus_Aliased;
856    } else if (pkt->req->isLockedRMW() ||
857               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
858        // can't coalesce locked accesses, but can coalesce atomics!
859        return RequestStatus_Aliased;
860    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
861               pkt->req->contextId() !=
862               reqCoalescer[line_addr][0].pkt->req->contextId()) {
863        // can't coalesce releases from different wavefronts
864        return RequestStatus_Aliased;
865    }
866
867    // in addition to the packet, we need to save both request types
868    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
869    if (!issueEvent.scheduled())
870        schedule(issueEvent, curTick());
871    // TODO: issue hardware prefetches here
872    return RequestStatus_Issued;
873}
874
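// Build a single RubyRequest covering the whole line: the access mask marks
// every byte touched by the coalesced packets, the DataBlock carries the
// store data, and atomic operations are collected as
// (offset, AtomicOpFunctor*) pairs. Sketch of the mask for two stores
// (offsets are illustrative, block size assumed to be 64 bytes):
//
//   // 4-byte store at line offset 8  -> accessMask[8..11]  = true
//   // 8-byte store at line offset 32 -> accessMask[32..39] = true
//
// The message is then enqueued on the mandatory queue with the configured
// data-cache hit latency.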
875void
876GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
877{
878
879    int proc_id = -1;
880    if (pkt != NULL && pkt->req->hasContextId()) {
881        proc_id = pkt->req->contextId();
882    }
883
884    // If valid, copy the pc to the ruby request
885    Addr pc = 0;
886    if (pkt->req->hasPC()) {
887        pc = pkt->req->getPC();
888    }
889
890    // At the moment, setting scopes only counts
891    // for GPU spill-space accesses, which is
892    // what pkt->req->isStack() indicates. This
893    // scope is REPLACE since it does not need to
894    // be flushed at the end of a kernel; private
895    // and local may need to be visible at the
896    // end of the kernel.
897    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
898    HSAScope accessScope = reqScopeToHSAScope(pkt->req);
899
900    Addr line_addr = makeLineAddress(pkt->getAddr());
901
902    // Create a WriteMask that records written bytes
903    // and atomic operations. This enables partial writes
904    // and partial reads of those writes.
905    DataBlock dataBlock;
906    dataBlock.clear();
907    uint32_t blockSize = RubySystem::getBlockSizeBytes();
908    std::vector<bool> accessMask(blockSize,false);
909    std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
910    uint32_t tableSize = reqCoalescer[line_addr].size();
911    for (int i = 0; i < tableSize; i++) {
912        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
913        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
914        uint32_t tmpSize = tmpPkt->getSize();
915        if (tmpPkt->isAtomicOp()) {
916            std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
917                                                        tmpPkt->getAtomicOp());
918            atomicOps.push_back(tmpAtomicOp);
919        } else if (tmpPkt->isWrite()) {
920            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
921                              tmpOffset, tmpSize);
922        }
923        for (int j = 0; j < tmpSize; j++) {
924            accessMask[tmpOffset + j] = true;
925        }
926    }
927    std::shared_ptr<RubyRequest> msg;
928    if (pkt->isAtomicOp()) {
929        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
930                              pkt->getPtr<uint8_t>(),
931                              pkt->getSize(), pc, secondary_type,
932                              RubyAccessMode_Supervisor, pkt,
933                              PrefetchBit_No, proc_id, 100,
934                              blockSize, accessMask,
935                              dataBlock, atomicOps,
936                              accessScope, accessSegment);
937    } else {
938        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
939                              pkt->getPtr<uint8_t>(),
940                              pkt->getSize(), pc, secondary_type,
941                              RubyAccessMode_Supervisor, pkt,
942                              PrefetchBit_No, proc_id, 100,
943                              blockSize, accessMask,
944                              dataBlock,
945                              accessScope, accessSegment);
946    }
947    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
948             curTick(), m_version, "Coal", "Begin", "", "",
949             printAddress(msg->getPhysicalAddress()),
950             RubyRequestType_to_string(secondary_type));
951
952    fatal_if(secondary_type == RubyRequestType_IFETCH,
953             "there should not be any I-Fetch requests in the GPU Coalescer");
954
955    // Send the message to the cache controller
956    fatal_if(m_data_cache_hit_latency == 0,
957             "should not have a latency of zero");
958
959    assert(m_mandatory_q_ptr);
960    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
961}
962
963template <class KEY, class VALUE>
964std::ostream &
965operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
966{
967    out << "[";
968    for (auto i = map.begin(); i != map.end(); ++i)
969        out << " " << i->first << "=" << i->second;
970    out << " ]";
971
972    return out;
973}
974
975void
976GPUCoalescer::print(ostream& out) const
977{
978    out << "[GPUCoalescer: " << m_version
979        << ", outstanding requests: " << m_outstanding_count
980        << ", read request table: " << m_readRequestTable
981        << ", write request table: " << m_writeRequestTable
982        << "]";
983}
984
985// This can be called from setState whenever coherence permissions are
986// upgraded. When invoked, coherence violations will be checked for the
987// given block.
988void
989GPUCoalescer::checkCoherence(Addr addr)
990{
991#ifdef CHECK_COHERENCE
992    m_ruby_system->checkGlobalCoherenceInvariant(addr);
993#endif
994}
995
996void
997GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
998    DPRINTF(RubyStats, "Recorded statistic: %s\n",
999            SequencerRequestType_to_string(requestType));
1000}
1001
1002
1003void
1004GPUCoalescer::completeIssue()
1005{
1006    // newRequests has the cacheline addresses of all the
1007    // requests which need to be issued to the memory subsystem
1008    // in this cycle
1009    int len = newRequests.size();
1010    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1011    for (int i = 0; i < len; ++i) {
1012        // Get the requests from reqCoalescer table. Get only the
1013        // first request for each cacheline; the remaining requests
1014        // can be coalesced with the first request. So, only
1015        // one request is issued per cacheline.
1016        RequestDesc info = reqCoalescer[newRequests[i]][0];
1017        PacketPtr pkt = info.pkt;
1018        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1019                i, pkt->req->getPaddr());
1020        // Insert this request to the read/writeRequestTables. These tables
1021        // are used to track aliased requests in the makeRequest() subroutine.
1022        bool found = insertRequest(pkt, info.primaryType);
1023
1024        if (found) {
1025            panic("GPUCoalescer::makeRequest should never be called if the "
1026                  "request is already outstanding\n");
1027        }
1028
1029        // Issue request to ruby subsystem
1030        issueRequest(pkt, info.secondaryType);
1031    }
1032    newRequests.clear();
1033
1034    // Issue callbacks for any Kernel End releases recorded this cycle.
1035    len = newKernelEnds.size();
1036    for (int i = 0; i < len; i++) {
1037        kernelCallback(newKernelEnds[i]);
1038    }
1039    newKernelEnds.clear();
1040}
1041
1042void
1043GPUCoalescer::evictionCallback(Addr address)
1044{
1045    ruby_eviction_callback(address);
1046}
1047
1048void
1049GPUCoalescer::kernelCallback(int wavefront_id)
1050{
1051    assert(kernelEndList.count(wavefront_id));
1052
1053    ruby_hit_callback(kernelEndList[wavefront_id]);
1054
1055    kernelEndList.erase(wavefront_id);
1056}
1057
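// Completion path for GPU atomics. It mirrors writeCallback()/hitCallback()
// but skips the MRU update, and the DataBlock returned by the protocol
// holds the pre-operation values, which are copied into every coalesced
// packet unless the request was an ATOMIC_NO_RETURN.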
1058void
1059GPUCoalescer::atomicCallback(Addr address,
1060                             MachineType mach,
1061                             const DataBlock& data)
1062{
1063    assert(address == makeLineAddress(address));
1064
1065    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1066    assert(m_writeRequestTable.count(makeLineAddress(address)));
1067
1068    RequestTable::iterator i = m_writeRequestTable.find(address);
1069    assert(i != m_writeRequestTable.end());
1070    GPUCoalescerRequest* srequest = i->second;
1071
1072    m_writeRequestTable.erase(i);
1073    markRemoved();
1074
1075    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1076           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1077           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1078
1079
1080    // Atomics don't write to cache, so there is no MRU update...
1081
1082    recordMissLatency(srequest, mach,
1083                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
1084
1085    PacketPtr pkt = srequest->pkt;
1086    Addr request_address = pkt->getAddr();
1087    Addr request_line_address = makeLineAddress(pkt->getAddr());
1088
1089    int len = reqCoalescer[request_line_address].size();
1090    std::vector<PacketPtr> mylist;
1091    for (int i = 0; i < len; ++i) {
1092        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1093        assert(srequest->m_type ==
1094               reqCoalescer[request_line_address][i].primaryType);
1095        request_address = (pkt->getAddr());
1096        request_line_address = makeLineAddress(request_address);
1097        if (pkt->getPtr<uint8_t>() &&
1098            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1099            /* atomics are done in memory, and return the data *before* the atomic op... */
1100            memcpy(pkt->getPtr<uint8_t>(),
1101                   data.getData(getOffset(request_address),
1102                                pkt->getSize()),
1103                   pkt->getSize());
1104        } else {
1105            DPRINTF(MemoryAccess,
1106                    "WARNING.  Data not transferred from Ruby to M5 for type " \
1107                    "%s\n",
1108                    RubyRequestType_to_string(srequest->m_type));
1109        }
1110
1111        // If using the RubyTester, update the RubyTester sender state's
1112        // subBlock with the received data.  The tester will later access
1113        // this state.
1114        // Note: RubyPort will access its sender state before the
1115        // RubyTester.
1116        if (m_usingRubyTester) {
1117            RubyPort::SenderState *requestSenderState =
1118                safe_cast<RubyPort::SenderState*>(pkt->senderState);
1119            RubyTester::SenderState* testerSenderState =
1120                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1121            testerSenderState->subBlock.mergeFrom(data);
1122        }
1123
1124        mylist.push_back(pkt);
1125    }
1126    delete srequest;
1127    reqCoalescer.erase(request_line_address);
1128    assert(!reqCoalescer.count(request_line_address));
1129
1130    completeHitCallback(mylist, len);
1131}
1132
1133void
1134GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1135{
1136    if (myMachID == senderMachID) {
1137        CP_TCPLdHits++;
1138    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1139        CP_TCPLdTransfers++;
1140    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1141        CP_TCCLdHits++;
1142    } else {
1143        CP_LdMiss++;
1144    }
1145}
1146
1147void
1148GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1149{
1150    if (myMachID == senderMachID) {
1151        CP_TCPStHits++;
1152    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1153        CP_TCPStTransfers++;
1154    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1155        CP_TCCStHits++;
1156    } else {
1157        CP_StMiss++;
1158    }
1159}
1160
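// Hand each coalesced packet back to the port it arrived on: restore the
// RubyPort sender state saved on entry, invoke the port's hitCallback(),
// and give any stalled ports a chance to retry.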
1161void
1162GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1163{
1164    for (int i = 0; i < len; ++i) {
1165        RubyPort::SenderState *ss =
1166            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1167        MemSlavePort *port = ss->port;
1168        assert(port != NULL);
1169
1170        mylist[i]->senderState = ss->predecessor;
1171        delete ss;
1172        port->hitCallback(mylist[i]);
1173        trySendRetries();
1174    }
1175
1176    testDrainComplete();
1177}
1178
1179PacketPtr
1180GPUCoalescer::mapAddrToPkt(Addr address)
1181{
1182    RequestTable::iterator i = m_readRequestTable.find(address);
1183    assert(i != m_readRequestTable.end());
1184    GPUCoalescerRequest* request = i->second;
1185    return request->pkt;
1186}
1187
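// Latency accounting for a completed request: the issue-to-completion time
// is sampled into the overall, per-type and per-machine histograms, and,
// when the protocol supplies the intermediate timestamps, broken into four
// legs. Worked example with illustrative numbers:
//
//   // issued=100, initial=102, forward=110, firstResp=130, done=140
//   // IssueToInitial=2, InitialToForward=8,
//   // ForwardToFirstResponse=20, FirstResponseToCompletion=10
//
// The per-cache hit/transfer/miss counters are only meaningful for the RfO
// protocol, as the comment below notes.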
1188void
1189GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1190                                MachineType mach,
1191                                Cycles initialRequestTime,
1192                                Cycles forwardRequestTime,
1193                                Cycles firstResponseTime,
1194                                bool success, bool isRegion)
1195{
1196    RubyRequestType type = srequest->m_type;
1197    Cycles issued_time = srequest->issue_time;
1198    Cycles completion_time = curCycle();
1199    assert(completion_time >= issued_time);
1200    Cycles total_lat = completion_time - issued_time;
1201
1202    // cache stats (valid for RfO protocol only)
1203    if (mach == MachineType_TCP) {
1204        if (type == RubyRequestType_LD) {
1205            GPU_TCPLdHits++;
1206        } else {
1207            GPU_TCPStHits++;
1208        }
1209    } else if (mach == MachineType_L1Cache_wCC) {
1210        if (type == RubyRequestType_LD) {
1211            GPU_TCPLdTransfers++;
1212        } else {
1213            GPU_TCPStTransfers++;
1214        }
1215    } else if (mach == MachineType_TCC) {
1216        if (type == RubyRequestType_LD) {
1217            GPU_TCCLdHits++;
1218        } else {
1219            GPU_TCCStHits++;
1220        }
1221    } else  {
1222        if (type == RubyRequestType_LD) {
1223            GPU_LdMiss++;
1224        } else {
1225            GPU_StMiss++;
1226        }
1227    }
1228
1229    // Profile all access latency, even zero latency accesses
1230    m_latencyHist.sample(total_lat);
1231    m_typeLatencyHist[type]->sample(total_lat);
1232
1233    // Profile the miss latency for all non-zero demand misses
1234    if (total_lat != Cycles(0)) {
1235        m_missLatencyHist.sample(total_lat);
1236        m_missTypeLatencyHist[type]->sample(total_lat);
1237
1238        if (mach != MachineType_NUM) {
1239            m_missMachLatencyHist[mach]->sample(total_lat);
1240            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1241
1242            if ((issued_time <= initialRequestTime) &&
1243                (initialRequestTime <= forwardRequestTime) &&
1244                (forwardRequestTime <= firstResponseTime) &&
1245                (firstResponseTime <= completion_time)) {
1246
1247                m_IssueToInitialDelayHist[mach]->sample(
1248                    initialRequestTime - issued_time);
1249                m_InitialToForwardDelayHist[mach]->sample(
1250                    forwardRequestTime - initialRequestTime);
1251                m_ForwardToFirstResponseDelayHist[mach]->sample(
1252                    firstResponseTime - forwardRequestTime);
1253                m_FirstResponseToCompletionDelayHist[mach]->sample(
1254                    completion_time - firstResponseTime);
1255            }
1256        }
1257
1258    }
1259
1260    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1261             curTick(), m_version, "Coal",
1262             success ? "Done" : "SC_Failed", "", "",
1263             printAddress(srequest->pkt->getAddr()), total_lat);
1264}
1265
1266void
1267GPUCoalescer::regStats()
1268{
1269    RubyPort::regStats();
1270
1271    // These statistical variables are not for display.
1272    // The profiler will collate these across different
1273    // coalescers and display those collated statistics.
1274    m_outstandReqHist.init(10);
1275    m_latencyHist.init(10);
1276    m_missLatencyHist.init(10);
1277
1278    for (int i = 0; i < RubyRequestType_NUM; i++) {
1279        m_typeLatencyHist.push_back(new Stats::Histogram());
1280        m_typeLatencyHist[i]->init(10);
1281
1282        m_missTypeLatencyHist.push_back(new Stats::Histogram());
1283        m_missTypeLatencyHist[i]->init(10);
1284    }
1285
1286    for (int i = 0; i < MachineType_NUM; i++) {
1287        m_missMachLatencyHist.push_back(new Stats::Histogram());
1288        m_missMachLatencyHist[i]->init(10);
1289
1290        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1291        m_IssueToInitialDelayHist[i]->init(10);
1292
1293        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1294        m_InitialToForwardDelayHist[i]->init(10);
1295
1296        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1297        m_ForwardToFirstResponseDelayHist[i]->init(10);
1298
1299        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1300        m_FirstResponseToCompletionDelayHist[i]->init(10);
1301    }
1302
1303    for (int i = 0; i < RubyRequestType_NUM; i++) {
1304        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1305
1306        for (int j = 0; j < MachineType_NUM; j++) {
1307            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1308            m_missTypeMachLatencyHist[i][j]->init(10);
1309        }
1310    }
1311
1312    // GPU cache stats
1313    GPU_TCPLdHits
1314        .name(name() + ".gpu_tcp_ld_hits")
1315        .desc("loads that hit in the TCP")
1316        ;
1317    GPU_TCPLdTransfers
1318        .name(name() + ".gpu_tcp_ld_transfers")
1319        .desc("TCP to TCP load transfers")
1320        ;
1321    GPU_TCCLdHits
1322        .name(name() + ".gpu_tcc_ld_hits")
1323        .desc("loads that hit in the TCC")
1324        ;
1325    GPU_LdMiss
1326        .name(name() + ".gpu_ld_misses")
1327        .desc("loads that miss in the GPU")
1328        ;
1329
1330    GPU_TCPStHits
1331        .name(name() + ".gpu_tcp_st_hits")
1332        .desc("stores that hit in the TCP")
1333        ;
1334    GPU_TCPStTransfers
1335        .name(name() + ".gpu_tcp_st_transfers")
1336        .desc("TCP to TCP store transfers")
1337        ;
1338    GPU_TCCStHits
1339        .name(name() + ".gpu_tcc_st_hits")
1340        .desc("stores that hit in the TCC")
1341        ;
1342    GPU_StMiss
1343        .name(name() + ".gpu_st_misses")
1344        .desc("stores that miss in the GPU")
1345        ;
1346
1347    // CP cache stats
1348    CP_TCPLdHits
1349        .name(name() + ".cp_tcp_ld_hits")
1350        .desc("loads that hit in the TCP")
1351        ;
1352    CP_TCPLdTransfers
1353        .name(name() + ".cp_tcp_ld_transfers")
1354        .desc("TCP to TCP load transfers")
1355        ;
1356    CP_TCCLdHits
1357        .name(name() + ".cp_tcc_ld_hits")
1358        .desc("loads that hit in the TCC")
1359        ;
1360    CP_LdMiss
1361        .name(name() + ".cp_ld_misses")
1362        .desc("loads that miss in the GPU")
1363        ;
1364
1365    CP_TCPStHits
1366        .name(name() + ".cp_tcp_st_hits")
1367        .desc("stores that hit in the TCP")
1368        ;
1369    CP_TCPStTransfers
1370        .name(name() + ".cp_tcp_st_transfers")
1371        .desc("TCP to TCP store transfers")
1372        ;
1373    CP_TCCStHits
1374        .name(name() + ".cp_tcc_st_hits")
1375        .desc("stores that hit in the TCC")
1376        ;
1377    CP_StMiss
1378        .name(name() + ".cp_st_misses")
1379        .desc("stores that miss in the GPU")
1380        ;
1381}
1382