/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Sooraj Puthoor
 */

#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"
#endif // X86_ISA

#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

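// Translate the HSA memory scope flags carried on a gem5 memory request
// into the corresponding Ruby HSAScope value; unscoped requests map to
// HSAScope_UNSPECIFIED.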
HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

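// The coalescer is configured from its Python params: the instruction and
// data cache pointers, the maximum number of outstanding requests, the
// deadlock-check threshold, and whether the protocol is assumed to provide
// read-for-ownership (RfO) coherence.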
GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check")
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    m_max_outstanding_requests = 0;
    m_deadlock_threshold = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
{
}

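// Deadlock check handler: scan all outstanding read and write requests and
// panic if any of them has been waiting longer than the deadlock threshold;
// otherwise re-arm the check while requests remain outstanding.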
void
GPUCoalescer::wakeup()
{
    // Check for deadlock of any of the requests
    Cycles current_time = curCycle();

    // Check across all outstanding requests
    int total_outstanding = 0;

    RequestTable::iterator read = m_readRequestTable.begin();
    RequestTable::iterator read_end = m_readRequestTable.end();
    for (; read != read_end; ++read) {
        GPUCoalescerRequest* request = read->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
             "version: %d request.paddr: 0x%x m_readRequestTable: %d "
             "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_readRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    RequestTable::iterator write = m_writeRequestTable.begin();
    RequestTable::iterator write_end = m_writeRequestTable.end();
    for (; write != write_end; ++write) {
        GPUCoalescerRequest* request = write->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
             "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
             "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_writeRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    total_outstanding += m_writeRequestTable.size();
    total_outstanding += m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    if (m_outstanding_count > 0) {
        // If there are still outstanding requests, keep checking
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

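// Decide whether a request of the given type can be issued for this packet
// right now: BufferFull if the mandatory queue has no free slot, Aliased if
// the line is blocked or already has a conflicting outstanding request, and
// Ready otherwise.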
RequestStatus
GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
{
    Addr line_addr = makeLineAddress(pkt->getAddr());

    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
        return RequestStatus_BufferFull;
    }

    if (m_controller->isBlocked(line_addr) &&
       request_type != RubyRequestType_Locked_RMW_Write) {
        return RequestStatus_Aliased;
    }

    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        // Check if there is any outstanding read request for the same
        // cache line.
        if (m_readRequestTable.count(line_addr) > 0) {
            m_store_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_writeRequestTable.count(line_addr) > 0) {
            // There is an outstanding write request for the cache line
            m_store_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }
    } else {
        // Check if there is any outstanding write request for the same
        // cache line.
        if (m_writeRequestTable.count(line_addr) > 0) {
            m_load_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_readRequestTable.count(line_addr) > 0) {
            // There is an outstanding read request for the cache line
            m_load_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }
    }

    return RequestStatus_Ready;
}

// Sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear whether this can actually happen, but be careful
    // here so that it cannot turn into a simulator hang in the future.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}


// Insert the request on the correct request table.  Return true if
// the entry was already present.
bool
GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
{
    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
           pkt->req->isLockedRMW() ||
           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));

    int total_outstanding M5_VAR_USED =
        m_writeRequestTable.size() + m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    // See if we should schedule a deadlock check
    if (!deadlockCheckEvent.scheduled()) {
        // Schedule the check one deadlock-threshold's worth of ticks from
        // now, matching the rescheduling done in wakeup().
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() + curTick());
    }

    Addr line_addr = makeLineAddress(pkt->getAddr());
    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        pair<RequestTable::iterator, bool> r =
            m_writeRequestTable.insert(RequestTable::value_type(line_addr,
                                       (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting write request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    } else {
        pair<RequestTable::iterator, bool> r =
            m_readRequestTable.insert(RequestTable::value_type(line_addr,
                                        (GPUCoalescerRequest*) NULL));

        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting read request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    }

    m_outstandReqHist.sample(m_outstanding_count);

    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
    assert(m_outstanding_count == total_outstanding);

    return false;
}

void
GPUCoalescer::markRemoved()
{
    m_outstanding_count--;
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());
}

void
GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
{
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());

    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
    if ((srequest->m_type == RubyRequestType_ST) ||
        (srequest->m_type == RubyRequestType_RMW_Read) ||
        (srequest->m_type == RubyRequestType_RMW_Write) ||
        (srequest->m_type == RubyRequestType_Load_Linked) ||
        (srequest->m_type == RubyRequestType_Store_Conditional) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
        m_writeRequestTable.erase(line_addr);
    } else {
        m_readRequestTable.erase(line_addr);
    }

    markRemoved();
}

bool
GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
{
    //
    // The success flag indicates whether the LLSC operation was successful.
    // LL ops will always succeed, but SC may fail if the cache line is no
    // longer locked.
    //
    bool success = true;
    if (request->m_type == RubyRequestType_Store_Conditional) {
        if (!m_dataCache_ptr->isLocked(address, m_version)) {
            //
            // For failed SC requests, indicate the failure to the cpu by
            // setting the extra data to zero.
            //
            request->pkt->req->setExtraData(0);
            success = false;
        } else {
            //
            // For successful SC requests, indicate the success to the cpu by
            // setting the extra data to one.
            //
            request->pkt->req->setExtraData(1);
        }
        //
        // Independent of success, all SC operations must clear the lock
        //
        m_dataCache_ptr->clearLocked(address);
    } else if (request->m_type == RubyRequestType_Load_Linked) {
        //
        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
        // previously locked cache lines?
        //
        m_dataCache_ptr->setLocked(address, m_version);
    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
               (m_dataCache_ptr->isLocked(address, m_version))) {
        //
        // Normal writes should clear the locked address
        //
        m_dataCache_ptr->clearLocked(address);
    }
    return success;
}

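// The writeCallback overloads below fill in default machine type, timing,
// and region arguments before reaching the full implementation.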
void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                         MachineType mach,
                         DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                         MachineType mach,
                         DataBlock& data,
                         Cycles initialRequestTime,
                         Cycles forwardRequestTime,
                         Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                         MachineType mach,
                         DataBlock& data,
                         Cycles initialRequestTime,
                         Cycles forwardRequestTime,
                         Cycles firstResponseTime,
                         bool isRegion)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_ST) ||
           (request->m_type == RubyRequestType_ATOMIC) ||
           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
           (request->m_type == RubyRequestType_RMW_Read) ||
           (request->m_type == RubyRequestType_RMW_Write) ||
           (request->m_type == RubyRequestType_Load_Linked) ||
           (request->m_type == RubyRequestType_Store_Conditional) ||
           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
           (request->m_type == RubyRequestType_FLUSH));

    //
    // For Alpha, properly handle LL, SC, and write requests with respect to
    // locked cache blocks.
    //
    // Not valid for the Garnet_standalone protocol.
    //
    bool success = true;
    if (!m_runningGarnetStandalone)
        success = handleLlsc(address, request);

    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
        m_controller->unblock(address);
    }

    hitCallback(request, mach, data, success,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                        MachineType mach,
                        DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                        MachineType mach,
                        DataBlock& data,
                        Cycles initialRequestTime,
                        Cycles forwardRequestTime,
                        Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                        MachineType mach,
                        DataBlock& data,
                        Cycles initialRequestTime,
                        Cycles forwardRequestTime,
                        Cycles firstResponseTime,
                        bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(m_readRequestTable.count(makeLineAddress(address)));

    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_readRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_LD) ||
           (request->m_type == RubyRequestType_IFETCH));

    hitCallback(request, mach, data, true,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

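// Service every packet coalesced onto this cache line: copy data to or from
// the returned data block for each packet, record the latency once for the
// line, and hand all packets back to their ports.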
void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
                       MachineType mach,
                       DataBlock& data,
                       bool success,
                       Cycles initialRequestTime,
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime,
                       bool isRegion)
{
    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = srequest->m_type;

    // Set this cache entry to the most recently used
    if (type == RubyRequestType_IFETCH) {
        if (m_instCache_ptr->isTagPresent(request_line_address))
            m_instCache_ptr->setMRU(request_line_address);
    } else {
        if (m_dataCache_ptr->isTagPresent(request_line_address))
            m_dataCache_ptr->setMRU(request_line_address);
    }

    recordMissLatency(srequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // Update the data. This must be done for each request in the coalescer.
    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(type == reqCoalescer[request_line_address][i].primaryType);
        request_address = pkt->getAddr();
        request_line_address = makeLineAddress(pkt->getAddr());
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING.  Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

bool
GPUCoalescer::empty() const
{
    return m_writeRequestTable.empty() && m_readRequestTable.empty();
}

// Analyzes the packet to see if this request can be coalesced.
// If the request can be coalesced, it is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued.
// If this is the first request to a cacheline, the request is added to both
// the newRequests queue and the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest.
            // If we are here then we didn't call
            // a virtual version of this function,
            // so we will also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    // If the number of outstanding requests is greater than the max allowed,
    // return RequestStatus_BufferFull. This logic can be extended to
    // support proper backpressure.
    if (m_outstanding_count >= m_max_outstanding_requests) {
        return RequestStatus_BufferFull;
    }

    RubyRequestType primary_type = RubyRequestType_NULL;
    RubyRequestType secondary_type = RubyRequestType_NULL;

    if (pkt->isLLSC()) {
        //
        // Alpha LL/SC instructions need to be handled carefully by the cache
        // coherence protocol to ensure they follow the proper semantics. In
        // particular, by identifying the operations as atomic, the protocol
        // should understand that migratory sharing optimizations should not
        // be performed (i.e. a load between the LL and SC should not steal
        // away exclusive permission).
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Store_Conditional;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Load_Linked;
        }
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->req->isLockedRMW()) {
        //
        // x86 locked instructions are translated to store cache coherence
        // requests because these requests should always be treated as read
        // exclusive operations and should leverage any migratory sharing
        // optimization built into the protocol.
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Locked_RMW_Write;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Locked_RMW_Read;
        }
        secondary_type = RubyRequestType_ST;
    } else if (pkt->isAtomicOp()) {
        //
        // GPU Atomic Operation
        //
        primary_type = RubyRequestType_ATOMIC;
        secondary_type = RubyRequestType_ATOMIC;
    } else {
        if (pkt->isRead()) {
            if (pkt->req->isInstFetch()) {
                primary_type = secondary_type = RubyRequestType_IFETCH;
            } else {
#if THE_ISA == X86_ISA
                uint32_t flags = pkt->req->getFlags();
                bool storeCheck = flags &
                        (TheISA::StoreCheck << TheISA::FlagShift);
#else
                bool storeCheck = false;
#endif // X86_ISA
                if (storeCheck) {
                    primary_type = RubyRequestType_RMW_Read;
                    secondary_type = RubyRequestType_ST;
                } else {
                    primary_type = secondary_type = RubyRequestType_LD;
                }
            }
        } else if (pkt->isWrite()) {
            //
            // Note: M5 packets do not differentiate ST from RMW_Write
            //
            primary_type = secondary_type = RubyRequestType_ST;
        } else if (pkt->isFlush()) {
            primary_type = secondary_type = RubyRequestType_FLUSH;
        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
            if (assumingRfOCoherence) {
                // If we reached here, this request must be a memFence.
                // Since the protocol implements RfO, the coalescer can
                // assume sequential consistency and schedule the callback
                // immediately.
                // Currently the code implements fence callbacks
                // by reusing the mechanism for kernel completions.
                // This should be fixed.
                int wf_id = 0;
                if (pkt->req->hasContextId()) {
                    wf_id = pkt->req->contextId();
                }
                insertKernel(wf_id, pkt);
                newKernelEnds.push_back(wf_id);
                if (!issueEvent.scheduled()) {
                    schedule(issueEvent, curTick());
                }
                return RequestStatus_Issued;
            } else {
                // If not RfO, return issued here and let the child coalescer
                // take care of it.
                return RequestStatus_Issued;
            }
        } else {
            panic("Unsupported ruby packet type\n");
        }
    }

    // Check if there is any pending request to this cache line from
    // previous cycles.
    // If there is a pending request, return aliased. Since coalescing
    // across time is not permitted, aliased requests are not coalesced.
    // If a request for this address has already been issued, we must block.
    RequestStatus status = getRequestStatus(pkt, primary_type);
    if (status != RequestStatus_Ready)
        return status;

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Check if this request can be coalesced with previous
    // requests from this cycle.
    if (!reqCoalescer.count(line_addr)) {
        // This is the first access to this cache line.
        // A new request to the memory subsystem has to be
        // made in the next cycle for this cache line, so
        // add this line addr to the "newRequests" queue.
        newRequests.push_back(line_addr);

    // There was a request to this cache line in this cycle;
    // let us see if we can coalesce this request with the previous
    // requests from this cycle.
    } else if (primary_type !=
               reqCoalescer[line_addr][0].primaryType) {
        // can't coalesce loads, stores and atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->isLockedRMW() ||
               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
        // can't coalesce locked accesses, but can coalesce atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
               pkt->req->contextId() !=
               reqCoalescer[line_addr][0].pkt->req->contextId()) {
        // can't coalesce releases from different wavefronts
        return RequestStatus_Aliased;
    }

    // In addition to the packet, we need to save both request types.
    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}

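// Build a single RubyRequest for this cache line, aggregating the byte
// access mask, write data, and atomic operations of every coalesced packet,
// and enqueue it on the mandatory queue.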
void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{
    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment, setting scopes only counts for GPU spill space
    // accesses (i.e., pkt->req->isStack()); this scope is REPLACE since
    // it does not need to be flushed at the end of a kernel. Private and
    // local may need to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Create a WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes.
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = reqCoalescer[line_addr].size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                        tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock, atomicOps,
                              accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock,
                              accessScope, accessSegment);
    }
    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));

    fatal_if(secondary_type == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    Tick latency = cyclesToTicks(
                        m_controller->mandatoryQueueLatency(secondary_type));
    assert(latency > 0);

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}

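// Pretty-printer for the request tables, used by GPUCoalescer::print().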
template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << ", read request table: " << m_readRequestTable
        << ", write request table: " << m_writeRequestTable
        << "]";
}

// This can be called from setState whenever coherence permissions are
// upgraded. When invoked, coherence violations will be checked for the
// given block.
void
GPUCoalescer::checkCoherence(Addr addr)
{
}

void
GPUCoalescer::recordRequestType(SequencerRequestType requestType)
{
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

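// Scheduled by makeRequest(): issue one Ruby request per cache line touched
// this cycle, then fire any pending kernel-end callbacks.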
void
GPUCoalescer::completeIssue()
{
    // newRequests has the cacheline addresses of all the
    // requests which need to be issued to the memory subsystem
    // in this cycle.
    int len = newRequests.size();
    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
    for (int i = 0; i < len; ++i) {
        // Get the requests from the reqCoalescer table. Get only the
        // first request for each cacheline; the remaining requests
        // can be coalesced with the first request, so only
        // one request is issued per cacheline.
        RequestDesc info = reqCoalescer[newRequests[i]][0];
        PacketPtr pkt = info.pkt;
        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
                i, pkt->req->getPaddr());
        // Insert this request into the read/writeRequestTables. These tables
        // are used to track aliased requests in the makeRequest subroutine.
        bool found = insertRequest(pkt, info.primaryType);

        if (found) {
            panic("GPUCoalescer::makeRequest should never be called if the "
                  "request is already outstanding\n");
        }

        // Issue request to ruby subsystem
        issueRequest(pkt, info.secondaryType);
    }
    newRequests.clear();

    // Have any kernel-end releases been issued this cycle?
    len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

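// Completion path for atomic requests: similar to writeCallback, except
// that atomics do not update the local cache and, for returning atomics,
// the data observed before the atomic operation is copied back into each
// coalesced packet.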
void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* srequest = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));

    // Atomics don't write to cache, so there is no MRU update...

    recordMissLatency(srequest, mach,
                      srequest->issue_time, Cycles(0), Cycles(0), true, false);

    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(pkt->getAddr());

    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
        assert(srequest->m_type ==
               reqCoalescer[request_line_address][i].primaryType);
        request_address = (pkt->getAddr());
        request_line_address = makeLineAddress(request_address);
        if (pkt->getPtr<uint8_t>() &&
            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
            // Atomics are performed in memory and return the data observed
            // *before* the atomic op was applied.
            pkt->setData(
                data.getData(getOffset(request_address), pkt->getSize()));
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING.  Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(srequest->m_type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

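// Classify where a CP load was serviced based on the responding machine
// (CP is assumed here to refer to the command processor path of the model).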
void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

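// Final step of a completed access: restore each packet's original sender
// state, hand the packet back to its port, and retry any stalled requests.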
void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
{
    for (int i = 0; i < len; ++i) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        mylist[i]->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(mylist[i]);
        trySendRetries();
    }

    testDrainComplete();
}

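// Look up the packet belonging to the outstanding read request at this
// address.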
PacketPtr
GPUCoalescer::mapAddrToPkt(Addr address)
{
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;
    return request->pkt;
}

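// Update hit/miss and latency statistics for a completed request, including
// the per-machine latency breakdown when the intermediate timestamps are
// monotonically ordered.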
void
GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = srequest->m_type;
    Cycles issued_time = srequest->issue_time;
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(srequest->pkt->getAddr()), total_lat);
}

void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}