GPUCoalescer.cc revision 11689:9d19bb965564
1/*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Sooraj Puthoor
34 */
35
36#include "base/misc.hh"
37#include "base/str.hh"
38#include "config/the_isa.hh"
39
40#if THE_ISA == X86_ISA
41#include "arch/x86/insts/microldstop.hh"
42
43#endif // X86_ISA
44#include "mem/ruby/system/GPUCoalescer.hh"
45
46#include "cpu/testers/rubytest/RubyTester.hh"
47#include "debug/GPUCoalescer.hh"
48#include "debug/MemoryAccess.hh"
49#include "debug/ProtocolTrace.hh"
50#include "debug/RubyPort.hh"
51#include "debug/RubyStats.hh"
52#include "gpu-compute/shader.hh"
53#include "mem/packet.hh"
54#include "mem/ruby/common/DataBlock.hh"
55#include "mem/ruby/common/SubBlock.hh"
56#include "mem/ruby/network/MessageBuffer.hh"
57#include "mem/ruby/profiler/Profiler.hh"
58#include "mem/ruby/slicc_interface/AbstractController.hh"
59#include "mem/ruby/slicc_interface/RubyRequest.hh"
60#include "mem/ruby/structures/CacheMemory.hh"
61#include "mem/ruby/system/RubySystem.hh"
62#include "params/RubyGPUCoalescer.hh"
63
64using namespace std;
65
66GPUCoalescer *
67RubyGPUCoalescerParams::create()
68{
69    return new GPUCoalescer(this);
70}
71
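// Map the synchronization scope flags on a request to the HSA scope
// used by the Ruby protocol; unscoped requests map to
// HSAScope_UNSPECIFIED.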
72HSAScope
73reqScopeToHSAScope(Request* req)
74{
75    HSAScope accessScope = HSAScope_UNSPECIFIED;
76    if (req->isScoped()) {
77        if (req->isWavefrontScope()) {
78            accessScope = HSAScope_WAVEFRONT;
79        } else if (req->isWorkgroupScope()) {
80            accessScope = HSAScope_WORKGROUP;
81        } else if (req->isDeviceScope()) {
82            accessScope = HSAScope_DEVICE;
83        } else if (req->isSystemScope()) {
84            accessScope = HSAScope_SYSTEM;
85        } else {
86            fatal("Bad scope type");
87        }
88    }
89    return accessScope;
90}
91
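// Map the memory segment flags on a request to the corresponding HSA
// segment; a request with an unrecognized segment is a fatal error.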
92HSASegment
93reqSegmentToHSASegment(Request* req)
94{
95    HSASegment accessSegment = HSASegment_GLOBAL;
96
97    if (req->isGlobalSegment()) {
98        accessSegment = HSASegment_GLOBAL;
99    } else if (req->isGroupSegment()) {
100        accessSegment = HSASegment_GROUP;
101    } else if (req->isPrivateSegment()) {
102        accessSegment = HSASegment_PRIVATE;
103    } else if (req->isKernargSegment()) {
104        accessSegment = HSASegment_KERNARG;
105    } else if (req->isReadonlySegment()) {
106        accessSegment = HSASegment_READONLY;
107    } else if (req->isSpillSegment()) {
108        accessSegment = HSASegment_SPILL;
109    } else if (req->isArgSegment()) {
110        accessSegment = HSASegment_ARG;
111    } else {
112        fatal("Bad segment type");
113    }
114
115    return accessSegment;
116}
117
118GPUCoalescer::GPUCoalescer(const Params *p)
119    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
120{
121    m_store_waiting_on_load_cycles = 0;
122    m_store_waiting_on_store_cycles = 0;
123    m_load_waiting_on_store_cycles = 0;
124    m_load_waiting_on_load_cycles = 0;
125
126    m_outstanding_count = 0;
127
128    m_max_outstanding_requests = 0;
129    m_deadlock_threshold = 0;
130    m_instCache_ptr = nullptr;
131    m_dataCache_ptr = nullptr;
132
133    m_instCache_ptr = p->icache;
134    m_dataCache_ptr = p->dcache;
135    m_max_outstanding_requests = p->max_outstanding_requests;
136    m_deadlock_threshold = p->deadlock_threshold;
137
138    assert(m_max_outstanding_requests > 0);
139    assert(m_deadlock_threshold > 0);
140    assert(m_instCache_ptr);
141    assert(m_dataCache_ptr);
142
143    m_data_cache_hit_latency = p->dcache_hit_latency;
144
145    m_runningGarnetStandalone = p->garnet_standalone;
146    assumingRfOCoherence = p->assume_rfo;
147}
148
149GPUCoalescer::~GPUCoalescer()
150{
151}
152
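// Deadlock check: panic if any outstanding read or write request has
// been pending for at least m_deadlock_threshold cycles; otherwise,
// keep re-scheduling the check while requests remain outstanding.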
153void
154GPUCoalescer::wakeup()
155{
156    // Check for deadlock of any of the requests
157    Cycles current_time = curCycle();
158
159    // Check across all outstanding requests
160    int total_outstanding = 0;
161
162    RequestTable::iterator read = m_readRequestTable.begin();
163    RequestTable::iterator read_end = m_readRequestTable.end();
164    for (; read != read_end; ++read) {
165        GPUCoalescerRequest* request = read->second;
166        if (current_time - request->issue_time < m_deadlock_threshold)
167            continue;
168
169        panic("Possible Deadlock detected. Aborting!\n"
170             "version: %d request.paddr: 0x%x m_readRequestTable: %d "
171             "current time: %u issue_time: %d difference: %d\n", m_version,
172              request->pkt->getAddr(), m_readRequestTable.size(),
173              current_time * clockPeriod(), request->issue_time * clockPeriod(),
174              (current_time - request->issue_time)*clockPeriod());
175    }
176
177    RequestTable::iterator write = m_writeRequestTable.begin();
178    RequestTable::iterator write_end = m_writeRequestTable.end();
179    for (; write != write_end; ++write) {
180        GPUCoalescerRequest* request = write->second;
181        if (current_time - request->issue_time < m_deadlock_threshold)
182            continue;
183
184        panic("Possible Deadlock detected. Aborting!\n"
185             "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
186             "current time: %u issue_time: %d difference: %d\n", m_version,
187              request->pkt->getAddr(), m_writeRequestTable.size(),
188              current_time * clockPeriod(), request->issue_time * clockPeriod(),
189              (current_time - request->issue_time) * clockPeriod());
190    }
191
192    total_outstanding += m_writeRequestTable.size();
193    total_outstanding += m_readRequestTable.size();
194
195    assert(m_outstanding_count == total_outstanding);
196
197    if (m_outstanding_count > 0) {
198        // If there are still outstanding requests, keep checking
199        schedule(deadlockCheckEvent,
200                 m_deadlock_threshold * clockPeriod() +
201                 curTick());
202    }
203}
204
205void
206GPUCoalescer::resetStats()
207{
208    m_latencyHist.reset();
209    m_missLatencyHist.reset();
210    for (int i = 0; i < RubyRequestType_NUM; i++) {
211        m_typeLatencyHist[i]->reset();
212        m_missTypeLatencyHist[i]->reset();
213        for (int j = 0; j < MachineType_NUM; j++) {
214            m_missTypeMachLatencyHist[i][j]->reset();
215        }
216    }
217
218    for (int i = 0; i < MachineType_NUM; i++) {
219        m_missMachLatencyHist[i]->reset();
220
221        m_IssueToInitialDelayHist[i]->reset();
222        m_InitialToForwardDelayHist[i]->reset();
223        m_ForwardToFirstResponseDelayHist[i]->reset();
224        m_FirstResponseToCompletionDelayHist[i]->reset();
225    }
226}
227
228void
229GPUCoalescer::printProgress(ostream& out) const
230{
231}
232
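// Determine whether a request can be accepted right now. Returns
// BufferFull if the mandatory queue has no free slot, Aliased if the
// target line is blocked or already has a conflicting outstanding
// request, and Ready otherwise.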
233RequestStatus
234GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
235{
236    Addr line_addr = makeLineAddress(pkt->getAddr());
237
238    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
239        return RequestStatus_BufferFull;
240    }
241
242    if (m_controller->isBlocked(line_addr) &&
243       request_type != RubyRequestType_Locked_RMW_Write) {
244        return RequestStatus_Aliased;
245    }
246
247    if ((request_type == RubyRequestType_ST) ||
248        (request_type == RubyRequestType_ATOMIC) ||
249        (request_type == RubyRequestType_ATOMIC_RETURN) ||
250        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
251        (request_type == RubyRequestType_RMW_Read) ||
252        (request_type == RubyRequestType_RMW_Write) ||
253        (request_type == RubyRequestType_Load_Linked) ||
254        (request_type == RubyRequestType_Store_Conditional) ||
255        (request_type == RubyRequestType_Locked_RMW_Read) ||
256        (request_type == RubyRequestType_Locked_RMW_Write) ||
257        (request_type == RubyRequestType_FLUSH)) {
258
259        // Check if there is any outstanding read request for the same
260        // cache line.
261        if (m_readRequestTable.count(line_addr) > 0) {
262            m_store_waiting_on_load_cycles++;
263            return RequestStatus_Aliased;
264        }
265
266        if (m_writeRequestTable.count(line_addr) > 0) {
267            // There is an outstanding write request for the cache line
268            m_store_waiting_on_store_cycles++;
269            return RequestStatus_Aliased;
270        }
271    } else {
272        // Check if there is any outstanding write request for the same
273        // cache line.
274        if (m_writeRequestTable.count(line_addr) > 0) {
275            m_load_waiting_on_store_cycles++;
276            return RequestStatus_Aliased;
277        }
278
279        if (m_readRequestTable.count(line_addr) > 0) {
280            // There is an outstanding read request for the cache line
281            m_load_waiting_on_load_cycles++;
282            return RequestStatus_Aliased;
283        }
284    }
285
286    return RequestStatus_Ready;
287
288}
289
290
291
292// Record a kernel-end packet for the given wavefront in kernelEndList.
293void
294GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
295{
296    // A duplicate entry for this wavefront is not expected, but
297    // assert on it here rather than risk a hard-to-debug simulator
298    // hang later.
299    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
300    assert(kernelEndList.count(wavefront_id) == 0);
301
302    kernelEndList[wavefront_id] = pkt;
303    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
304            kernelEndList.size());
305}
306
307
308// Insert the request on the correct request table.  Return true if
309// the entry was already present.
310bool
311GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
312{
313    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
314           pkt->req->isLockedRMW() ||
315           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
316
317    int total_outstanding M5_VAR_USED =
318        m_writeRequestTable.size() + m_readRequestTable.size();
319
320    assert(m_outstanding_count == total_outstanding);
321
322    // See if we should schedule a deadlock check
323    if (!deadlockCheckEvent.scheduled()) {
324        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() + curTick());
325    }
326
327    Addr line_addr = makeLineAddress(pkt->getAddr());
328    if ((request_type == RubyRequestType_ST) ||
329        (request_type == RubyRequestType_ATOMIC) ||
330        (request_type == RubyRequestType_ATOMIC_RETURN) ||
331        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
332        (request_type == RubyRequestType_RMW_Read) ||
333        (request_type == RubyRequestType_RMW_Write) ||
334        (request_type == RubyRequestType_Load_Linked) ||
335        (request_type == RubyRequestType_Store_Conditional) ||
336        (request_type == RubyRequestType_Locked_RMW_Read) ||
337        (request_type == RubyRequestType_Locked_RMW_Write) ||
338        (request_type == RubyRequestType_FLUSH)) {
339
340        pair<RequestTable::iterator, bool> r =
341          m_writeRequestTable.insert(RequestTable::value_type(line_addr,
342                                       (GPUCoalescerRequest*) NULL));
343        if (r.second) {
344            RequestTable::iterator i = r.first;
345            i->second = new GPUCoalescerRequest(pkt, request_type,
346                                                curCycle());
347            DPRINTF(GPUCoalescer,
348                    "Inserting write request for paddr %#x for type %d\n",
349                    pkt->req->getPaddr(), i->second->m_type);
350            m_outstanding_count++;
351        } else {
352            return true;
353        }
354    } else {
355        pair<RequestTable::iterator, bool> r =
356            m_readRequestTable.insert(RequestTable::value_type(line_addr,
357                                        (GPUCoalescerRequest*) NULL));
358
359        if (r.second) {
360            RequestTable::iterator i = r.first;
361            i->second = new GPUCoalescerRequest(pkt, request_type,
362                                             curCycle());
363            DPRINTF(GPUCoalescer,
364                    "Inserting read request for paddr %#x for type %d\n",
365                    pkt->req->getPaddr(), i->second->m_type);
366            m_outstanding_count++;
367        } else {
368            return true;
369        }
370    }
371
372    m_outstandReqHist.sample(m_outstanding_count);
373
374    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
375    assert(m_outstanding_count == total_outstanding);
376
377    return false;
378}
379
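// Account for an entry that has just been erased from one of the
// request tables.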
380void
381GPUCoalescer::markRemoved()
382{
383    m_outstanding_count--;
384    assert(m_outstanding_count ==
385           m_writeRequestTable.size() + m_readRequestTable.size());
386}
387
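// Erase a completed request from the read or write request table,
// depending on its type, and update the outstanding count.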
388void
389GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
390{
391    assert(m_outstanding_count ==
392           m_writeRequestTable.size() + m_readRequestTable.size());
393
394    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
395    if ((srequest->m_type == RubyRequestType_ST) ||
396        (srequest->m_type == RubyRequestType_RMW_Read) ||
397        (srequest->m_type == RubyRequestType_RMW_Write) ||
398        (srequest->m_type == RubyRequestType_Load_Linked) ||
399        (srequest->m_type == RubyRequestType_Store_Conditional) ||
400        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
401        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
402        m_writeRequestTable.erase(line_addr);
403    } else {
404        m_readRequestTable.erase(line_addr);
405    }
406
407    markRemoved();
408}
409
410bool
411GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
412{
413    //
414    // The success flag indicates whether the LLSC operation was successful.
415    // LL ops will always succeed, but SC may fail if the cache line is no
416    // longer locked.
417    //
418    bool success = true;
419    if (request->m_type == RubyRequestType_Store_Conditional) {
420        if (!m_dataCache_ptr->isLocked(address, m_version)) {
421            //
422            // For failed SC requests, indicate the failure to the cpu by
423            // setting the extra data to zero.
424            //
425            request->pkt->req->setExtraData(0);
426            success = false;
427        } else {
428            //
429            // For successful SC requests, indicate the success to the cpu by
430            // setting the extra data to one.
431            //
432            request->pkt->req->setExtraData(1);
433        }
434        //
435        // Independent of success, all SC operations must clear the lock
436        //
437        m_dataCache_ptr->clearLocked(address);
438    } else if (request->m_type == RubyRequestType_Load_Linked) {
439        //
440        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
441        // previously locked cache lines?
442        //
443        m_dataCache_ptr->setLocked(address, m_version);
444    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
445               (m_dataCache_ptr->isLocked(address, m_version))) {
446        //
447        // Normal writes should clear the locked address
448        //
449        m_dataCache_ptr->clearLocked(address);
450    }
451    return success;
452}
453
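// Write completion callbacks invoked by the cache controller. The
// shorter overloads fill in default machine type and timing values and
// funnel into the full version below.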
454void
455GPUCoalescer::writeCallback(Addr address, DataBlock& data)
456{
457    writeCallback(address, MachineType_NULL, data);
458}
459
460void
461GPUCoalescer::writeCallback(Addr address,
462                         MachineType mach,
463                         DataBlock& data)
464{
465    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
466}
467
468void
469GPUCoalescer::writeCallback(Addr address,
470                         MachineType mach,
471                         DataBlock& data,
472                         Cycles initialRequestTime,
473                         Cycles forwardRequestTime,
474                         Cycles firstResponseTime)
475{
476    writeCallback(address, mach, data,
477                  initialRequestTime, forwardRequestTime, firstResponseTime,
478                  false);
479}
480
481void
482GPUCoalescer::writeCallback(Addr address,
483                         MachineType mach,
484                         DataBlock& data,
485                         Cycles initialRequestTime,
486                         Cycles forwardRequestTime,
487                         Cycles firstResponseTime,
488                         bool isRegion)
489{
490    assert(address == makeLineAddress(address));
491
492    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
493    assert(m_writeRequestTable.count(makeLineAddress(address)));
494
495    RequestTable::iterator i = m_writeRequestTable.find(address);
496    assert(i != m_writeRequestTable.end());
497    GPUCoalescerRequest* request = i->second;
498
499    m_writeRequestTable.erase(i);
500    markRemoved();
501
502    assert((request->m_type == RubyRequestType_ST) ||
503           (request->m_type == RubyRequestType_ATOMIC) ||
504           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
505           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
506           (request->m_type == RubyRequestType_RMW_Read) ||
507           (request->m_type == RubyRequestType_RMW_Write) ||
508           (request->m_type == RubyRequestType_Load_Linked) ||
509           (request->m_type == RubyRequestType_Store_Conditional) ||
510           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
511           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
512           (request->m_type == RubyRequestType_FLUSH));
513
514
515    //
516    // For Alpha, properly handle LL, SC, and write requests with respect to
517    // locked cache blocks.
518    //
519    // Not valid for the Garnet_standalone protocol.
520    //
521    bool success = true;
522    if (!m_runningGarnetStandalone)
523        success = handleLlsc(address, request);
524
525    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
526        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
527    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
528        m_controller->unblock(address);
529    }
530
531    hitCallback(request, mach, data, success,
532                request->issue_time, forwardRequestTime, firstResponseTime,
533                isRegion);
534}
535
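// Read completion callbacks, mirroring the writeCallback overloads
// above.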
536void
537GPUCoalescer::readCallback(Addr address, DataBlock& data)
538{
539    readCallback(address, MachineType_NULL, data);
540}
541
542void
543GPUCoalescer::readCallback(Addr address,
544                        MachineType mach,
545                        DataBlock& data)
546{
547    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
548}
549
550void
551GPUCoalescer::readCallback(Addr address,
552                        MachineType mach,
553                        DataBlock& data,
554                        Cycles initialRequestTime,
555                        Cycles forwardRequestTime,
556                        Cycles firstResponseTime)
557{
558
559    readCallback(address, mach, data,
560                 initialRequestTime, forwardRequestTime, firstResponseTime,
561                 false);
562}
563
564void
565GPUCoalescer::readCallback(Addr address,
566                        MachineType mach,
567                        DataBlock& data,
568                        Cycles initialRequestTime,
569                        Cycles forwardRequestTime,
570                        Cycles firstResponseTime,
571                        bool isRegion)
572{
573    assert(address == makeLineAddress(address));
574    assert(m_readRequestTable.count(makeLineAddress(address)));
575
576    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
577    RequestTable::iterator i = m_readRequestTable.find(address);
578    assert(i != m_readRequestTable.end());
579    GPUCoalescerRequest* request = i->second;
580
581    m_readRequestTable.erase(i);
582    markRemoved();
583
584    assert((request->m_type == RubyRequestType_LD) ||
585           (request->m_type == RubyRequestType_IFETCH));
586
587    hitCallback(request, mach, data, true,
588                request->issue_time, forwardRequestTime, firstResponseTime,
589                isRegion);
590}
591
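// Common completion path for coalesced reads and writes: touch the
// cache MRU state, record latency statistics, copy data to or from
// every packet coalesced onto this line, and return the packets to
// their ports.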
592void
593GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
594                       MachineType mach,
595                       DataBlock& data,
596                       bool success,
597                       Cycles initialRequestTime,
598                       Cycles forwardRequestTime,
599                       Cycles firstResponseTime,
600                       bool isRegion)
601{
602    PacketPtr pkt = srequest->pkt;
603    Addr request_address = pkt->getAddr();
604    Addr request_line_address = makeLineAddress(request_address);
605
606    RubyRequestType type = srequest->m_type;
607
608    // Set this cache entry to the most recently used
609    if (type == RubyRequestType_IFETCH) {
610        if (m_instCache_ptr->isTagPresent(request_line_address))
611            m_instCache_ptr->setMRU(request_line_address);
612    } else {
613        if (m_dataCache_ptr->isTagPresent(request_line_address))
614            m_dataCache_ptr->setMRU(request_line_address);
615    }
616
617    recordMissLatency(srequest, mach,
618                      initialRequestTime,
619                      forwardRequestTime,
620                      firstResponseTime,
621                      success, isRegion);
622    // Update the data.
623    //
624    // This must be done for each request in the coalescer entry.
625    int len = reqCoalescer[request_line_address].size();
626    std::vector<PacketPtr> mylist;
627    for (int i = 0; i < len; ++i) {
628        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
629        assert(type == reqCoalescer[request_line_address][i].primaryType);
630        request_address = pkt->getAddr();
631        request_line_address = makeLineAddress(pkt->getAddr());
632        if (pkt->getPtr<uint8_t>()) {
633            if ((type == RubyRequestType_LD) ||
634                (type == RubyRequestType_ATOMIC) ||
635                (type == RubyRequestType_ATOMIC_RETURN) ||
636                (type == RubyRequestType_IFETCH) ||
637                (type == RubyRequestType_RMW_Read) ||
638                (type == RubyRequestType_Locked_RMW_Read) ||
639                (type == RubyRequestType_Load_Linked)) {
640                memcpy(pkt->getPtr<uint8_t>(),
641                       data.getData(getOffset(request_address),
642                                    pkt->getSize()),
643                       pkt->getSize());
644            } else {
645                data.setData(pkt->getPtr<uint8_t>(),
646                             getOffset(request_address), pkt->getSize());
647            }
648        } else {
649            DPRINTF(MemoryAccess,
650                    "WARNING.  Data not transferred from Ruby to M5 for type " \
651                    "%s\n",
652                    RubyRequestType_to_string(type));
653        }
654
655        // If using the RubyTester, update the RubyTester sender state's
656        // subBlock with the received data.  The tester will later access
657        // this state.
658        // Note: RubyPort will access its sender state before the
659        // RubyTester.
660        if (m_usingRubyTester) {
661            RubyPort::SenderState *requestSenderState =
662                safe_cast<RubyPort::SenderState*>(pkt->senderState);
663            RubyTester::SenderState* testerSenderState =
664                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
665            testerSenderState->subBlock.mergeFrom(data);
666        }
667
668        mylist.push_back(pkt);
669    }
670    delete srequest;
671    reqCoalescer.erase(request_line_address);
672    assert(!reqCoalescer.count(request_line_address));
673
674
675
676    completeHitCallback(mylist, len);
677}
678
679bool
680GPUCoalescer::empty() const
681{
682    return m_writeRequestTable.empty() && m_readRequestTable.empty();
683}
684
685// Analyzes the packet to see if this request can be coalesced.
686// If the request can be coalesced, it is added to the reqCoalescer table
687// and makeRequest returns RequestStatus_Issued.
688// If this is the first request to a cacheline, the request is added to
689// both the newRequests queue and the reqCoalescer table; makeRequest
690// returns RequestStatus_Issued.
691// If there is a pending request to this cacheline and this request
692// can't be coalesced, RequestStatus_Aliased is returned and
693// the packet needs to be reissued.
694RequestStatus
695GPUCoalescer::makeRequest(PacketPtr pkt)
696{
697    // Check for GPU Barrier Kernel End or Kernel Begin
698    // Leave these to be handled by the child class
699    // Kernel End/Barrier = isFlush + isRelease
700    // Kernel Begin = isFlush + isAcquire
701    if (pkt->req->isKernel()) {
702        if (pkt->req->isAcquire()) {
703            // This is a Kernel Begin; leave handling to the
704            // virtual xCoalescer::makeRequest.
705            return RequestStatus_Issued;
706        } else if (pkt->req->isRelease()) {
707            // This is a Kernel End; leave handling to the
708            // virtual xCoalescer::makeRequest.
709            // If we are here, we did not call a virtual
710            // version of this function, so we will also
711            // schedule the callback here.
712            int wf_id = 0;
713            if (pkt->req->hasContextId()) {
714                wf_id = pkt->req->contextId();
715            }
716            insertKernel(wf_id, pkt);
717            newKernelEnds.push_back(wf_id);
718            if (!issueEvent.scheduled()) {
719                schedule(issueEvent, curTick());
720            }
721            return RequestStatus_Issued;
722        }
723    }
724
725    // If the number of outstanding requests is greater than the max
726    // allowed, return RequestStatus_BufferFull. This logic can be
727    // extended to support proper backpressure.
728    if (m_outstanding_count >= m_max_outstanding_requests) {
729        return RequestStatus_BufferFull;
730    }
731
732    RubyRequestType primary_type = RubyRequestType_NULL;
733    RubyRequestType secondary_type = RubyRequestType_NULL;
734
735    if (pkt->isLLSC()) {
736        //
737        // Alpha LL/SC instructions need to be handled carefully by the cache
738        // coherence protocol to ensure they follow the proper semantics. In
739        // particular, by identifying the operations as atomic, the protocol
740        // should understand that migratory sharing optimizations should not
741        // be performed (i.e. a load between the LL and SC should not steal
742        // away exclusive permission).
743        //
744        if (pkt->isWrite()) {
745            primary_type = RubyRequestType_Store_Conditional;
746        } else {
747            assert(pkt->isRead());
748            primary_type = RubyRequestType_Load_Linked;
749        }
750        secondary_type = RubyRequestType_ATOMIC;
751    } else if (pkt->req->isLockedRMW()) {
752        //
753        // x86 locked instructions are translated to store cache coherence
754        // requests because these requests should always be treated as read
755        // exclusive operations and should leverage any migratory sharing
756        // optimization built into the protocol.
757        //
758        if (pkt->isWrite()) {
759            primary_type = RubyRequestType_Locked_RMW_Write;
760        } else {
761            assert(pkt->isRead());
762            primary_type = RubyRequestType_Locked_RMW_Read;
763        }
764        secondary_type = RubyRequestType_ST;
765    } else if (pkt->isAtomicOp()) {
766        //
767        // GPU Atomic Operation
768        //
769        primary_type = RubyRequestType_ATOMIC;
770        secondary_type = RubyRequestType_ATOMIC;
771    } else {
772        if (pkt->isRead()) {
773            if (pkt->req->isInstFetch()) {
774                primary_type = secondary_type = RubyRequestType_IFETCH;
775            } else {
776#if THE_ISA == X86_ISA
777                uint32_t flags = pkt->req->getFlags();
778                bool storeCheck = flags &
779                        (TheISA::StoreCheck << TheISA::FlagShift);
780#else
781                bool storeCheck = false;
782#endif // X86_ISA
783                if (storeCheck) {
784                    primary_type = RubyRequestType_RMW_Read;
785                    secondary_type = RubyRequestType_ST;
786                } else {
787                    primary_type = secondary_type = RubyRequestType_LD;
788                }
789            }
790        } else if (pkt->isWrite()) {
791            //
792            // Note: M5 packets do not differentiate ST from RMW_Write
793            //
794            primary_type = secondary_type = RubyRequestType_ST;
795        } else if (pkt->isFlush()) {
796            primary_type = secondary_type = RubyRequestType_FLUSH;
797        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
798            if (assumingRfOCoherence) {
799                // If we reached here, this request must be a memFence,
800                // and since the protocol implements RfO, the coalescer
801                // can assume sequential consistency and schedule the
802                // callback immediately.
803                // Currently the code implements fence callbacks
804                // by reusing the mechanism for kernel completions.
805                // This should be fixed.
806                int wf_id = 0;
807                if (pkt->req->hasContextId()) {
808                    wf_id = pkt->req->contextId();
809                }
810                insertKernel(wf_id, pkt);
811                newKernelEnds.push_back(wf_id);
812                if (!issueEvent.scheduled()) {
813                    schedule(issueEvent, curTick());
814                }
815                return RequestStatus_Issued;
816            } else {
817                // If not RfO, return issued here and let the child coalescer
818                // take care of it.
819                return RequestStatus_Issued;
820            }
821        } else {
822            panic("Unsupported ruby packet type\n");
823        }
824    }
825
826    // Check if there is any pending request to this cache line from
827    // previous cycles.
828    // If there is a pending request, return aliased. Since coalescing
829    // across time is not permitted, aliased requests are not coalesced.
830    // If a request for this address has already been issued, we must block
831    RequestStatus status = getRequestStatus(pkt, primary_type);
832    if (status != RequestStatus_Ready)
833        return status;
834
835    Addr line_addr = makeLineAddress(pkt->getAddr());
836
837    // Check if this request can be coalesced with previous
838    // requests from this cycle.
839    if (!reqCoalescer.count(line_addr)) {
840        // This is the first access to this cache line.
841        // A new request to the memory subsystem has to be
842        // made in the next cycle for this cache line, so
843        // add this line addr to the "newRequests" queue
844        newRequests.push_back(line_addr);
845
846    // There was a request to this cache line in this cycle,
847    // let us see if we can coalesce this request with the previous
848    // requests from this cycle
849    } else if (primary_type !=
850               reqCoalescer[line_addr][0].primaryType) {
851        // can't coalesce loads, stores and atomics!
852        return RequestStatus_Aliased;
853    } else if (pkt->req->isLockedRMW() ||
854               reqCoalescer[line_addr][0].pkt->req->isLockedRMW()) {
855        // can't coalesce locked accesses, but can coalesce atomics!
856        return RequestStatus_Aliased;
857    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
858               pkt->req->contextId() !=
859               reqCoalescer[line_addr][0].pkt->req->contextId()) {
860        // can't coalesce releases from different wavefronts
861        return RequestStatus_Aliased;
862    }
863
864    // in addition to the packet, we need to save both request types
865    reqCoalescer[line_addr].emplace_back(pkt, primary_type, secondary_type);
866    if (!issueEvent.scheduled())
867        schedule(issueEvent, curTick());
868    // TODO: issue hardware prefetches here
869    return RequestStatus_Issued;
870}
871
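// Build a single RubyRequest for a coalesced cache line: collect the
// byte access mask, write data, and atomic operations of every packet
// coalesced onto the line, then enqueue the request on the mandatory
// queue with the data cache hit latency.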
872void
873GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
874{
875
876    int proc_id = -1;
877    if (pkt != NULL && pkt->req->hasContextId()) {
878        proc_id = pkt->req->contextId();
879    }
880
881    // If valid, copy the pc to the ruby request
882    Addr pc = 0;
883    if (pkt->req->hasPC()) {
884        pc = pkt->req->getPC();
885    }
886
887    // At the moment, setting scopes only counts for GPU
888    // spill space accesses, i.e., requests for which
889    // pkt->req->isStack() is true. That scope is
890    // REPLACE, since spill space does not need to be
891    // flushed at the end of a kernel; private and
892    // local segments may need to be visible at the
893    // end of the kernel.
894    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
895    HSAScope accessScope = reqScopeToHSAScope(pkt->req);
896
897    Addr line_addr = makeLineAddress(pkt->getAddr());
898
899    // Creating WriteMask that records written bytes
900    // and atomic operations. This enables partial writes
901    // and partial reads of those writes
902    DataBlock dataBlock;
903    dataBlock.clear();
904    uint32_t blockSize = RubySystem::getBlockSizeBytes();
905    std::vector<bool> accessMask(blockSize,false);
906    std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
907    uint32_t tableSize = reqCoalescer[line_addr].size();
908    for (int i = 0; i < tableSize; i++) {
909        PacketPtr tmpPkt = reqCoalescer[line_addr][i].pkt;
910        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
911        uint32_t tmpSize = tmpPkt->getSize();
912        if (tmpPkt->isAtomicOp()) {
913            std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
914                                                        tmpPkt->getAtomicOp());
915            atomicOps.push_back(tmpAtomicOp);
916        } else if (tmpPkt->isWrite()) {
917            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
918                              tmpOffset, tmpSize);
919        }
920        for (int j = 0; j < tmpSize; j++) {
921            accessMask[tmpOffset + j] = true;
922        }
923    }
924    std::shared_ptr<RubyRequest> msg;
925    if (pkt->isAtomicOp()) {
926        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
927                              pkt->getPtr<uint8_t>(),
928                              pkt->getSize(), pc, secondary_type,
929                              RubyAccessMode_Supervisor, pkt,
930                              PrefetchBit_No, proc_id, 100,
931                              blockSize, accessMask,
932                              dataBlock, atomicOps,
933                              accessScope, accessSegment);
934    } else {
935        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
936                              pkt->getPtr<uint8_t>(),
937                              pkt->getSize(), pc, secondary_type,
938                              RubyAccessMode_Supervisor, pkt,
939                              PrefetchBit_No, proc_id, 100,
940                              blockSize, accessMask,
941                              dataBlock,
942                              accessScope, accessSegment);
943    }
944    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
945             curTick(), m_version, "Coal", "Begin", "", "",
946             printAddress(msg->getPhysicalAddress()),
947             RubyRequestType_to_string(secondary_type));
948
949    fatal_if(secondary_type == RubyRequestType_IFETCH,
950             "there should not be any I-Fetch requests in the GPU Coalescer");
951
952    // Send the message to the cache controller
953    fatal_if(m_data_cache_hit_latency == 0,
954             "should not have a latency of zero");
955
956    assert(m_mandatory_q_ptr);
957    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
958}
959
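// Debug helper used by GPUCoalescer::print to dump the request tables.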
960template <class KEY, class VALUE>
961std::ostream &
962operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
963{
964    out << "[";
965    for (auto i = map.begin(); i != map.end(); ++i)
966        out << " " << i->first << "=" << i->second;
967    out << " ]";
968
969    return out;
970}
971
972void
973GPUCoalescer::print(ostream& out) const
974{
975    out << "[GPUCoalescer: " << m_version
976        << ", outstanding requests: " << m_outstanding_count
977        << ", read request table: " << m_readRequestTable
978        << ", write request table: " << m_writeRequestTable
979        << "]";
980}
981
982// This can be called from setState whenever coherence permissions are
983// upgraded; when invoked, coherence violations will be checked for the
984// given block.
985void
986GPUCoalescer::checkCoherence(Addr addr)
987{
988#ifdef CHECK_COHERENCE
989    m_ruby_system->checkGlobalCoherenceInvariant(addr);
990#endif
991}
992
993void
994GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
995    DPRINTF(RubyStats, "Recorded statistic: %s\n",
996            SequencerRequestType_to_string(requestType));
997}
998
999GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
1000    : Event(Progress_Event_Pri), seq(_seq)
1001{
1002}
1003
1004
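// Triggered by issueEvent: issue one Ruby request per newly accessed
// cache line recorded in newRequests, then fire any pending kernel-end
// callbacks.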
1005void
1006GPUCoalescer::completeIssue()
1007{
1008    // newRequests has the cacheline addresses of all the
1009    // requests which need to be issued to the memory subsystem
1010    // in this cycle
1011    int len = newRequests.size();
1012    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
1013    for (int i = 0; i < len; ++i) {
1014        // Get the requests from the reqCoalescer table. Get only the
1015        // first request for each cacheline; the remaining requests
1016        // can be coalesced with the first request. So, only
1017        // one request is issued per cacheline.
1018        RequestDesc info = reqCoalescer[newRequests[i]][0];
1019        PacketPtr pkt = info.pkt;
1020        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
1021                i, pkt->req->getPaddr());
1022        // Insert this request to the read/writeRequestTables. These tables
1023        // are used to track aliased requests in makeRequest subroutine
1024        bool found = insertRequest(pkt, info.primaryType);
1025
1026        if (found) {
1027            panic("GPUCoalescer::makeRequest should never be called if the "
1028                  "request is already outstanding\n");
1029        }
1030
1031        // Issue request to ruby subsystem
1032        issueRequest(pkt, info.secondaryType);
1033    }
1034    newRequests.clear();
1035
1036    // Have any kernel-end releases been issued this cycle?
1037    len = newKernelEnds.size();
1038    for (int i = 0; i < len; i++) {
1039        kernelCallback(newKernelEnds[i]);
1040    }
1041    newKernelEnds.clear();
1042}
1043
1044void
1045GPUCoalescer::IssueEvent::process()
1046{
1047    seq->completeIssue();
1048}
1049
1050const char *
1051GPUCoalescer::IssueEvent::description() const
1052{
1053    return "Issue coalesced request";
1054}
1055
1056void
1057GPUCoalescer::evictionCallback(Addr address)
1058{
1059    ruby_eviction_callback(address);
1060}
1061
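// Complete the packet recorded for this wavefront's kernel end (or
// fence) and remove it from kernelEndList.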
1062void
1063GPUCoalescer::kernelCallback(int wavefront_id)
1064{
1065    assert(kernelEndList.count(wavefront_id));
1066
1067    ruby_hit_callback(kernelEndList[wavefront_id]);
1068
1069    kernelEndList.erase(wavefront_id);
1070}
1071
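// Completion callback for GPU atomics. Similar to hitCallback, except
// that data is only copied back for atomics that return a value and no
// cache MRU update is performed.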
1072void
1073GPUCoalescer::atomicCallback(Addr address,
1074                             MachineType mach,
1075                             const DataBlock& data)
1076{
1077    assert(address == makeLineAddress(address));
1078
1079    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
1080    assert(m_writeRequestTable.count(makeLineAddress(address)));
1081
1082    RequestTable::iterator i = m_writeRequestTable.find(address);
1083    assert(i != m_writeRequestTable.end());
1084    GPUCoalescerRequest* srequest = i->second;
1085
1086    m_writeRequestTable.erase(i);
1087    markRemoved();
1088
1089    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
1090           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
1091           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
1092
1093
1094    // Atomics don't write to cache, so there is no MRU update...
1095
1096    recordMissLatency(srequest, mach,
1097                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
1098
1099    PacketPtr pkt = srequest->pkt;
1100    Addr request_address = pkt->getAddr();
1101    Addr request_line_address = makeLineAddress(pkt->getAddr());
1102
1103    int len = reqCoalescer[request_line_address].size();
1104    std::vector<PacketPtr> mylist;
1105    for (int i = 0; i < len; ++i) {
1106        PacketPtr pkt = reqCoalescer[request_line_address][i].pkt;
1107        assert(srequest->m_type ==
1108               reqCoalescer[request_line_address][i].primaryType);
1109        request_address = (pkt->getAddr());
1110        request_line_address = makeLineAddress(request_address);
1111        if (pkt->getPtr<uint8_t>() &&
1112            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
1113            /* atomics are done in memory, and return the data *before* the atomic op... */
1114            memcpy(pkt->getPtr<uint8_t>(),
1115                   data.getData(getOffset(request_address),
1116                                pkt->getSize()),
1117                   pkt->getSize());
1118        } else {
1119            DPRINTF(MemoryAccess,
1120                    "WARNING.  Data not transferred from Ruby to M5 for type " \
1121                    "%s\n",
1122                    RubyRequestType_to_string(srequest->m_type));
1123        }
1124
1125        // If using the RubyTester, update the RubyTester sender state's
1126        // subBlock with the received data.  The tester will later access
1127        // this state.
1128        // Note: RubyPort will access its sender state before the
1129        // RubyTester.
1130        if (m_usingRubyTester) {
1131            RubyPort::SenderState *requestSenderState =
1132                safe_cast<RubyPort::SenderState*>(pkt->senderState);
1133            RubyTester::SenderState* testerSenderState =
1134                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
1135            testerSenderState->subBlock.mergeFrom(data);
1136        }
1137
1138        mylist.push_back(pkt);
1139    }
1140    delete srequest;
1141    reqCoalescer.erase(request_line_address);
1142    assert(!reqCoalescer.count(request_line_address));
1143
1144    completeHitCallback(mylist, len);
1145}
1146
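// Classify CP responses by where they were serviced (local TCP,
// another TCP, the TCC, or a miss); the read and write variants update
// the corresponding CP load/store statistics.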
1147void
1148GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
1149{
1150    if (myMachID == senderMachID) {
1151        CP_TCPLdHits++;
1152    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1153        CP_TCPLdTransfers++;
1154    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1155        CP_TCCLdHits++;
1156    } else {
1157        CP_LdMiss++;
1158    }
1159}
1160
1161void
1162GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
1163{
1164    if (myMachID == senderMachID) {
1165        CP_TCPStHits++;
1166    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
1167        CP_TCPStTransfers++;
1168    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
1169        CP_TCCStHits++;
1170    } else {
1171        CP_StMiss++;
1172    }
1173}
1174
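// Return each completed packet to the port it arrived on, restoring
// the sender state saved by RubyPort, and retry any stalled requests.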
1175void
1176GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
1177{
1178    for (int i = 0; i < len; ++i) {
1179        RubyPort::SenderState *ss =
1180            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
1181        MemSlavePort *port = ss->port;
1182        assert(port != NULL);
1183
1184        mylist[i]->senderState = ss->predecessor;
1185        delete ss;
1186        port->hitCallback(mylist[i]);
1187        trySendRetries();
1188    }
1189
1190    testDrainComplete();
1191}
1192
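// Look up the packet for an outstanding read request to the given line
// address; the entry must exist.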
1193PacketPtr
1194GPUCoalescer::mapAddrToPkt(Addr address)
1195{
1196    RequestTable::iterator i = m_readRequestTable.find(address);
1197    assert(i != m_readRequestTable.end());
1198    GPUCoalescerRequest* request = i->second;
1199    return request->pkt;
1200}
1201
1202void
1203GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
1204                                MachineType mach,
1205                                Cycles initialRequestTime,
1206                                Cycles forwardRequestTime,
1207                                Cycles firstResponseTime,
1208                                bool success, bool isRegion)
1209{
1210    RubyRequestType type = srequest->m_type;
1211    Cycles issued_time = srequest->issue_time;
1212    Cycles completion_time = curCycle();
1213    assert(completion_time >= issued_time);
1214    Cycles total_lat = completion_time - issued_time;
1215
1216    // cache stats (valid for RfO protocol only)
1217    if (mach == MachineType_TCP) {
1218        if (type == RubyRequestType_LD) {
1219            GPU_TCPLdHits++;
1220        } else {
1221            GPU_TCPStHits++;
1222        }
1223    } else if (mach == MachineType_L1Cache_wCC) {
1224        if (type == RubyRequestType_LD) {
1225            GPU_TCPLdTransfers++;
1226        } else {
1227            GPU_TCPStTransfers++;
1228        }
1229    } else if (mach == MachineType_TCC) {
1230        if (type == RubyRequestType_LD) {
1231            GPU_TCCLdHits++;
1232        } else {
1233            GPU_TCCStHits++;
1234        }
1235    } else  {
1236        if (type == RubyRequestType_LD) {
1237            GPU_LdMiss++;
1238        } else {
1239            GPU_StMiss++;
1240        }
1241    }
1242
1243    // Profile all access latency, even zero latency accesses
1244    m_latencyHist.sample(total_lat);
1245    m_typeLatencyHist[type]->sample(total_lat);
1246
1247    // Profile the miss latency for all non-zero demand misses
1248    if (total_lat != Cycles(0)) {
1249        m_missLatencyHist.sample(total_lat);
1250        m_missTypeLatencyHist[type]->sample(total_lat);
1251
1252        if (mach != MachineType_NUM) {
1253            m_missMachLatencyHist[mach]->sample(total_lat);
1254            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1255
1256            if ((issued_time <= initialRequestTime) &&
1257                (initialRequestTime <= forwardRequestTime) &&
1258                (forwardRequestTime <= firstResponseTime) &&
1259                (firstResponseTime <= completion_time)) {
1260
1261                m_IssueToInitialDelayHist[mach]->sample(
1262                    initialRequestTime - issued_time);
1263                m_InitialToForwardDelayHist[mach]->sample(
1264                    forwardRequestTime - initialRequestTime);
1265                m_ForwardToFirstResponseDelayHist[mach]->sample(
1266                    firstResponseTime - forwardRequestTime);
1267                m_FirstResponseToCompletionDelayHist[mach]->sample(
1268                    completion_time - firstResponseTime);
1269            }
1270        }
1271
1272    }
1273
1274    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1275             curTick(), m_version, "Coal",
1276             success ? "Done" : "SC_Failed", "", "",
1277             printAddress(srequest->pkt->getAddr()), total_lat);
1278}
1279
1280void
1281GPUCoalescer::regStats()
1282{
1283    RubyPort::regStats();
1284
1285    // These statistical variables are not for display.
1286    // The profiler will collate these across different
1287    // coalescers and display those collated statistics.
1288    m_outstandReqHist.init(10);
1289    m_latencyHist.init(10);
1290    m_missLatencyHist.init(10);
1291
1292    for (int i = 0; i < RubyRequestType_NUM; i++) {
1293        m_typeLatencyHist.push_back(new Stats::Histogram());
1294        m_typeLatencyHist[i]->init(10);
1295
1296        m_missTypeLatencyHist.push_back(new Stats::Histogram());
1297        m_missTypeLatencyHist[i]->init(10);
1298    }
1299
1300    for (int i = 0; i < MachineType_NUM; i++) {
1301        m_missMachLatencyHist.push_back(new Stats::Histogram());
1302        m_missMachLatencyHist[i]->init(10);
1303
1304        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1305        m_IssueToInitialDelayHist[i]->init(10);
1306
1307        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1308        m_InitialToForwardDelayHist[i]->init(10);
1309
1310        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1311        m_ForwardToFirstResponseDelayHist[i]->init(10);
1312
1313        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1314        m_FirstResponseToCompletionDelayHist[i]->init(10);
1315    }
1316
1317    for (int i = 0; i < RubyRequestType_NUM; i++) {
1318        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1319
1320        for (int j = 0; j < MachineType_NUM; j++) {
1321            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1322            m_missTypeMachLatencyHist[i][j]->init(10);
1323        }
1324    }
1325
1326    // GPU cache stats
1327    GPU_TCPLdHits
1328        .name(name() + ".gpu_tcp_ld_hits")
1329        .desc("loads that hit in the TCP")
1330        ;
1331    GPU_TCPLdTransfers
1332        .name(name() + ".gpu_tcp_ld_transfers")
1333        .desc("TCP to TCP load transfers")
1334        ;
1335    GPU_TCCLdHits
1336        .name(name() + ".gpu_tcc_ld_hits")
1337        .desc("loads that hit in the TCC")
1338        ;
1339    GPU_LdMiss
1340        .name(name() + ".gpu_ld_misses")
1341        .desc("loads that miss in the GPU")
1342        ;
1343
1344    GPU_TCPStHits
1345        .name(name() + ".gpu_tcp_st_hits")
1346        .desc("stores that hit in the TCP")
1347        ;
1348    GPU_TCPStTransfers
1349        .name(name() + ".gpu_tcp_st_transfers")
1350        .desc("TCP to TCP store transfers")
1351        ;
1352    GPU_TCCStHits
1353        .name(name() + ".gpu_tcc_st_hits")
1354        .desc("stores that hit in the TCC")
1355        ;
1356    GPU_StMiss
1357        .name(name() + ".gpu_st_misses")
1358        .desc("stores that miss in the GPU")
1359        ;
1360
1361    // CP cache stats
1362    CP_TCPLdHits
1363        .name(name() + ".cp_tcp_ld_hits")
1364        .desc("loads that hit in the TCP")
1365        ;
1366    CP_TCPLdTransfers
1367        .name(name() + ".cp_tcp_ld_transfers")
1368        .desc("TCP to TCP load transfers")
1369        ;
1370    CP_TCCLdHits
1371        .name(name() + ".cp_tcc_ld_hits")
1372        .desc("loads that hit in the TCC")
1373        ;
1374    CP_LdMiss
1375        .name(name() + ".cp_ld_misses")
1376        .desc("loads that miss in the GPU")
1377        ;
1378
1379    CP_TCPStHits
1380        .name(name() + ".cp_tcp_st_hits")
1381        .desc("stores that hit in the TCP")
1382        ;
1383    CP_TCPStTransfers
1384        .name(name() + ".cp_tcp_st_transfers")
1385        .desc("TCP to TCP store transfers")
1386        ;
1387    CP_TCCStHits
1388        .name(name() + ".cp_tcc_st_hits")
1389        .desc("stores that hit in the TCC")
1390        ;
1391    CP_StMiss
1392        .name(name() + ".cp_st_misses")
1393        .desc("stores that miss in the GPU")
1394        ;
1395}
1396