tlb_coalescer.cc revision 13449:2f7efa89c58b
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#include "gpu-compute/tlb_coalescer.hh"

#include <cstring>

#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

TLBCoalescer::TLBCoalescer(const Params *p)
    : MemObject(p),
      clock(p->clk_domain->clockPeriod()),
      TLBProbesPerCycle(p->probesPerCycle),
      coalescingWindow(p->coalescingWindow),
      disableCoalescing(p->disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri)
{
    // create the slave ports based on the number of connected ports
    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    // create the master ports based on the number of connected ports
    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }
}

BaseSlavePort&
TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
{
    if (if_name == "slave") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else {
        panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
    }
}

BaseMasterPort&
TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
{
    if (if_name == "master") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt>
 * can be coalesced with <coalesced_pkt> and false otherwise.
 * A given set of rules is checked.
 * The rules can potentially be modified based on the TLB level.
 */
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    if (disableCoalescing)
        return false;

    TheISA::GpuTLB::TranslationState *incoming_state =
      safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);

    TheISA::GpuTLB::TranslationState *coalesced_state =
     safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             TheISA::PageBytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              TheISA::PageBytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e., they are both read
    // or write requests.
    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // When we can coalesce a packet, update reqCnt, i.e., the number
    // of packets represented by this coalesced packet.
    if (!incoming_state->prefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Get the physical page address of the translated request
    // Using the page_size specified in the TLBEntry allows us
    // to support different page sizes.
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~(page_size - 1);

    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(
                    local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!sender_state->prefetch)
            sender_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TlbEntry in the TLBs above.
            auto p = sender_state->tc->getProcessPtr();
            sender_state->tlbEntry =
                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
                    first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            sender_state->hitLevel = first_hit_level;
        }

        SlavePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // schedule clean up for end of this cycle
    // This is a maximum priority event and must be on
    // the same cycle as GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}

// Receive translation requests, coalesce them into coalesced requests,
// and send them to the TLB below (up to TLBProbesPerCycle per cycle)
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->prefetch;

    if (update_stats) {
        // If reqCnt is empty, then this packet does not represent
        // multiple uncoalesced reqs (pkts) but just a single pkt.
        // If it does, then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents.
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
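        // Queueing time is accounted for by subtracting the arrival tick
        // here (weighted by the number of uncoalesced reqs this packet
        // represents) and adding the send tick in processProbeTLBEvent();
        // the difference is the time spent waiting in this coalescer.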
        coalescer->queuingCycles -= (curTick() * req_cnt);
        coalescer->localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
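    // For example, with a coalescingWindow of 500 ticks, packets issued at
    // ticks 1000 and 1400 both map to tick_index 2 and are candidates for
    // coalescing, while a packet issued at tick 1600 starts a new window.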

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // if this is the first request for this tick_index
    // or we did not manage to coalesce, update stats
    // and make necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    // schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                curTick() + coalescer->ticks(1));
    }

    return true;
}

void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->prefetch;

    if (update_stats)
        coalescer->uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the master
    AddrRangeList ranges;

    return ranges;
}

bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}

void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    // we've received a retry. Schedule a probeTLBEvent
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                curTick() + coalescer->ticks(1));
}

void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to the
 * issuedTranslationsTable (the table which keeps
 * track of the outstanding reqs).
 */
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event
    bool rejected = false;

    // It is set to true either when the recvTiming of the TLB below
    // returns false or when there is another outstanding request for the
    // same virt. page.

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        int vector_index = 0;
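        // i counts how many coalesced requests we have visited for this
        // tick_index; vector_index only advances when a request is skipped,
        // because successfully sent requests are erased from the vector
        // in place.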

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                    TheISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
                rejected = true;
                ++vector_index;
            } else {
                TheISA::GpuTLB::TranslationState *tmp_sender_state =
                    safe_cast<TheISA::GpuTLB::TranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->prefetch;

                if (update_stats) {
                    // req_cnt is total number of packets represented
                    // by the one we just sent counting all the way from
                    // the top of the TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
                    queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is number of packets we coalesced into the one
                    // we just sent but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x\n",
                        virt_page_addr);

                // copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                // erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
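                // Stop once the per-cycle probe budget is exhausted; any
                // remaining coalesced requests stay buffered in
                // coalescerFIFO.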
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        // if there are no more coalesced reqs for this tick_index
        // erase the hash_map with the first iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}

void
TLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

void
TLBCoalescer::regStats()
{
    MemObject::regStats();

    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

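    // localqueuingCycles accumulates (send tick - arrival tick) for every
    // incoming packet, so dividing by the number of uncoalesced accesses
    // gives the average time a packet spends queued in this coalescer.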
    localLatency = localqueuingCycles / uncoalescedAccesses;
}


TLBCoalescer*
TLBCoalescerParams::create()
{
    return new TLBCoalescer(this);
}