tlb_coalescer.cc revision 12717
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: Lisa Hsu
34 */
35
36#include "gpu-compute/tlb_coalescer.hh"
37
38#include <cstring>
39
40#include "debug/GPUTLB.hh"
41#include "sim/process.hh"
42
43TLBCoalescer::TLBCoalescer(const Params *p)
44    : MemObject(p),
45      clock(p->clk_domain->clockPeriod()),
46      TLBProbesPerCycle(p->probesPerCycle),
47      coalescingWindow(p->coalescingWindow),
48      disableCoalescing(p->disableCoalescing),
49      probeTLBEvent([this]{ processProbeTLBEvent(); },
50                    "Probe the TLB below",
51                    false, Event::CPU_Tick_Pri),
52      cleanupEvent([this]{ processCleanupEvent(); },
53                   "Cleanup issuedTranslationsTable hashmap",
54                   false, Event::Maximum_Pri)
55{
56    // create the slave ports based on the number of connected ports
57    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
58        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
59                                              this, i));
60    }
61
62    // create the master ports based on the number of connected ports
63    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
64        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
65                                              this, i));
66    }
67}
68
69BaseSlavePort&
70TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
71{
72    if (if_name == "slave") {
73        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
74            panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
75        }
76
77        return *cpuSidePort[idx];
78    } else {
79        panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
80    }
81}
82
83BaseMasterPort&
84TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
85{
86    if (if_name == "master") {
87        if (idx >= static_cast<PortID>(memSidePort.size())) {
88            panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
89        }
90
91        return *memSidePort[idx];
92    } else {
93        panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
94    }
95}
96
97/*
98 * This method returns true if the <incoming_pkt>
99 * can be coalesced with <coalesced_pkt> and false otherwise.
100 * A given set of rules is checked.
101 * The rules can potentially be modified based on the TLB level.
102 */
103bool
104TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
105{
106    if (disableCoalescing)
107        return false;
108
109    TheISA::GpuTLB::TranslationState *incoming_state =
110      safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
111
112    TheISA::GpuTLB::TranslationState *coalesced_state =
113     safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
114
115    // Rule 1: Coalesce requests only if they
116    // fall within the same virtual page
117    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
118                                             TheISA::PageBytes);
119
120    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
121                                              TheISA::PageBytes);
122
123    if (incoming_virt_page_addr != coalesced_virt_page_addr)
124        return false;
125
126    //* Rule 2: Coalesce requests only if they
127    // share a TLB Mode, i.e. they are both read
128    // or write requests.
129    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
130    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;
131
132    if (incoming_mode != coalesced_mode)
133        return false;
134
135    // when we can coalesce a packet update the reqCnt
136    // that is the number of packets represented by
137    // this coalesced packet
138    if (!incoming_state->prefetch)
139        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
140
141    return true;
142}
143
144/*
145 * We need to update the physical addresses of all the translation requests
146 * that were coalesced into the one that just returned.
147 */
148void
149TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
150{
151    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
152
153    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
154            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
155
156    TheISA::GpuTLB::TranslationState *sender_state =
157        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
158
159    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
160    assert(tlb_entry);
161    Addr first_entry_vaddr = tlb_entry->vaddr;
162    Addr first_entry_paddr = tlb_entry->paddr;
163    int page_size = tlb_entry->size();
164    bool uncacheable = tlb_entry->uncacheable;
165    int first_hit_level = sender_state->hitLevel;
166
167    // Get the physical page address of the translated request
168    // Using the page_size specified in the TLBEntry allows us
169    // to support different page sizes.
170    Addr phys_page_paddr = pkt->req->getPaddr();
171    phys_page_paddr &= ~(page_size - 1);
172
173    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
174        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
175        TheISA::GpuTLB::TranslationState *sender_state =
176            safe_cast<TheISA::GpuTLB::TranslationState*>(
177                    local_pkt->senderState);
178
179        // we are sending the packet back, so pop the reqCnt associated
180        // with this level in the TLB hiearchy
181        if (!sender_state->prefetch)
182            sender_state->reqCnt.pop_back();
183
184        /*
185         * Only the first packet from this coalesced request has been
186         * translated. Grab the translated phys. page addr and update the
187         * physical addresses of the remaining packets with the appropriate
188         * page offsets.
189         */
190        if (i) {
191            Addr paddr = phys_page_paddr;
192            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
193            local_pkt->req->setPaddr(paddr);
194
195            if (uncacheable)
196                local_pkt->req->setFlags(Request::UNCACHEABLE);
197
198            // update senderState->tlbEntry, so we can insert
199            // the correct TLBEentry in the TLBs above.
200            auto p = sender_state->tc->getProcessPtr();
201            sender_state->tlbEntry =
202                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
203                    first_entry_paddr, false, false);
204
205            // update the hitLevel for all uncoalesced reqs
206            // so that each packet knows where it hit
207            // (used for statistics in the CUs)
208            sender_state->hitLevel = first_hit_level;
209        }
210
211        SlavePort *return_port = sender_state->ports.back();
212        sender_state->ports.pop_back();
213
214        // Translation is done - Convert to a response pkt if necessary and
215        // send the translation back
216        if (local_pkt->isRequest()) {
217            local_pkt->makeTimingResponse();
218        }
219
220        return_port->sendTimingResp(local_pkt);
221    }
222
223    // schedule clean up for end of this cycle
224    // This is a maximum priority event and must be on
225    // the same cycle as GPUTLB cleanup event to prevent
226    // race conditions with an IssueProbeEvent caused by
227    // MemSidePort::recvReqRetry
228    cleanupQueue.push(virt_page_addr);
229
230    if (!cleanupEvent.scheduled())
231        schedule(cleanupEvent, curTick());
232}
233
234// Receive translation requests, create a coalesced request,
235// and send them to the TLB (TLBProbesPerCycle)
236bool
237TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
238{
239    // first packet of a coalesced request
240    PacketPtr first_packet = nullptr;
241    // true if we are able to do coalescing
242    bool didCoalesce = false;
243    // number of coalesced reqs for a given window
244    int coalescedReq_cnt = 0;
245
246    TheISA::GpuTLB::TranslationState *sender_state =
247        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
248
249    // push back the port to remember the path back
250    sender_state->ports.push_back(this);
251
252    bool update_stats = !sender_state->prefetch;
253
254    if (update_stats) {
255        // if reqCnt is empty then this packet does not represent
256        // multiple uncoalesced reqs(pkts) but just a single pkt.
257        // If it does though then the reqCnt for each level in the
258        // hierarchy accumulates the total number of reqs this packet
259        // represents
260        int req_cnt = 1;
261
262        if (!sender_state->reqCnt.empty())
263            req_cnt = sender_state->reqCnt.back();
264
265        sender_state->reqCnt.push_back(req_cnt);
266
267        // update statistics
268        coalescer->uncoalescedAccesses++;
269        req_cnt = sender_state->reqCnt.back();
270        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
271        coalescer->queuingCycles -= (curTick() * req_cnt);
272        coalescer->localqueuingCycles -= curTick();
273    }
274
275    // FIXME if you want to coalesce not based on the issueTime
276    // of the packets (i.e., from the compute unit's perspective)
277    // but based on when they reached this coalescer then
278    // remove the following if statement and use curTick() or
279    // coalescingWindow for the tick_index.
280    if (!sender_state->issueTime)
281       sender_state->issueTime = curTick();
282
283    // The tick index is used as a key to the coalescerFIFO hashmap.
284    // It is shared by all candidates that fall within the
285    // given coalescingWindow.
286    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
287
288    if (coalescer->coalescerFIFO.count(tick_index)) {
289        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
290    }
291
292    // see if we can coalesce the incoming pkt with another
293    // coalesced request with the same tick_index
294    for (int i = 0; i < coalescedReq_cnt; ++i) {
295        first_packet = coalescer->coalescerFIFO[tick_index][i][0];
296
297        if (coalescer->canCoalesce(pkt, first_packet)) {
298            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);
299
300            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
301                    i, tick_index,
302                    coalescer->coalescerFIFO[tick_index][i].size());
303
304            didCoalesce = true;
305            break;
306        }
307    }
308
309    // if this is the first request for this tick_index
310    // or we did not manage to coalesce, update stats
311    // and make necessary allocations.
312    if (!coalescedReq_cnt || !didCoalesce) {
313        if (update_stats)
314            coalescer->coalescedAccesses++;
315
316        std::vector<PacketPtr> new_array;
317        new_array.push_back(pkt);
318        coalescer->coalescerFIFO[tick_index].push_back(new_array);
319
320        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
321                "push\n", tick_index,
322                coalescer->coalescerFIFO[tick_index].size());
323    }
324
325    //schedule probeTLBEvent next cycle to send the
326    //coalesced requests to the TLB
327    if (!coalescer->probeTLBEvent.scheduled()) {
328        coalescer->schedule(coalescer->probeTLBEvent,
329                curTick() + coalescer->ticks(1));
330    }
331
332    return true;
333}
334
335void
336TLBCoalescer::CpuSidePort::recvReqRetry()
337{
338    assert(false);
339}
340
341void
342TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
343{
344
345    TheISA::GpuTLB::TranslationState *sender_state =
346        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
347
348    bool update_stats = !sender_state->prefetch;
349
350    if (update_stats)
351        coalescer->uncoalescedAccesses++;
352
353    // If there is a pending timing request for this virtual address
354    // print a warning message. This is a temporary caveat of
355    // the current simulator where atomic and timing requests can
356    // coexist. FIXME remove this check/warning in the future.
357    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
358    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);
359
360    if (map_count) {
361        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
362                "req. pending\n", virt_page_addr);
363    }
364
365    coalescer->memSidePort[0]->sendFunctional(pkt);
366}
367
368AddrRangeList
369TLBCoalescer::CpuSidePort::getAddrRanges() const
370{
371    // currently not checked by the master
372    AddrRangeList ranges;
373
374    return ranges;
375}
376
377bool
378TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
379{
380    // a translation completed and returned
381    coalescer->updatePhysAddresses(pkt);
382
383    return true;
384}
385
386void
387TLBCoalescer::MemSidePort::recvReqRetry()
388{
389    //we've receeived a retry. Schedule a probeTLBEvent
390    if (!coalescer->probeTLBEvent.scheduled())
391        coalescer->schedule(coalescer->probeTLBEvent,
392                curTick() + coalescer->ticks(1));
393}
394
395void
396TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
397{
398    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
399}
400
401/*
402 * Here we scan the coalescer FIFO and issue the max
403 * number of permitted probes to the TLB below. We
404 * permit bypassing of coalesced requests for the same
405 * tick_index.
406 *
407 * We do not access the next tick_index unless we've
408 * drained the previous one. The coalesced requests
409 * that are successfully sent are moved to the
410 * issuedTranslationsTable table (the table which keeps
411 * track of the outstanding reqs)
412 */
413void
414TLBCoalescer::processProbeTLBEvent()
415{
416    // number of TLB probes sent so far
417    int sent_probes = 0;
418    // rejected denotes a blocking event
419    bool rejected = false;
420
421    // It is set to true either when the recvTiming of the TLB below
422    // returns false or when there is another outstanding request for the
423    // same virt. page.
424
425    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);
426
427    for (auto iter = coalescerFIFO.begin();
428         iter != coalescerFIFO.end() && !rejected; ) {
429        int coalescedReq_cnt = iter->second.size();
430        int i = 0;
431        int vector_index = 0;
432
433        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
434               coalescedReq_cnt, iter->first);
435
436        while (i < coalescedReq_cnt) {
437            ++i;
438            PacketPtr first_packet = iter->second[vector_index][0];
439
440            // compute virtual page address for this request
441            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
442                    TheISA::PageBytes);
443
444            // is there another outstanding request for the same page addr?
445            int pending_reqs =
446                issuedTranslationsTable.count(virt_page_addr);
447
448            if (pending_reqs) {
449                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
450                        "page %#x\n", virt_page_addr);
451
452                ++vector_index;
453                rejected = true;
454
455                continue;
456            }
457
458            // send the coalesced request for virt_page_addr
459            if (!memSidePort[0]->sendTimingReq(first_packet)) {
460                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
461                       virt_page_addr);
462
463                // No need for a retries queue since we are already buffering
464                // the coalesced request in coalescerFIFO.
465                rejected = true;
466                ++vector_index;
467            } else {
468                TheISA::GpuTLB::TranslationState *tmp_sender_state =
469                    safe_cast<TheISA::GpuTLB::TranslationState*>
470                    (first_packet->senderState);
471
472                bool update_stats = !tmp_sender_state->prefetch;
473
474                if (update_stats) {
475                    // req_cnt is total number of packets represented
476                    // by the one we just sent counting all the way from
477                    // the top of TLB hiearchy (i.e., from the CU)
478                    int req_cnt = tmp_sender_state->reqCnt.back();
479                    queuingCycles += (curTick() * req_cnt);
480
481                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
482                            name(), req_cnt);
483
484                    // pkt_cnt is number of packets we coalesced into the one
485                    // we just sent but only at this coalescer level
486                    int pkt_cnt = iter->second[vector_index].size();
487                    localqueuingCycles += (curTick() * pkt_cnt);
488                }
489
490                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
491                       virt_page_addr);
492
493                //copy coalescedReq to issuedTranslationsTable
494                issuedTranslationsTable[virt_page_addr]
495                    = iter->second[vector_index];
496
497                //erase the entry of this coalesced req
498                iter->second.erase(iter->second.begin() + vector_index);
499
500                if (iter->second.empty())
501                    assert(i == coalescedReq_cnt);
502
503                sent_probes++;
504                if (sent_probes == TLBProbesPerCycle)
505                   return;
506            }
507        }
508
509        //if there are no more coalesced reqs for this tick_index
510        //erase the hash_map with the first iterator
511        if (iter->second.empty()) {
512            coalescerFIFO.erase(iter++);
513        } else {
514            ++iter;
515        }
516    }
517}
518
519void
520TLBCoalescer::processCleanupEvent()
521{
522    while (!cleanupQueue.empty()) {
523        Addr cleanup_addr = cleanupQueue.front();
524        cleanupQueue.pop();
525        issuedTranslationsTable.erase(cleanup_addr);
526
527        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
528                cleanup_addr);
529    }
530}
531
532void
533TLBCoalescer::regStats()
534{
535    MemObject::regStats();
536
537    uncoalescedAccesses
538        .name(name() + ".uncoalesced_accesses")
539        .desc("Number of uncoalesced TLB accesses")
540        ;
541
542    coalescedAccesses
543        .name(name() + ".coalesced_accesses")
544        .desc("Number of coalesced TLB accesses")
545        ;
546
547    queuingCycles
548        .name(name() + ".queuing_cycles")
549        .desc("Number of cycles spent in queue")
550        ;
551
552    localqueuingCycles
553        .name(name() + ".local_queuing_cycles")
554        .desc("Number of cycles spent in queue for all incoming reqs")
555        ;
556
557    localLatency
558        .name(name() + ".local_latency")
559        .desc("Avg. latency over all incoming pkts")
560        ;
561
562    localLatency = localqueuingCycles / uncoalescedAccesses;
563}
564
565
566TLBCoalescer*
567TLBCoalescerParams::create()
568{
569    return new TLBCoalescer(this);
570}
571
572