tlb_coalescer.cc revision 12697:cd71b966be1e
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: Lisa Hsu
34 */
35
36#include "gpu-compute/tlb_coalescer.hh"
37
38#include <cstring>
39
40#include "debug/GPUTLB.hh"
41
42TLBCoalescer::TLBCoalescer(const Params *p)
43    : MemObject(p),
44      clock(p->clk_domain->clockPeriod()),
45      TLBProbesPerCycle(p->probesPerCycle),
46      coalescingWindow(p->coalescingWindow),
47      disableCoalescing(p->disableCoalescing),
48      probeTLBEvent([this]{ processProbeTLBEvent(); },
49                    "Probe the TLB below",
50                    false, Event::CPU_Tick_Pri),
51      cleanupEvent([this]{ processCleanupEvent(); },
52                   "Cleanup issuedTranslationsTable hashmap",
53                   false, Event::Maximum_Pri)
54{
55    // create the slave ports based on the number of connected ports
56    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
57        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
58                                              this, i));
59    }
60
61    // create the master ports based on the number of connected ports
62    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
63        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
64                                              this, i));
65    }
66}
67
68BaseSlavePort&
69TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
70{
71    if (if_name == "slave") {
72        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
73            panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
74        }
75
76        return *cpuSidePort[idx];
77    } else {
78        panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
79    }
80}
81
82BaseMasterPort&
83TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
84{
85    if (if_name == "master") {
86        if (idx >= static_cast<PortID>(memSidePort.size())) {
87            panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
88        }
89
90        return *memSidePort[idx];
91    } else {
92        panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
93    }
94}
95
96/*
97 * This method returns true if the <incoming_pkt>
98 * can be coalesced with <coalesced_pkt> and false otherwise.
99 * A given set of rules is checked.
100 * The rules can potentially be modified based on the TLB level.
101 */
102bool
103TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
104{
105    if (disableCoalescing)
106        return false;
107
108    TheISA::GpuTLB::TranslationState *incoming_state =
109      safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
110
111    TheISA::GpuTLB::TranslationState *coalesced_state =
112     safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
113
114    // Rule 1: Coalesce requests only if they
115    // fall within the same virtual page
116    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
117                                             TheISA::PageBytes);
118
119    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
120                                              TheISA::PageBytes);
121
122    if (incoming_virt_page_addr != coalesced_virt_page_addr)
123        return false;
124
125    //* Rule 2: Coalesce requests only if they
126    // share a TLB Mode, i.e. they are both read
127    // or write requests.
128    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
129    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;
130
131    if (incoming_mode != coalesced_mode)
132        return false;
133
134    // when we can coalesce a packet update the reqCnt
135    // that is the number of packets represented by
136    // this coalesced packet
137    if (!incoming_state->prefetch)
138        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
139
140    return true;
141}
142
143/*
144 * We need to update the physical addresses of all the translation requests
145 * that were coalesced into the one that just returned.
146 */
147void
148TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
149{
150    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
151
152    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
153            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
154
155    TheISA::GpuTLB::TranslationState *sender_state =
156        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
157
158    TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry;
159    assert(tlb_entry);
160    Addr first_entry_vaddr = tlb_entry->vaddr;
161    Addr first_entry_paddr = tlb_entry->paddr;
162    int page_size = tlb_entry->size();
163    bool uncacheable = tlb_entry->uncacheable;
164    int first_hit_level = sender_state->hitLevel;
165    bool valid = tlb_entry->valid;
166
167    // Get the physical page address of the translated request
168    // Using the page_size specified in the TLBEntry allows us
169    // to support different page sizes.
170    Addr phys_page_paddr = pkt->req->getPaddr();
171    phys_page_paddr &= ~(page_size - 1);
172
173    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
174        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
175        TheISA::GpuTLB::TranslationState *sender_state =
176            safe_cast<TheISA::GpuTLB::TranslationState*>(
177                    local_pkt->senderState);
178
179        // we are sending the packet back, so pop the reqCnt associated
180        // with this level in the TLB hiearchy
181        if (!sender_state->prefetch)
182            sender_state->reqCnt.pop_back();
183
184        /*
185         * Only the first packet from this coalesced request has been
186         * translated. Grab the translated phys. page addr and update the
187         * physical addresses of the remaining packets with the appropriate
188         * page offsets.
189         */
190        if (i) {
191            Addr paddr = phys_page_paddr;
192            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
193            local_pkt->req->setPaddr(paddr);
194
195            if (uncacheable)
196                local_pkt->req->setFlags(Request::UNCACHEABLE);
197
198            // update senderState->tlbEntry, so we can insert
199            // the correct TLBEentry in the TLBs above.
200            sender_state->tlbEntry =
201                new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr,
202                                        valid);
203
204            // update the hitLevel for all uncoalesced reqs
205            // so that each packet knows where it hit
206            // (used for statistics in the CUs)
207            sender_state->hitLevel = first_hit_level;
208        }
209
210        SlavePort *return_port = sender_state->ports.back();
211        sender_state->ports.pop_back();
212
213        // Translation is done - Convert to a response pkt if necessary and
214        // send the translation back
215        if (local_pkt->isRequest()) {
216            local_pkt->makeTimingResponse();
217        }
218
219        return_port->sendTimingResp(local_pkt);
220    }
221
222    // schedule clean up for end of this cycle
223    // This is a maximum priority event and must be on
224    // the same cycle as GPUTLB cleanup event to prevent
225    // race conditions with an IssueProbeEvent caused by
226    // MemSidePort::recvReqRetry
227    cleanupQueue.push(virt_page_addr);
228
229    if (!cleanupEvent.scheduled())
230        schedule(cleanupEvent, curTick());
231}
232
// Receive translation requests, create a coalesced request,
// and send them to the TLB (TLBProbesPerCycle)
//
// Always returns true: every incoming request is buffered in
// coalescerFIFO, so this port never applies back-pressure here.
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    // Prefetch requests are excluded from all statistics accounting.
    bool update_stats = !sender_state->prefetch;

    if (update_stats) {
        // if reqCnt is empty then this packet does not represent
        // multiple uncoalesced reqs(pkts) but just a single pkt.
        // If it does though then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
        // Queuing-cycle stats are computed as (departure - arrival):
        // subtract the arrival tick now; the departure tick is added
        // when the request is sent in processProbeTLBEvent.
        coalescer->queuingCycles -= (curTick() * req_cnt);
        coalescer->localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
       sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        // The first packet of each coalesced vector is the
        // representative used for the coalescing rules check.
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // if this is the first request for this tick_index
    // or we did not manage to coalesce, update stats
    // and make necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        // Start a fresh coalesced-request vector seeded with this packet.
        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    //schedule probeTLBEvent next cycle to send the
    //coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                curTick() + coalescer->ticks(1));
    }

    return true;
}
333
334void
335TLBCoalescer::CpuSidePort::recvReqRetry()
336{
337    assert(false);
338}
339
340void
341TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
342{
343
344    TheISA::GpuTLB::TranslationState *sender_state =
345        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
346
347    bool update_stats = !sender_state->prefetch;
348
349    if (update_stats)
350        coalescer->uncoalescedAccesses++;
351
352    // If there is a pending timing request for this virtual address
353    // print a warning message. This is a temporary caveat of
354    // the current simulator where atomic and timing requests can
355    // coexist. FIXME remove this check/warning in the future.
356    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
357    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);
358
359    if (map_count) {
360        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
361                "req. pending\n", virt_page_addr);
362    }
363
364    coalescer->memSidePort[0]->sendFunctional(pkt);
365}
366
367AddrRangeList
368TLBCoalescer::CpuSidePort::getAddrRanges() const
369{
370    // currently not checked by the master
371    AddrRangeList ranges;
372
373    return ranges;
374}
375
376bool
377TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
378{
379    // a translation completed and returned
380    coalescer->updatePhysAddresses(pkt);
381
382    return true;
383}
384
385void
386TLBCoalescer::MemSidePort::recvReqRetry()
387{
388    //we've receeived a retry. Schedule a probeTLBEvent
389    if (!coalescer->probeTLBEvent.scheduled())
390        coalescer->schedule(coalescer->probeTLBEvent,
391                curTick() + coalescer->ticks(1));
392}
393
394void
395TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
396{
397    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
398}
399
400/*
401 * Here we scan the coalescer FIFO and issue the max
402 * number of permitted probes to the TLB below. We
403 * permit bypassing of coalesced requests for the same
404 * tick_index.
405 *
406 * We do not access the next tick_index unless we've
407 * drained the previous one. The coalesced requests
408 * that are successfully sent are moved to the
409 * issuedTranslationsTable table (the table which keeps
410 * track of the outstanding reqs)
411 */
412void
413TLBCoalescer::processProbeTLBEvent()
414{
415    // number of TLB probes sent so far
416    int sent_probes = 0;
417    // rejected denotes a blocking event
418    bool rejected = false;
419
420    // It is set to true either when the recvTiming of the TLB below
421    // returns false or when there is another outstanding request for the
422    // same virt. page.
423
424    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);
425
426    for (auto iter = coalescerFIFO.begin();
427         iter != coalescerFIFO.end() && !rejected; ) {
428        int coalescedReq_cnt = iter->second.size();
429        int i = 0;
430        int vector_index = 0;
431
432        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
433               coalescedReq_cnt, iter->first);
434
435        while (i < coalescedReq_cnt) {
436            ++i;
437            PacketPtr first_packet = iter->second[vector_index][0];
438
439            // compute virtual page address for this request
440            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
441                    TheISA::PageBytes);
442
443            // is there another outstanding request for the same page addr?
444            int pending_reqs =
445                issuedTranslationsTable.count(virt_page_addr);
446
447            if (pending_reqs) {
448                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
449                        "page %#x\n", virt_page_addr);
450
451                ++vector_index;
452                rejected = true;
453
454                continue;
455            }
456
457            // send the coalesced request for virt_page_addr
458            if (!memSidePort[0]->sendTimingReq(first_packet)) {
459                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
460                       virt_page_addr);
461
462                // No need for a retries queue since we are already buffering
463                // the coalesced request in coalescerFIFO.
464                rejected = true;
465                ++vector_index;
466            } else {
467                TheISA::GpuTLB::TranslationState *tmp_sender_state =
468                    safe_cast<TheISA::GpuTLB::TranslationState*>
469                    (first_packet->senderState);
470
471                bool update_stats = !tmp_sender_state->prefetch;
472
473                if (update_stats) {
474                    // req_cnt is total number of packets represented
475                    // by the one we just sent counting all the way from
476                    // the top of TLB hiearchy (i.e., from the CU)
477                    int req_cnt = tmp_sender_state->reqCnt.back();
478                    queuingCycles += (curTick() * req_cnt);
479
480                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
481                            name(), req_cnt);
482
483                    // pkt_cnt is number of packets we coalesced into the one
484                    // we just sent but only at this coalescer level
485                    int pkt_cnt = iter->second[vector_index].size();
486                    localqueuingCycles += (curTick() * pkt_cnt);
487                }
488
489                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
490                       virt_page_addr);
491
492                //copy coalescedReq to issuedTranslationsTable
493                issuedTranslationsTable[virt_page_addr]
494                    = iter->second[vector_index];
495
496                //erase the entry of this coalesced req
497                iter->second.erase(iter->second.begin() + vector_index);
498
499                if (iter->second.empty())
500                    assert(i == coalescedReq_cnt);
501
502                sent_probes++;
503                if (sent_probes == TLBProbesPerCycle)
504                   return;
505            }
506        }
507
508        //if there are no more coalesced reqs for this tick_index
509        //erase the hash_map with the first iterator
510        if (iter->second.empty()) {
511            coalescerFIFO.erase(iter++);
512        } else {
513            ++iter;
514        }
515    }
516}
517
518void
519TLBCoalescer::processCleanupEvent()
520{
521    while (!cleanupQueue.empty()) {
522        Addr cleanup_addr = cleanupQueue.front();
523        cleanupQueue.pop();
524        issuedTranslationsTable.erase(cleanup_addr);
525
526        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
527                cleanup_addr);
528    }
529}
530
void
TLBCoalescer::regStats()
{
    MemObject::regStats();

    // Accesses as seen before coalescing: one per incoming packet
    // (incremented in recvTimingReq/recvFunctional).
    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    // Coalesced requests actually formed (one per new coalesced vector
    // created in recvTimingReq).
    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    // Accumulated as (departure - arrival) ticks weighted by the total
    // request count from the top of the TLB hierarchy.
    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    // Same accounting, but weighted only by packets coalesced at this
    // coalescer level.
    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

    // Derived formula stat: average queuing latency per incoming packet.
    localLatency = localqueuingCycles / uncoalescedAccesses;
}
563
564
// Standard gem5 params-factory hook: construct the TLBCoalescer
// SimObject from its generated Params instance.
TLBCoalescer*
TLBCoalescerParams::create()
{
    return new TLBCoalescer(this);
}
570
571