tlb_coalescer.cc (13784:1941dc118243) vs. tlb_coalescer.cc (13892:0182a0601f66)
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#include "gpu-compute/tlb_coalescer.hh"

#include <cstring>

#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

TLBCoalescer::TLBCoalescer(const Params *p)
    : ClockedObject(p),   // base class was MemObject(p) in rev 13784:1941dc118243
      clock(p->clk_domain->clockPeriod()),
      TLBProbesPerCycle(p->probesPerCycle),
      coalescingWindow(p->coalescingWindow),
      disableCoalescing(p->disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri)
{
    // create the slave ports based on the number of connected ports
    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    // create the master ports based on the number of connected ports
    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }
}
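
// Note: each port instance is named "<coalescer name>-port<i>" via the
// csprintf() calls above. The counts p->port_slave_connection_count and
// p->port_master_connection_count come from gem5's parameter system and
// reflect how many peers were connected to the "slave"/"master" vector
// ports in the Python configuration.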

Port &
TLBCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "slave") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else if (if_name == "master") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt>
 * can be coalesced with <coalesced_pkt> and false otherwise.
 * A given set of rules is checked.
 * The rules can potentially be modified based on the TLB level.
 */
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    if (disableCoalescing)
        return false;

    TheISA::GpuTLB::TranslationState *incoming_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);

    TheISA::GpuTLB::TranslationState *coalesced_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             TheISA::PageBytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              TheISA::PageBytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e., they are both read
    // or write requests.
    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // When we coalesce a packet, update reqCnt, i.e., the number of
    // packets represented by this coalesced packet.
    if (!incoming_state->prefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}
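
// Worked example of the rules above (illustrative only, assuming 4 KB pages,
// i.e., TheISA::PageBytes == 0x1000): read requests to vaddrs 0x7f001a30 and
// 0x7f001ff8 both round down to virtual page 0x7f001000 and share the Read
// mode, so they coalesce; a request to 0x7f002004 rounds down to 0x7f002000
// and fails Rule 1, and a write to 0x7f001a30 fails Rule 2.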

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Get the physical page address of the translated request.
    // Using the page_size specified in the TLBEntry allows us
    // to support different page sizes.
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~(page_size - 1);
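
    // Address math sketch (illustrative, assuming a 4 KB page, so
    // page_size == 0x1000): if the translated packet returns paddr
    // 0x234506a8, the mask above yields phys_page_paddr == 0x23450000;
    // a coalesced packet whose vaddr carries page offset 0xf3c then gets
    // paddr 0x23450f3c in the loop below, via
    // phys_page_paddr | (vaddr & (page_size - 1)).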
    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(
                    local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!sender_state->prefetch)
            sender_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TlbEntry in the TLBs above.
            auto p = sender_state->tc->getProcessPtr();
            sender_state->tlbEntry =
                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
                                     first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            sender_state->hitLevel = first_hit_level;
        }

        SlavePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // Schedule cleanup for the end of this cycle.
    // This is a maximum priority event and must be on
    // the same cycle as the GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}

// Receive translation requests, coalesce them where possible, and send the
// coalesced requests to the TLB below (at most TLBProbesPerCycle per cycle).
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->prefetch;

    if (update_stats) {
        // If reqCnt is empty then this packet does not represent
        // multiple uncoalesced reqs (pkts) but just a single pkt.
        // If it does, then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents.
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
        coalescer->queuingCycles -= (curTick() * req_cnt);
        coalescer->localqueuingCycles -= curTick();
    }
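
    // Accounting note: the subtractions above pair with the additions in
    // processProbeTLBEvent() when the coalesced request is finally sent, so
    // each packet's net contribution to queuingCycles is
    // (send_tick - arrival_tick) * req_cnt, and localqueuingCycles likewise
    // accumulates the per-packet wait at this coalescer.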

    // FIXME: if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer, then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
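
    // For example (illustrative values only): with coalescingWindow == 500
    // ticks, packets issued at ticks 1000 and 1499 both map to tick_index 2
    // and are candidates for coalescing, while a packet issued at tick 1500
    // falls into tick_index 3 and starts a new window.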

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // If this is the first request for this tick_index,
    // or we did not manage to coalesce, update stats
    // and make the necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    // schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->ticks(1));
    }

    return true;
}

void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->prefetch;

    if (update_stats)
        coalescer->uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address,
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME: remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the master
    AddrRangeList ranges;

    return ranges;
}

bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}

void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    // we've received a retry. Schedule a probeTLBEvent
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->ticks(1));
}

void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the maximum
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to the
 * issuedTranslationsTable (the table which keeps
 * track of the outstanding reqs).
 */
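// Example of the bypass behaviour described above (illustrative): if the
// current tick_index holds three coalesced requests and the first cannot be
// issued (its page already has an outstanding translation, or the TLB below
// rejected it), the scan still attempts the remaining two in the same window;
// only once this window is fully drained does the scan move on to the next
// tick_index.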
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event
    bool rejected = false;

    // It is set to true either when the recvTiming of the TLB below
    // returns false or when there is another outstanding request for the
    // same virt. page.

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                                            TheISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
                rejected = true;
                ++vector_index;
            } else {
                TheISA::GpuTLB::TranslationState *tmp_sender_state =
                    safe_cast<TheISA::GpuTLB::TranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->prefetch;

                if (update_stats) {
                    // req_cnt is the total number of packets represented
                    // by the one we just sent, counting all the way from
                    // the top of the TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
                    queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is the number of packets we coalesced into the
                    // one we just sent, but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x\n",
                        virt_page_addr);

                // copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                // erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        // if there are no more coalesced reqs for this tick_index,
        // erase the hash_map entry using the first iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}

void
TLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

void
TLBCoalescer::regStats()
{
    ClockedObject::regStats();   // was MemObject::regStats() in rev 13784:1941dc118243

    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

    localLatency = localqueuingCycles / uncoalescedAccesses;
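
    // Given the -=/+= accounting in recvTimingReq() and
    // processProbeTLBEvent(), localqueuingCycles holds the summed
    // (send_tick - arrival_tick) over all non-prefetch packets, so the
    // formula above reports the average per-packet wait (in ticks) at
    // this coalescer.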
}


TLBCoalescer*
TLBCoalescerParams::create()
{
    return new TLBCoalescer(this);
}