// fetch_unit.cc -- gem5 repository revision 12697
/*
 * Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez,
 *          Brad Beckmann,
 *          Sooraj Puthoor
 */

#include "gpu-compute/fetch_unit.hh"

#include "debug/GPUFetch.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUTLB.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/ruby/system/RubySystem.hh"

// Shared across all FetchUnit instances; zero-initialized at static init.
// NOTE(review): nothing in this file increments it -- presumably it is
// consumed elsewhere; confirm against the header/callers.
uint32_t FetchUnit::globalFetchUnitID;

// Construct with defaults; real wiring happens later in init()/bindWaveList().
FetchUnit::FetchUnit(const ComputeUnitParams* params) :
    timingSim(true),
    computeUnit(nullptr),
    fetchScheduler(params),
    waveList(nullptr)
{
}

FetchUnit::~FetchUnit()
{
    fetchQueue.clear();
    fetchStatusQueue.clear();
}

// Bind this fetch unit to its compute unit and size the per-wavefront
// status queue. Each fetchStatusQueue entry pairs a wavefront with a flag
// indicating whether that wave is currently enqueued for fetch.
// Precondition: bindWaveList() must have been called first, since waveList
// is dereferenced here.
void
FetchUnit::init(ComputeUnit *cu)
{
    computeUnit = cu;
    timingSim = computeUnit->shader->timingSim;
    fetchQueue.clear();
    fetchStatusQueue.resize(computeUnit->shader->n_wf);

    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
        fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
    }

    fetchScheduler.bindList(&fetchQueue);
}

// Per-cycle fetch stage: mark newly-eligible waves as fetch candidates,
// then let the scheduler pick at most one wave and start its fetch.
void
FetchUnit::exec()
{
    // re-evaluate waves which are marked as not ready for fetch
    for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
        // Following code assumes 64-bit operation and all insts are
        // represented by 64-bit pointers to inst objects.
        Wavefront *curWave = fetchStatusQueue[j].first;
        assert (curWave);

        // The wavefront has to be active, the IB occupancy has to be
        // 4 or less instructions and it can not have any branches to
        // prevent speculative instruction fetches
        if (!fetchStatusQueue[j].second) {
            if (curWave->status == Wavefront::S_RUNNING &&
                curWave->instructionBuffer.size() <= 4 &&
                !curWave->instructionBufferHasBranch() &&
                !curWave->pendingFetch) {
                fetchQueue.push_back(curWave);
                fetchStatusQueue[j].second = true;
            }
        }
    }

    // Fetch only if there is some wave ready to be fetched
    // An empty fetchQueue will cause the scheduler to panic
    if (fetchQueue.size()) {
        Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
        waveToBeFetched->pendingFetch = true;
        fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
        initiateFetch(waveToBeFetched);
    }
}

// Compute the next fetch address for this wavefront and send the ITLB
// translation request: via the timing SQC TLB port (honoring the port's
// stall/retry protocol) in timing mode, or functionally -- followed by an
// immediate fetch() -- otherwise.
void
FetchUnit::initiateFetch(Wavefront *wavefront)
{
    // calculate the virtual address to fetch from the SQC
    Addr vaddr = wavefront->pc();

    /**
     * the instruction buffer holds one instruction per entry, regardless
     * of the underlying instruction's size. the PC, however, addresses
     * instructions on a 32b granularity so we must account for that here.
     */
    // Skip past the bytes of every instruction already buffered so we
    // fetch the next not-yet-buffered instruction.
    for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) {
        vaddr +=
            wavefront->instructionBuffer.at(i)->staticInstruction()->instSize();
    }
    vaddr = wavefront->basePtr + vaddr;

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);

    // Since this is an instruction prefetch, if you're split then just finish
    // out the current line.
    int block_size = computeUnit->cacheLineSize();
    // check for split accesses
    Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
    int size = block_size;

    if (split_addr > vaddr) {
        // misaligned access, just grab the rest of the line
        size = split_addr - vaddr;
    }

    // set up virtual request
    Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
                               computeUnit->masterId(), 0, 0, 0);

    PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
    // This fetchBlock is kind of faux right now - because the translations so
    // far don't actually return Data
    // NOTE(review): fetchBlock is a stack local, yet in timing mode the
    // packet (holding a pointer to it via dataStatic) outlives this call.
    // Safe only as long as nothing ever writes the translation packet's
    // data -- confirm against the TLB implementation.
    uint64_t fetchBlock;
    pkt->dataStatic(&fetchBlock);

    if (timingSim) {
        // SenderState needed on Return
        pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);

        // Sender State needed by TLB hierarchy; it chains the previous
        // senderState so the ITLB port's state can be recovered on return.
        pkt->senderState =
            new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
                                                 computeUnit->shader->gpuTc,
                                                 false, pkt->senderState);

        if (computeUnit->sqcTLBPort->isStalled()) {
            // Port already stalled: queue behind the packets awaiting retry.
            assert(computeUnit->sqcTLBPort->retries.size() > 0);

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit->sqcTLBPort->retries.push_back(pkt);
        } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
            // Stall the data port;
            // No more packet is issued till
            // ruby indicates resources are freed by
            // a recvReqRetry() call back on this port.
            computeUnit->sqcTLBPort->stallPort();

            DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                    vaddr);

            computeUnit->sqcTLBPort->retries.push_back(pkt);
        } else {
            DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
        }
    } else {
        // Functional mode: translate synchronously, then tear down the
        // translation state the TLB attached before fetching.
        pkt->senderState =
            new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
                                                 computeUnit->shader->gpuTc);

        computeUnit->sqcTLBPort->sendFunctional(pkt);

        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

        delete sender_state->tlbEntry;
        delete sender_state;
        // fetch the instructions from the SQC when we operate in
        // functional mode only
        fetch(pkt, wavefront);
    }
}

// Translation is complete: rebuild the packet from the (now populated)
// request, attach a fetch-sized data buffer, and issue the instruction
// read to the SQC (timing or functional, matching the simulation mode).
void
FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
{
    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
            computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
            pkt->req->getPaddr());

    // this is necessary because the GPU TLB receives packets instead of
    // requests. when the translation is complete, all relevant fields in the
    // request will be populated, but not in the packet. here we create the
    // new packet so we can set the size, addr, and proper flags.
    // The request is shared between old and new packets; only the old
    // packet shell is freed here.
    PacketPtr oldPkt = pkt;
    pkt = new Packet(oldPkt->req, oldPkt->cmd);
    delete oldPkt;

    // Buffer for the raw instruction words; ownership passes to the packet
    // via dataDynamic, which frees it when the packet is deleted.
    TheGpuISA::RawMachInst *data =
        new TheGpuISA::RawMachInst[pkt->req->getSize() /
        sizeof(TheGpuISA::RawMachInst)];

    pkt->dataDynamic<TheGpuISA::RawMachInst>(data);

    // New SenderState for the memory access
    pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);

    if (timingSim) {
        // translation is done. Send the appropriate timing memory request.

        if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
            // SQC busy: park the packet for resend on recvReqRetry.
            computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
                                                                   wavefront));

            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
                    computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        } else {
            DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
                    computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
                    pkt->req->getPaddr());
        }
    } else {
        computeUnit->sqcPort->sendFunctional(pkt);
        processFetchReturn(pkt);
    }
}

// Handle a completed SQC fetch: decode the returned raw instruction words
// into GPUDynInsts and append them to the wavefront's instruction buffer,
// then release the packet and its request.
void
FetchUnit::processFetchReturn(PacketPtr pkt)
{
    ComputeUnit::SQCPort::SenderState *sender_state =
        safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);

    Wavefront *wavefront = sender_state->wavefront;

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
            "%d bytes, %d instructions!\n", computeUnit->cu_id,
            wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
            pkt->req->getSize(), pkt->req->getSize() /
            sizeof(TheGpuISA::RawMachInst));

    if (wavefront->dropFetch) {
        // This fetch was invalidated (e.g. after a branch); discard it.
        assert(wavefront->instructionBuffer.empty());
        wavefront->dropFetch = false;
    } else {
        TheGpuISA::RawMachInst *inst_index_ptr =
            (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();

        assert(wavefront->instructionBuffer.size() <= 4);

        for (int i = 0; i < pkt->req->getSize() /
             sizeof(TheGpuISA::RawMachInst); ++i) {
            GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);

            assert(inst_ptr);

            if (inst_ptr->instSize() == 8) {
                /**
                 * this instruction occupies 2 consecutive
                 * entries in the instruction array, the
                 * second of which contains a nullptr. so if
                 * this inst is 8 bytes we advance two entries
                 * instead of 1
                 */
                ++i;
            }

            DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
                    computeUnit->cu_id, wavefront->simdId,
                    wavefront->wfSlotId, inst_ptr->disassemble());

            GPUDynInstPtr gpuDynInst =
                std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
                                             computeUnit->getAndIncSeqNum());

            wavefront->instructionBuffer.push_back(gpuDynInst);
        }
    }

    wavefront->pendingFetch = false;

    // NOTE(review): senderState is deleted through the base Packet
    // SenderState pointer -- relies on its destructor being virtual;
    // confirm in the Packet class. The packet's dtor frees the
    // dataDynamic buffer allocated in fetch().
    delete pkt->senderState;
    delete pkt->req;
    delete pkt;
}

// Record the per-CU wavefront list; must precede init(), which reads it.
void
FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
{
    waveList = wave_list;
}