/*
 * Copyright (c) 2014-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
32 * 33 * Authors: Anthony Gutierrez, 34 * Brad Beckmann, 35 * Sooraj Puthoor 36 */ 37 38#include "gpu-compute/fetch_unit.hh" 39 40#include "debug/GPUFetch.hh" 41#include "debug/GPUPort.hh" 42#include "debug/GPUTLB.hh" 43#include "gpu-compute/compute_unit.hh" 44#include "gpu-compute/gpu_dyn_inst.hh" 45#include "gpu-compute/gpu_static_inst.hh" 46#include "gpu-compute/shader.hh" 47#include "gpu-compute/wavefront.hh" 48#include "mem/ruby/system/RubySystem.hh" 49 50uint32_t FetchUnit::globalFetchUnitID; 51 52FetchUnit::FetchUnit(const ComputeUnitParams* params) : 53 timingSim(true), 54 computeUnit(nullptr), 55 fetchScheduler(params), 56 waveList(nullptr) 57{ 58} 59 60FetchUnit::~FetchUnit() 61{ 62 fetchQueue.clear(); 63 fetchStatusQueue.clear(); 64} 65 66void 67FetchUnit::init(ComputeUnit *cu) 68{ 69 computeUnit = cu; 70 timingSim = computeUnit->shader->timingSim; 71 fetchQueue.clear(); 72 fetchStatusQueue.resize(computeUnit->shader->n_wf); 73 74 for (int j = 0; j < computeUnit->shader->n_wf; ++j) { 75 fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); 76 } 77 78 fetchScheduler.bindList(&fetchQueue); 79} 80 81void 82FetchUnit::exec() 83{ 84 // re-evaluate waves which are marked as not ready for fetch 85 for (int j = 0; j < computeUnit->shader->n_wf; ++j) { 86 // Following code assumes 64-bit opertaion and all insts are 87 // represented by 64-bit pointers to inst objects. 
88 Wavefront *curWave = fetchStatusQueue[j].first; 89 assert (curWave); 90 91 // The wavefront has to be active, the IB occupancy has to be 92 // 4 or less instructions and it can not have any branches to 93 // prevent speculative instruction fetches 94 if (!fetchStatusQueue[j].second) { 95 if (curWave->status == Wavefront::S_RUNNING && 96 curWave->instructionBuffer.size() <= 4 && 97 !curWave->instructionBufferHasBranch() && 98 !curWave->pendingFetch) { 99 fetchQueue.push_back(curWave); 100 fetchStatusQueue[j].second = true; 101 } 102 } 103 } 104 105 // Fetch only if there is some wave ready to be fetched 106 // An empty fetchQueue will cause the schedular to panic 107 if (fetchQueue.size()) { 108 Wavefront *waveToBeFetched = fetchScheduler.chooseWave(); 109 waveToBeFetched->pendingFetch = true; 110 fetchStatusQueue[waveToBeFetched->wfSlotId].second = false; 111 initiateFetch(waveToBeFetched); 112 } 113} 114 115void 116FetchUnit::initiateFetch(Wavefront *wavefront) 117{ 118 // calculate the virtual address to fetch from the SQC 119 Addr vaddr = wavefront->pc(); 120 121 /** 122 * the instruction buffer holds one instruction per entry, regardless 123 * of the underlying instruction's size. the PC, however, addresses 124 * instrutions on a 32b granularity so we must account for that here. 125 */ 126 for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) { 127 vaddr += 128 wavefront->instructionBuffer.at(i)->staticInstruction()->instSize(); 129 } 130 vaddr = wavefront->basePtr + vaddr; 131 132 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", 133 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); 134 135 // Since this is an instruction prefetch, if you're split then just finish 136 // out the current line. 
137 int block_size = computeUnit->cacheLineSize(); 138 // check for split accesses 139 Addr split_addr = roundDown(vaddr + block_size - 1, block_size); 140 int size = block_size; 141 142 if (split_addr > vaddr) { 143 // misaligned access, just grab the rest of the line 144 size = split_addr - vaddr; 145 } 146 147 // set up virtual request 148 RequestPtr req = std::make_shared<Request>( 149 0, vaddr, size, Request::INST_FETCH, 150 computeUnit->masterId(), 0, 0, nullptr); 151 152 PacketPtr pkt = new Packet(req, MemCmd::ReadReq); 153 // This fetchBlock is kind of faux right now - because the translations so 154 // far don't actually return Data 155 uint64_t fetchBlock; 156 pkt->dataStatic(&fetchBlock); 157 158 if (timingSim) { 159 // SenderState needed on Return 160 pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); 161 162 // Sender State needed by TLB hierarchy 163 pkt->senderState = 164 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, 165 computeUnit->shader->gpuTc, 166 false, pkt->senderState); 167 168 if (computeUnit->sqcTLBPort->isStalled()) { 169 assert(computeUnit->sqcTLBPort->retries.size() > 0); 170 171 DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", 172 vaddr); 173 174 computeUnit->sqcTLBPort->retries.push_back(pkt); 175 } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { 176 // Stall the data port; 177 // No more packet is issued till 178 // ruby indicates resources are freed by 179 // a recvReqRetry() call back on this port. 
180 computeUnit->sqcTLBPort->stallPort(); 181 182 DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", 183 vaddr); 184 185 computeUnit->sqcTLBPort->retries.push_back(pkt); 186 } else { 187 DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); 188 } 189 } else { 190 pkt->senderState = 191 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, 192 computeUnit->shader->gpuTc); 193 194 computeUnit->sqcTLBPort->sendFunctional(pkt); 195 196 TheISA::GpuTLB::TranslationState *sender_state = 197 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 198 199 delete sender_state->tlbEntry; 200 delete sender_state; 201 // fetch the instructions from the SQC when we operate in 202 // functional mode only 203 fetch(pkt, wavefront); 204 } 205} 206 207void 208FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) 209{ 210 assert(pkt->req->hasPaddr()); 211 assert(pkt->req->hasSize()); 212 213 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", 214 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 215 pkt->req->getPaddr()); 216 217 // this is necessary because the GPU TLB receives packets instead of 218 // requests. when the translation is complete, all relevent fields in the 219 // request will be populated, but not in the packet. here we create the 220 // new packet so we can set the size, addr, and proper flags. 221 PacketPtr oldPkt = pkt; 222 pkt = new Packet(oldPkt->req, oldPkt->cmd); 223 delete oldPkt; 224 225 TheGpuISA::RawMachInst *data = 226 new TheGpuISA::RawMachInst[pkt->req->getSize() / 227 sizeof(TheGpuISA::RawMachInst)]; 228 229 pkt->dataDynamic<TheGpuISA::RawMachInst>(data); 230 231 // New SenderState for the memory access 232 pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); 233 234 if (timingSim) { 235 // translation is done. Send the appropriate timing memory request. 
236 237 if (!computeUnit->sqcPort->sendTimingReq(pkt)) { 238 computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, 239 wavefront)); 240 241 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", 242 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 243 pkt->req->getPaddr()); 244 } else { 245 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", 246 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 247 pkt->req->getPaddr()); 248 } 249 } else { 250 computeUnit->sqcPort->sendFunctional(pkt); 251 processFetchReturn(pkt); 252 } 253} 254 255void 256FetchUnit::processFetchReturn(PacketPtr pkt) 257{ 258 ComputeUnit::SQCPort::SenderState *sender_state = 259 safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); 260 261 Wavefront *wavefront = sender_state->wavefront; 262 263 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " 264 "%d bytes, %d instructions!\n", computeUnit->cu_id, 265 wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), 266 pkt->req->getSize(), pkt->req->getSize() / 267 sizeof(TheGpuISA::RawMachInst)); 268 269 if (wavefront->dropFetch) { 270 assert(wavefront->instructionBuffer.empty()); 271 wavefront->dropFetch = false; 272 } else { 273 TheGpuISA::RawMachInst *inst_index_ptr = 274 (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>(); 275 276 assert(wavefront->instructionBuffer.size() <= 4); 277 278 for (int i = 0; i < pkt->req->getSize() / 279 sizeof(TheGpuISA::RawMachInst); ++i) { 280 GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); 281 282 assert(inst_ptr); 283 284 if (inst_ptr->instSize() == 8) { 285 /** 286 * this instruction occupies 2 consecutive 287 * entries in the instruction array, the 288 * second of which contains a nullptr. 
so if 289 * this inst is 8 bytes we advance two entries 290 * instead of 1 291 */ 292 ++i; 293 } 294 295 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", 296 computeUnit->cu_id, wavefront->simdId, 297 wavefront->wfSlotId, inst_ptr->disassemble()); 298 299 GPUDynInstPtr gpuDynInst = 300 std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr, 301 computeUnit->getAndIncSeqNum()); 302 303 wavefront->instructionBuffer.push_back(gpuDynInst); 304 } 305 } 306 307 wavefront->pendingFetch = false; 308 309 delete pkt->senderState; 310 delete pkt; 311} 312 313void 314FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list) 315{ 316 waveList = wave_list; 317} 318