// fetch_unit.cc revision 11697
1/* 2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: Brad Beckmann, Sooraj Puthoor 34 */ 35 36#include "gpu-compute/fetch_unit.hh" 37 38#include "debug/GPUFetch.hh" 39#include "debug/GPUPort.hh" 40#include "debug/GPUTLB.hh" 41#include "gpu-compute/compute_unit.hh" 42#include "gpu-compute/gpu_dyn_inst.hh" 43#include "gpu-compute/gpu_static_inst.hh" 44#include "gpu-compute/shader.hh" 45#include "gpu-compute/wavefront.hh" 46#include "mem/ruby/system/RubySystem.hh" 47 48uint32_t FetchUnit::globalFetchUnitID; 49 50FetchUnit::FetchUnit(const ComputeUnitParams* params) : 51 timingSim(true), 52 computeUnit(nullptr), 53 fetchScheduler(params), 54 waveList(nullptr) 55{ 56} 57 58FetchUnit::~FetchUnit() 59{ 60 fetchQueue.clear(); 61 fetchStatusQueue.clear(); 62} 63 64void 65FetchUnit::init(ComputeUnit *cu) 66{ 67 computeUnit = cu; 68 timingSim = computeUnit->shader->timingSim; 69 fetchQueue.clear(); 70 fetchStatusQueue.resize(computeUnit->shader->n_wf); 71 72 for (int j = 0; j < computeUnit->shader->n_wf; ++j) { 73 fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); 74 } 75 76 fetchScheduler.bindList(&fetchQueue); 77} 78 79void 80FetchUnit::exec() 81{ 82 // re-evaluate waves which are marked as not ready for fetch 83 for (int j = 0; j < computeUnit->shader->n_wf; ++j) { 84 // Following code assumes 64-bit opertaion and all insts are 85 // represented by 64-bit pointers to inst objects. 
86 Wavefront *curWave = fetchStatusQueue[j].first; 87 assert (curWave); 88 89 // The wavefront has to be active, the IB occupancy has to be 90 // 4 or less instructions and it can not have any branches to 91 // prevent speculative instruction fetches 92 if (!fetchStatusQueue[j].second) { 93 if (curWave->status == Wavefront::S_RUNNING && 94 curWave->instructionBuffer.size() <= 4 && 95 !curWave->instructionBufferHasBranch() && 96 !curWave->pendingFetch) { 97 fetchQueue.push_back(curWave); 98 fetchStatusQueue[j].second = true; 99 } 100 } 101 } 102 103 // Fetch only if there is some wave ready to be fetched 104 // An empty fetchQueue will cause the schedular to panic 105 if (fetchQueue.size()) { 106 Wavefront *waveToBeFetched = fetchScheduler.chooseWave(); 107 waveToBeFetched->pendingFetch = true; 108 fetchStatusQueue[waveToBeFetched->wfSlotId].second = false; 109 initiateFetch(waveToBeFetched); 110 } 111} 112 113void 114FetchUnit::initiateFetch(Wavefront *wavefront) 115{ 116 // calculate the virtual address to fetch from the SQC 117 Addr vaddr = wavefront->pc(); 118 119 /** 120 * the instruction buffer holds one instruction per entry, regardless 121 * of the underlying instruction's size. the PC, however, addresses 122 * instrutions on a 32b granularity so we must account for that here. 123 */ 124 for (int i = 0; i < wavefront->instructionBuffer.size(); ++i) { 125 vaddr += 126 wavefront->instructionBuffer.at(i)->staticInstruction()->instSize(); 127 } 128 vaddr = wavefront->basePtr + vaddr; 129 130 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", 131 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); 132 133 // Since this is an instruction prefetch, if you're split then just finish 134 // out the current line. 
135 unsigned block_size = RubySystem::getBlockSizeBytes(); 136 // check for split accesses 137 Addr split_addr = roundDown(vaddr + block_size - 1, block_size); 138 unsigned size = block_size; 139 140 if (split_addr > vaddr) { 141 // misaligned access, just grab the rest of the line 142 size = split_addr - vaddr; 143 } 144 145 // set up virtual request 146 Request *req = new Request(0, vaddr, size, Request::INST_FETCH, 147 computeUnit->masterId(), 0, 0, 0); 148 149 PacketPtr pkt = new Packet(req, MemCmd::ReadReq); 150 // This fetchBlock is kind of faux right now - because the translations so 151 // far don't actually return Data 152 uint64_t fetchBlock; 153 pkt->dataStatic(&fetchBlock); 154 155 if (timingSim) { 156 // SenderState needed on Return 157 pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); 158 159 // Sender State needed by TLB hierarchy 160 pkt->senderState = 161 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, 162 computeUnit->shader->gpuTc, 163 false, pkt->senderState); 164 165 if (computeUnit->sqcTLBPort->isStalled()) { 166 assert(computeUnit->sqcTLBPort->retries.size() > 0); 167 168 DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", 169 vaddr); 170 171 computeUnit->sqcTLBPort->retries.push_back(pkt); 172 } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { 173 // Stall the data port; 174 // No more packet is issued till 175 // ruby indicates resources are freed by 176 // a recvReqRetry() call back on this port. 
177 computeUnit->sqcTLBPort->stallPort(); 178 179 DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", 180 vaddr); 181 182 computeUnit->sqcTLBPort->retries.push_back(pkt); 183 } else { 184 DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); 185 } 186 } else { 187 pkt->senderState = 188 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, 189 computeUnit->shader->gpuTc); 190 191 computeUnit->sqcTLBPort->sendFunctional(pkt); 192 193 TheISA::GpuTLB::TranslationState *sender_state = 194 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 195 196 delete sender_state->tlbEntry; 197 delete sender_state; 198 // fetch the instructions from the SQC when we operate in 199 // functional mode only 200 fetch(pkt, wavefront); 201 } 202} 203 204void 205FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) 206{ 207 assert(pkt->req->hasPaddr()); 208 assert(pkt->req->hasSize()); 209 210 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", 211 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 212 pkt->req->getPaddr()); 213 214 // this is necessary because the GPU TLB receives packets instead of 215 // requests. when the translation is complete, all relevent fields in the 216 // request will be populated, but not in the packet. here we create the 217 // new packet so we can set the size, addr, and proper flags. 218 PacketPtr oldPkt = pkt; 219 pkt = new Packet(oldPkt->req, oldPkt->cmd); 220 delete oldPkt; 221 222 TheGpuISA::RawMachInst *data = 223 new TheGpuISA::RawMachInst[pkt->req->getSize() / 224 sizeof(TheGpuISA::RawMachInst)]; 225 226 pkt->dataDynamic<TheGpuISA::RawMachInst>(data); 227 228 // New SenderState for the memory access 229 pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); 230 231 if (timingSim) { 232 // translation is done. Send the appropriate timing memory request. 
233 234 if (!computeUnit->sqcPort->sendTimingReq(pkt)) { 235 computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, 236 wavefront)); 237 238 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", 239 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 240 pkt->req->getPaddr()); 241 } else { 242 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", 243 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 244 pkt->req->getPaddr()); 245 } 246 } else { 247 computeUnit->sqcPort->sendFunctional(pkt); 248 processFetchReturn(pkt); 249 } 250} 251 252void 253FetchUnit::processFetchReturn(PacketPtr pkt) 254{ 255 ComputeUnit::SQCPort::SenderState *sender_state = 256 safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); 257 258 Wavefront *wavefront = sender_state->wavefront; 259 260 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " 261 "%d bytes, %d instructions!\n", computeUnit->cu_id, 262 wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), 263 pkt->req->getSize(), pkt->req->getSize() / 264 sizeof(TheGpuISA::RawMachInst)); 265 266 if (wavefront->dropFetch) { 267 assert(wavefront->instructionBuffer.empty()); 268 wavefront->dropFetch = false; 269 } else { 270 TheGpuISA::RawMachInst *inst_index_ptr = 271 (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>(); 272 273 assert(wavefront->instructionBuffer.size() <= 4); 274 275 for (int i = 0; i < pkt->req->getSize() / 276 sizeof(TheGpuISA::RawMachInst); ++i) { 277 GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); 278 279 assert(inst_ptr); 280 281 if (inst_ptr->instSize() == 8) { 282 /** 283 * this instruction occupies 2 consecutive 284 * entries in the instruction array, the 285 * second of which contains a nullptr. 
so if 286 * this inst is 8 bytes we advance two entries 287 * instead of 1 288 */ 289 ++i; 290 } 291 292 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", 293 computeUnit->cu_id, wavefront->simdId, 294 wavefront->wfSlotId, inst_ptr->disassemble()); 295 296 GPUDynInstPtr gpuDynInst = 297 std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr, 298 computeUnit->getAndIncSeqNum()); 299 300 wavefront->instructionBuffer.push_back(gpuDynInst); 301 } 302 } 303 304 wavefront->pendingFetch = false; 305 306 delete pkt->senderState; 307 delete pkt->req; 308 delete pkt; 309} 310 311void 312FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list) 313{ 314 waveList = wave_list; 315} 316