// fetch_unit.cc revision 11639
1/* 2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: Brad Beckmann, Sooraj Puthoor 34 */ 35 36#include "gpu-compute/fetch_unit.hh" 37 38#include "debug/GPUFetch.hh" 39#include "debug/GPUPort.hh" 40#include "debug/GPUTLB.hh" 41#include "gpu-compute/compute_unit.hh" 42#include "gpu-compute/gpu_dyn_inst.hh" 43#include "gpu-compute/gpu_static_inst.hh" 44#include "gpu-compute/shader.hh" 45#include "gpu-compute/wavefront.hh" 46#include "mem/ruby/system/RubySystem.hh" 47 48uint32_t FetchUnit::globalFetchUnitID; 49 50FetchUnit::FetchUnit(const ComputeUnitParams* params) : 51 timingSim(true), 52 computeUnit(nullptr), 53 fetchScheduler(params), 54 waveList(nullptr) 55{ 56} 57 58FetchUnit::~FetchUnit() 59{ 60 fetchQueue.clear(); 61 fetchStatusQueue.clear(); 62} 63 64void 65FetchUnit::init(ComputeUnit *cu) 66{ 67 computeUnit = cu; 68 timingSim = computeUnit->shader->timingSim; 69 fetchQueue.clear(); 70 fetchStatusQueue.resize(computeUnit->shader->n_wf); 71 72 for (int j = 0; j < computeUnit->shader->n_wf; ++j) { 73 fetchStatusQueue[j] = std::make_pair(waveList->at(j), false); 74 } 75 76 fetchScheduler.bindList(&fetchQueue); 77} 78 79void 80FetchUnit::exec() 81{ 82 // re-evaluate waves which are marked as not ready for fetch 83 for (int j = 0; j < computeUnit->shader->n_wf; ++j) { 84 // Following code assumes 64-bit opertaion and all insts are 85 // represented by 64-bit pointers to inst objects. 
86 Wavefront *curWave = fetchStatusQueue[j].first; 87 assert (curWave); 88 89 // The wavefront has to be active, the IB occupancy has to be 90 // 4 or less instructions and it can not have any branches to 91 // prevent speculative instruction fetches 92 if (!fetchStatusQueue[j].second) { 93 if (curWave->status == Wavefront::S_RUNNING && 94 curWave->instructionBuffer.size() <= 4 && 95 !curWave->instructionBufferHasBranch() && 96 !curWave->pendingFetch) { 97 fetchQueue.push_back(curWave); 98 fetchStatusQueue[j].second = true; 99 } 100 } 101 } 102 103 // Fetch only if there is some wave ready to be fetched 104 // An empty fetchQueue will cause the schedular to panic 105 if (fetchQueue.size()) { 106 Wavefront *waveToBeFetched = fetchScheduler.chooseWave(); 107 waveToBeFetched->pendingFetch = true; 108 fetchStatusQueue[waveToBeFetched->wfSlotId].second = false; 109 initiateFetch(waveToBeFetched); 110 } 111} 112 113void 114FetchUnit::initiateFetch(Wavefront *wavefront) 115{ 116 // calculate the virtual address to fetch from the SQC 117 Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size(); 118 vaddr = wavefront->basePtr + vaddr * sizeof(GPUStaticInst*); 119 120 DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n", 121 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr); 122 123 // Since this is an instruction prefetch, if you're split then just finish 124 // out the current line. 
125 unsigned block_size = RubySystem::getBlockSizeBytes(); 126 // check for split accesses 127 Addr split_addr = roundDown(vaddr + block_size - 1, block_size); 128 unsigned size = block_size; 129 130 if (split_addr > vaddr) { 131 // misaligned access, just grab the rest of the line 132 size = split_addr - vaddr; 133 } 134 135 // set up virtual request 136 Request *req = new Request(0, vaddr, size, Request::INST_FETCH, 137 computeUnit->masterId(), 0, 0, 0); 138 139 PacketPtr pkt = new Packet(req, MemCmd::ReadReq); 140 // This fetchBlock is kind of faux right now - because the translations so 141 // far don't actually return Data 142 uint64_t fetchBlock; 143 pkt->dataStatic(&fetchBlock); 144 145 if (timingSim) { 146 // SenderState needed on Return 147 pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront); 148 149 // Sender State needed by TLB hierarchy 150 pkt->senderState = 151 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, 152 computeUnit->shader->gpuTc, 153 false, pkt->senderState); 154 155 if (computeUnit->sqcTLBPort->isStalled()) { 156 assert(computeUnit->sqcTLBPort->retries.size() > 0); 157 158 DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", 159 vaddr); 160 161 computeUnit->sqcTLBPort->retries.push_back(pkt); 162 } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) { 163 // Stall the data port; 164 // No more packet is issued till 165 // ruby indicates resources are freed by 166 // a recvReqRetry() call back on this port. 
167 computeUnit->sqcTLBPort->stallPort(); 168 169 DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n", 170 vaddr); 171 172 computeUnit->sqcTLBPort->retries.push_back(pkt); 173 } else { 174 DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr); 175 } 176 } else { 177 pkt->senderState = 178 new TheISA::GpuTLB::TranslationState(BaseTLB::Execute, 179 computeUnit->shader->gpuTc); 180 181 computeUnit->sqcTLBPort->sendFunctional(pkt); 182 183 TheISA::GpuTLB::TranslationState *sender_state = 184 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); 185 186 delete sender_state->tlbEntry; 187 delete sender_state; 188 // fetch the instructions from the SQC when we operate in 189 // functional mode only 190 fetch(pkt, wavefront); 191 } 192} 193 194void 195FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront) 196{ 197 assert(pkt->req->hasPaddr()); 198 assert(pkt->req->hasSize()); 199 200 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n", 201 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 202 pkt->req->getPaddr()); 203 204 // this is necessary because the GPU TLB receives packets instead of 205 // requests. when the translation is complete, all relevent fields in the 206 // request will be populated, but not in the packet. here we create the 207 // new packet so we can set the size, addr, and proper flags. 208 PacketPtr oldPkt = pkt; 209 pkt = new Packet(oldPkt->req, oldPkt->cmd); 210 delete oldPkt; 211 212 TheGpuISA::RawMachInst *data = 213 new TheGpuISA::RawMachInst[pkt->req->getSize() / 214 sizeof(TheGpuISA::RawMachInst)]; 215 216 pkt->dataDynamic<TheGpuISA::RawMachInst>(data); 217 218 // New SenderState for the memory access 219 pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront); 220 221 if (timingSim) { 222 // translation is done. Send the appropriate timing memory request. 
223 224 if (!computeUnit->sqcPort->sendTimingReq(pkt)) { 225 computeUnit->sqcPort->retries.push_back(std::make_pair(pkt, 226 wavefront)); 227 228 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n", 229 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 230 pkt->req->getPaddr()); 231 } else { 232 DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n", 233 computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, 234 pkt->req->getPaddr()); 235 } 236 } else { 237 computeUnit->sqcPort->sendFunctional(pkt); 238 processFetchReturn(pkt); 239 } 240} 241 242void 243FetchUnit::processFetchReturn(PacketPtr pkt) 244{ 245 ComputeUnit::SQCPort::SenderState *sender_state = 246 safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState); 247 248 Wavefront *wavefront = sender_state->wavefront; 249 250 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned " 251 "%d bytes, %d instructions!\n", computeUnit->cu_id, 252 wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(), 253 pkt->req->getSize(), pkt->req->getSize() / 254 sizeof(TheGpuISA::RawMachInst)); 255 256 if (wavefront->dropFetch) { 257 assert(wavefront->instructionBuffer.empty()); 258 wavefront->dropFetch = false; 259 } else { 260 TheGpuISA::RawMachInst *inst_index_ptr = 261 (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>(); 262 263 assert(wavefront->instructionBuffer.size() <= 4); 264 265 for (int i = 0; i < pkt->req->getSize() / 266 sizeof(TheGpuISA::RawMachInst); ++i) { 267 GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]); 268 269 assert(inst_ptr); 270 DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n", 271 computeUnit->cu_id, wavefront->simdId, 272 wavefront->wfSlotId, inst_ptr->disassemble()); 273 274 GPUDynInstPtr gpuDynInst = 275 std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr, 276 computeUnit->getAndIncSeqNum()); 277 278 wavefront->instructionBuffer.push_back(gpuDynInst); 279 } 280 } 281 282 wavefront->pendingFetch = false; 283 284 delete 
pkt->senderState; 285 delete pkt->req; 286 delete pkt; 287} 288 289void 290FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list) 291{ 292 waveList = wave_list; 293} 294