// lds_state.cc revision 11692
1/* 2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: John Kalamatianos, Joe Gross 34 */ 35 36#include "gpu-compute/lds_state.hh" 37 38#include <array> 39#include <cstdio> 40#include <cstdlib> 41 42#include "gpu-compute/compute_unit.hh" 43#include "gpu-compute/gpu_dyn_inst.hh" 44#include "gpu-compute/shader.hh" 45 46/** 47 * the default constructor that works with SWIG 48 */ 49LdsState::LdsState(const Params *params) : 50 MemObject(params), 51 tickEvent(this), 52 cuPort(name() + ".port", this), 53 maximumSize(params->size), 54 range(params->range), 55 bankConflictPenalty(params->bankConflictPenalty), 56 banks(params->banks) 57{ 58 fatal_if(params->banks <= 0, 59 "Number of LDS banks should be positive number"); 60 fatal_if((params->banks & (params->banks - 1)) != 0, 61 "Number of LDS banks should be a power of 2"); 62 fatal_if(params->size <= 0, 63 "cannot allocate an LDS with a size less than 1"); 64 fatal_if(params->size % 2, 65 "the LDS should be an even number"); 66} 67 68/** 69 * Needed by the SWIG compiler 70 */ 71LdsState * 72LdsStateParams::create() 73{ 74 return new LdsState(this); 75} 76 77/** 78 * set the parent and name based on the parent 79 */ 80void 81LdsState::setParent(ComputeUnit *x_parent) 82{ 83 // check that this gets assigned to the same thing each time 84 fatal_if(!x_parent, "x_parent should not be nullptr"); 85 fatal_if(x_parent == parent, 86 "should not be setting the parent twice"); 87 88 parent = x_parent; 89 _name = x_parent->name() + ".LdsState"; 90} 91 92/** 93 * derive the gpu mem packet from the packet and then count the bank conflicts 94 */ 95unsigned 96LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses) 97{ 98 Packet::SenderState *baseSenderState = packet->senderState; 99 while (baseSenderState->predecessor) { 100 baseSenderState = baseSenderState->predecessor; 101 } 102 const ComputeUnit::LDSPort::SenderState *senderState = 103 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState); 104 105 fatal_if(!senderState, 106 "did not get the 
right sort of sender state"); 107 108 GPUDynInstPtr gpuDynInst = senderState->getMemInst(); 109 110 return countBankConflicts(gpuDynInst, bankAccesses); 111} 112 113// Count the total number of bank conflicts for the local memory packet 114unsigned 115LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, 116 unsigned *numBankAccesses) 117{ 118 int bank_conflicts = 0; 119 std::vector<int> bank; 120 // the number of LDS banks being touched by the memory instruction 121 int numBanks = std::min(parent->wfSize(), banks); 122 // if the wavefront size is larger than the number of LDS banks, we 123 // need to iterate over all work items to calculate the total 124 // number of bank conflicts 125 int groups = (parent->wfSize() > numBanks) ? 126 (parent->wfSize() / numBanks) : 1; 127 for (int i = 0; i < groups; i++) { 128 // Address Array holding all the work item addresses of an instruction 129 std::vector<Addr> addr_array; 130 addr_array.resize(numBanks, 0); 131 bank.clear(); 132 bank.resize(banks, 0); 133 int max_bank = 0; 134 135 // populate the address array for all active work items 136 for (int j = 0; j < numBanks; j++) { 137 if (gpuDynInst->exec_mask[(i*numBanks)+j]) { 138 addr_array[j] = gpuDynInst->addr[(i*numBanks)+j]; 139 } else { 140 addr_array[j] = std::numeric_limits<Addr>::max(); 141 } 142 } 143 144 if (gpuDynInst->isLoad() || gpuDynInst->isStore()) { 145 // mask identical addresses 146 for (int j = 0; j < numBanks; ++j) { 147 for (int j0 = 0; j0 < j; j0++) { 148 if (addr_array[j] != std::numeric_limits<Addr>::max() 149 && addr_array[j] == addr_array[j0]) { 150 addr_array[j] = std::numeric_limits<Addr>::max(); 151 } 152 } 153 } 154 } 155 // calculate bank conflicts 156 for (int j = 0; j < numBanks; ++j) { 157 if (addr_array[j] != std::numeric_limits<Addr>::max()) { 158 int bankId = addr_array[j] % banks; 159 bank[bankId]++; 160 max_bank = std::max(max_bank, bank[bankId]); 161 // Count the number of LDS banks accessed. 
162 // Since we have masked identical addresses all remaining 163 // accesses will need to be serialized if they access 164 // the same bank (bank conflict). 165 (*numBankAccesses)++; 166 } 167 } 168 bank_conflicts += max_bank; 169 } 170 panic_if(bank_conflicts > parent->wfSize(), 171 "Max bank conflicts should match num of work items per instr"); 172 return bank_conflicts; 173} 174 175/** 176 * receive the packet from the CU 177 */ 178bool 179LdsState::CuSidePort::recvTimingReq(PacketPtr packet) 180{ 181 return ownerLds->processPacket(packet); 182} 183 184GPUDynInstPtr 185LdsState::getDynInstr(PacketPtr packet) 186{ 187 ComputeUnit::LDSPort::SenderState *ss = 188 dynamic_cast<ComputeUnit::LDSPort::SenderState *>( 189 packet->senderState); 190 return ss->getMemInst(); 191} 192 193/** 194 * process an incoming packet, add it to the return queue 195 */ 196bool 197LdsState::processPacket(PacketPtr packet) 198{ 199 unsigned bankAccesses = 0; 200 // the number of conflicts this packet will have when accessing the LDS 201 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); 202 // count the total number of physical LDS bank accessed 203 parent->ldsBankAccesses += bankAccesses; 204 // count the LDS bank conflicts. A number set to 1 indicates one 205 // access per bank maximum so there are no bank conflicts 206 parent->ldsBankConflictDist.sample(bankConflicts-1); 207 208 GPUDynInstPtr dynInst = getDynInstr(packet); 209 // account for the LDS bank conflict overhead 210 int busLength = (dynInst->isLoad()) ? parent->loadBusLength() : 211 (dynInst->isStore()) ? 
parent->storeBusLength() : 212 parent->loadBusLength(); 213 // delay for accessing the LDS 214 Tick processingTime = 215 parent->shader->ticks(bankConflicts * bankConflictPenalty) + 216 parent->shader->ticks(busLength); 217 // choose (delay + last packet in queue) or (now + delay) as the time to 218 // return this 219 Tick doneAt = earliestReturnTime() + processingTime; 220 // then store it for processing 221 return returnQueuePush(std::make_pair(doneAt, packet)); 222} 223 224/** 225 * add this to the queue of packets to be returned 226 */ 227bool 228LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) 229{ 230 // TODO add time limits (e.g. one packet per cycle) and queue size limits 231 // and implement flow control 232 returnQueue.push(thePair); 233 234 // if there is no set wakeup time, look through the queue 235 if (!tickEvent.scheduled()) { 236 process(); 237 } 238 239 return true; 240} 241 242/** 243 * receive a packet in functional mode 244 */ 245void 246LdsState::CuSidePort::recvFunctional(PacketPtr pkt) 247{ 248 fatal("not implemented"); 249} 250 251/** 252 * receive a retry for a response 253 */ 254void 255LdsState::CuSidePort::recvRespRetry() 256{ 257 // TODO verify that this is the right way to do this 258 assert(ownerLds->isRetryResp()); 259 ownerLds->setRetryResp(false); 260 ownerLds->process(); 261} 262 263/** 264 * receive a retry 265 */ 266void 267LdsState::CuSidePort::recvRetry() 268{ 269 fatal("not implemented"); 270} 271 272/** 273 * look for packets to return at this time 274 */ 275bool 276LdsState::process() 277{ 278 Tick now = clockEdge(); 279 280 // send back completed packets 281 while (!returnQueue.empty() && returnQueue.front().first <= now) { 282 PacketPtr packet = returnQueue.front().second; 283 284 ComputeUnit::LDSPort::SenderState *ss = 285 dynamic_cast<ComputeUnit::LDSPort::SenderState *>( 286 packet->senderState); 287 288 GPUDynInstPtr gpuDynInst = ss->getMemInst(); 289 290 gpuDynInst->initiateAcc(gpuDynInst); 291 292 
packet->makeTimingResponse(); 293 294 returnQueue.pop(); 295 296 bool success = cuPort.sendTimingResp(packet); 297 298 if (!success) { 299 retryResp = true; 300 panic("have not handled timing responses being NACK'd when sent" 301 "back"); 302 } 303 } 304 305 // determine the next wakeup time 306 if (!returnQueue.empty()) { 307 308 Tick next = returnQueue.front().first; 309 310 if (tickEvent.scheduled()) { 311 312 if (next < tickEvent.when()) { 313 314 tickEvent.deschedule(); 315 tickEvent.schedule(next); 316 } 317 } else { 318 tickEvent.schedule(next); 319 } 320 } 321 322 return true; 323} 324 325/** 326 * wake up at this time and perform specified actions 327 */ 328void 329LdsState::TickEvent::process() 330{ 331 ldsState->process(); 332} 333