Cross Reference: /gem5/src/gpu-compute/lds

Deleted Added

sdiff udiff text old ( 12697:cd71b966be1e ) new ( 13892:0182a0601f66 )

full compact

lds_state.cc (12697:cd71b966be1e)	lds_state.cc (13892:0182a0601f66)
1/* 2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its 18 * contributors may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Authors: John Kalamatianos, 34 * Joe Gross 35 / 36 37#include "gpu-compute/lds_state.hh" 38 39#include <array> 40#include <cstdio> 41#include <cstdlib> 42 43#include "gpu-compute/compute_unit.hh" 44#include "gpu-compute/gpu_dyn_inst.hh" 45#include "gpu-compute/shader.hh" 46 47/* 48 * the default constructor that works with SWIG 49 / 50LdsState::LdsState(const Params params) :	1/* 2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its 18 * contributors may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Authors: John Kalamatianos, 34 * Joe Gross 35 / 36 37#include "gpu-compute/lds_state.hh" 38 39#include <array> 40#include <cstdio> 41#include <cstdlib> 42 43#include "gpu-compute/compute_unit.hh" 44#include "gpu-compute/gpu_dyn_inst.hh" 45#include "gpu-compute/shader.hh" 46 47/* 48 * the default constructor that works with SWIG 49 / 50LdsState::LdsState(const Params params) :
51 MemObject(params),	51 ClockedObject(params),
52 tickEvent(this), 53 cuPort(name() + ".port", this), 54 maximumSize(params->size), 55 range(params->range), 56 bankConflictPenalty(params->bankConflictPenalty), 57 banks(params->banks) 58{ 59 fatal_if(params->banks <= 0, 60 "Number of LDS banks should be positive number"); 61 fatal_if((params->banks & (params->banks - 1)) != 0, 62 "Number of LDS banks should be a power of 2"); 63 fatal_if(params->size <= 0, 64 "cannot allocate an LDS with a size less than 1"); 65 fatal_if(params->size % 2, 66 "the LDS should be an even number"); 67} 68 69/** 70 * Needed by the SWIG compiler 71 / 72LdsState 73LdsStateParams::create() 74{ 75 return new LdsState(this); 76} 77 78/** 79 * set the parent and name based on the parent 80 / 81void 82LdsState::setParent(ComputeUnit x_parent) 83{ 84 // check that this gets assigned to the same thing each time 85 fatal_if(!x_parent, "x_parent should not be nullptr"); 86 fatal_if(x_parent == parent, 87 "should not be setting the parent twice"); 88 89 parent = x_parent; 90 _name = x_parent->name() + ".LdsState"; 91} 92 93/** 94 * derive the gpu mem packet from the packet and then count the bank conflicts 95 / 96unsigned 97LdsState::countBankConflicts(PacketPtr packet, unsigned bankAccesses) 98{ 99 Packet::SenderState baseSenderState = packet->senderState; 100* while (baseSenderState->predecessor) { 101 baseSenderState = baseSenderState->predecessor; 102 } 103 const ComputeUnit::LDSPort::SenderState senderState = 104* dynamic_cast<ComputeUnit::LDSPort::SenderState >(baseSenderState); 105* 106 fatal_if(!senderState, 107 "did not get the right sort of sender state"); 108 109 GPUDynInstPtr gpuDynInst = senderState->getMemInst(); 110 111 return countBankConflicts(gpuDynInst, bankAccesses); 112} 113 114// Count the total number of bank conflicts for the local memory packet 115unsigned 116LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, 117 unsigned numBankAccesses) 118{ 119* int bank_conflicts = 0; 120 std::vector<int> bank; 121 // the number of LDS banks being touched by the memory instruction 122 int numBanks = std::min(parent->wfSize(), banks); 123 // if the wavefront size is larger than the number of LDS banks, we 124 // need to iterate over all work items to calculate the total 125 // number of bank conflicts 126 int groups = (parent->wfSize() > numBanks) ? 127 (parent->wfSize() / numBanks) : 1; 128 for (int i = 0; i < groups; i++) { 129 // Address Array holding all the work item addresses of an instruction 130 std::vector<Addr> addr_array; 131 addr_array.resize(numBanks, 0); 132 bank.clear(); 133 bank.resize(banks, 0); 134 int max_bank = 0; 135 136 // populate the address array for all active work items 137 for (int j = 0; j < numBanks; j++) { 138 if (gpuDynInst->exec_mask[(inumBanks)+j]) { 139* addr_array[j] = gpuDynInst->addr[(inumBanks)+j]; 140* } else { 141 addr_array[j] = std::numeric_limits<Addr>::max(); 142 } 143 } 144 145 if (gpuDynInst->isLoad() \|\| gpuDynInst->isStore()) { 146 // mask identical addresses 147 for (int j = 0; j < numBanks; ++j) { 148 for (int j0 = 0; j0 < j; j0++) { 149 if (addr_array[j] != std::numeric_limits<Addr>::max() 150 && addr_array[j] == addr_array[j0]) { 151 addr_array[j] = std::numeric_limits<Addr>::max(); 152 } 153 } 154 } 155 } 156 // calculate bank conflicts 157 for (int j = 0; j < numBanks; ++j) { 158 if (addr_array[j] != std::numeric_limits<Addr>::max()) { 159 int bankId = addr_array[j] % banks; 160 bank[bankId]++; 161 max_bank = std::max(max_bank, bank[bankId]); 162 // Count the number of LDS banks accessed. 163 // Since we have masked identical addresses all remaining 164 // accesses will need to be serialized if they access 165 // the same bank (bank conflict). 166 (numBankAccesses)++; 167* } 168 } 169 bank_conflicts += max_bank; 170 } 171 panic_if(bank_conflicts > parent->wfSize(), 172 "Max bank conflicts should match num of work items per instr"); 173 return bank_conflicts; 174} 175 176/** 177 * receive the packet from the CU 178 / 179bool 180LdsState::CuSidePort::recvTimingReq(PacketPtr packet) 181{ 182* return ownerLds->processPacket(packet); 183} 184 185GPUDynInstPtr 186LdsState::getDynInstr(PacketPtr packet) 187{ 188 ComputeUnit::LDSPort::SenderState ss = 189* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 190* packet->senderState); 191 return ss->getMemInst(); 192} 193 194/** 195 * process an incoming packet, add it to the return queue 196 / 197bool 198LdsState::processPacket(PacketPtr packet) 199{ 200* unsigned bankAccesses = 0; 201 // the number of conflicts this packet will have when accessing the LDS 202 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); 203 // count the total number of physical LDS bank accessed 204 parent->ldsBankAccesses += bankAccesses; 205 // count the LDS bank conflicts. A number set to 1 indicates one 206 // access per bank maximum so there are no bank conflicts 207 parent->ldsBankConflictDist.sample(bankConflicts-1); 208 209 GPUDynInstPtr dynInst = getDynInstr(packet); 210 // account for the LDS bank conflict overhead 211 int busLength = (dynInst->isLoad()) ? parent->loadBusLength() : 212 (dynInst->isStore()) ? parent->storeBusLength() : 213 parent->loadBusLength(); 214 // delay for accessing the LDS 215 Tick processingTime = 216 parent->shader->ticks(bankConflicts * bankConflictPenalty) + 217 parent->shader->ticks(busLength); 218 // choose (delay + last packet in queue) or (now + delay) as the time to 219 // return this 220 Tick doneAt = earliestReturnTime() + processingTime; 221 // then store it for processing 222 return returnQueuePush(std::make_pair(doneAt, packet)); 223} 224 225/** 226 * add this to the queue of packets to be returned 227 / 228bool 229LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) 230{ 231* // TODO add time limits (e.g. one packet per cycle) and queue size limits 232 // and implement flow control 233 returnQueue.push(thePair); 234 235 // if there is no set wakeup time, look through the queue 236 if (!tickEvent.scheduled()) { 237 process(); 238 } 239 240 return true; 241} 242 243/** 244 * receive a packet in functional mode 245 / 246void 247LdsState::CuSidePort::recvFunctional(PacketPtr pkt) 248{ 249* fatal("not implemented"); 250} 251 252/** 253 * receive a retry for a response 254 / 255void 256LdsState::CuSidePort::recvRespRetry() 257{ 258* // TODO verify that this is the right way to do this 259 assert(ownerLds->isRetryResp()); 260 ownerLds->setRetryResp(false); 261 ownerLds->process(); 262} 263 264/** 265 * receive a retry 266 / 267void 268LdsState::CuSidePort::recvRetry() 269{ 270* fatal("not implemented"); 271} 272 273/** 274 * look for packets to return at this time 275 / 276bool 277LdsState::process() 278{ 279* Tick now = clockEdge(); 280 281 // send back completed packets 282 while (!returnQueue.empty() && returnQueue.front().first <= now) { 283 PacketPtr packet = returnQueue.front().second; 284 285 ComputeUnit::LDSPort::SenderState ss = 286* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 287* packet->senderState); 288 289 GPUDynInstPtr gpuDynInst = ss->getMemInst(); 290 291 gpuDynInst->initiateAcc(gpuDynInst); 292 293 packet->makeTimingResponse(); 294 295 returnQueue.pop(); 296 297 bool success = cuPort.sendTimingResp(packet); 298 299 if (!success) { 300 retryResp = true; 301 panic("have not handled timing responses being NACK'd when sent" 302 "back"); 303 } 304 } 305 306 // determine the next wakeup time 307 if (!returnQueue.empty()) { 308 309 Tick next = returnQueue.front().first; 310 311 if (tickEvent.scheduled()) { 312 313 if (next < tickEvent.when()) { 314 315 tickEvent.deschedule(); 316 tickEvent.schedule(next); 317 } 318 } else { 319 tickEvent.schedule(next); 320 } 321 } 322 323 return true; 324} 325 326/** 327 * wake up at this time and perform specified actions 328 / 329void 330LdsState::TickEvent::process() 331{ 332* ldsState->process(); 333}	52 tickEvent(this), 53 cuPort(name() + ".port", this), 54 maximumSize(params->size), 55 range(params->range), 56 bankConflictPenalty(params->bankConflictPenalty), 57 banks(params->banks) 58{ 59 fatal_if(params->banks <= 0, 60 "Number of LDS banks should be positive number"); 61 fatal_if((params->banks & (params->banks - 1)) != 0, 62 "Number of LDS banks should be a power of 2"); 63 fatal_if(params->size <= 0, 64 "cannot allocate an LDS with a size less than 1"); 65 fatal_if(params->size % 2, 66 "the LDS should be an even number"); 67} 68 69/** 70 * Needed by the SWIG compiler 71 / 72LdsState 73LdsStateParams::create() 74{ 75 return new LdsState(this); 76} 77 78/** 79 * set the parent and name based on the parent 80 / 81void 82LdsState::setParent(ComputeUnit x_parent) 83{ 84 // check that this gets assigned to the same thing each time 85 fatal_if(!x_parent, "x_parent should not be nullptr"); 86 fatal_if(x_parent == parent, 87 "should not be setting the parent twice"); 88 89 parent = x_parent; 90 _name = x_parent->name() + ".LdsState"; 91} 92 93/** 94 * derive the gpu mem packet from the packet and then count the bank conflicts 95 / 96unsigned 97LdsState::countBankConflicts(PacketPtr packet, unsigned bankAccesses) 98{ 99 Packet::SenderState baseSenderState = packet->senderState; 100* while (baseSenderState->predecessor) { 101 baseSenderState = baseSenderState->predecessor; 102 } 103 const ComputeUnit::LDSPort::SenderState senderState = 104* dynamic_cast<ComputeUnit::LDSPort::SenderState >(baseSenderState); 105* 106 fatal_if(!senderState, 107 "did not get the right sort of sender state"); 108 109 GPUDynInstPtr gpuDynInst = senderState->getMemInst(); 110 111 return countBankConflicts(gpuDynInst, bankAccesses); 112} 113 114// Count the total number of bank conflicts for the local memory packet 115unsigned 116LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, 117 unsigned numBankAccesses) 118{ 119* int bank_conflicts = 0; 120 std::vector<int> bank; 121 // the number of LDS banks being touched by the memory instruction 122 int numBanks = std::min(parent->wfSize(), banks); 123 // if the wavefront size is larger than the number of LDS banks, we 124 // need to iterate over all work items to calculate the total 125 // number of bank conflicts 126 int groups = (parent->wfSize() > numBanks) ? 127 (parent->wfSize() / numBanks) : 1; 128 for (int i = 0; i < groups; i++) { 129 // Address Array holding all the work item addresses of an instruction 130 std::vector<Addr> addr_array; 131 addr_array.resize(numBanks, 0); 132 bank.clear(); 133 bank.resize(banks, 0); 134 int max_bank = 0; 135 136 // populate the address array for all active work items 137 for (int j = 0; j < numBanks; j++) { 138 if (gpuDynInst->exec_mask[(inumBanks)+j]) { 139* addr_array[j] = gpuDynInst->addr[(inumBanks)+j]; 140* } else { 141 addr_array[j] = std::numeric_limits<Addr>::max(); 142 } 143 } 144 145 if (gpuDynInst->isLoad() \|\| gpuDynInst->isStore()) { 146 // mask identical addresses 147 for (int j = 0; j < numBanks; ++j) { 148 for (int j0 = 0; j0 < j; j0++) { 149 if (addr_array[j] != std::numeric_limits<Addr>::max() 150 && addr_array[j] == addr_array[j0]) { 151 addr_array[j] = std::numeric_limits<Addr>::max(); 152 } 153 } 154 } 155 } 156 // calculate bank conflicts 157 for (int j = 0; j < numBanks; ++j) { 158 if (addr_array[j] != std::numeric_limits<Addr>::max()) { 159 int bankId = addr_array[j] % banks; 160 bank[bankId]++; 161 max_bank = std::max(max_bank, bank[bankId]); 162 // Count the number of LDS banks accessed. 163 // Since we have masked identical addresses all remaining 164 // accesses will need to be serialized if they access 165 // the same bank (bank conflict). 166 (numBankAccesses)++; 167* } 168 } 169 bank_conflicts += max_bank; 170 } 171 panic_if(bank_conflicts > parent->wfSize(), 172 "Max bank conflicts should match num of work items per instr"); 173 return bank_conflicts; 174} 175 176/** 177 * receive the packet from the CU 178 / 179bool 180LdsState::CuSidePort::recvTimingReq(PacketPtr packet) 181{ 182* return ownerLds->processPacket(packet); 183} 184 185GPUDynInstPtr 186LdsState::getDynInstr(PacketPtr packet) 187{ 188 ComputeUnit::LDSPort::SenderState ss = 189* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 190* packet->senderState); 191 return ss->getMemInst(); 192} 193 194/** 195 * process an incoming packet, add it to the return queue 196 / 197bool 198LdsState::processPacket(PacketPtr packet) 199{ 200* unsigned bankAccesses = 0; 201 // the number of conflicts this packet will have when accessing the LDS 202 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); 203 // count the total number of physical LDS bank accessed 204 parent->ldsBankAccesses += bankAccesses; 205 // count the LDS bank conflicts. A number set to 1 indicates one 206 // access per bank maximum so there are no bank conflicts 207 parent->ldsBankConflictDist.sample(bankConflicts-1); 208 209 GPUDynInstPtr dynInst = getDynInstr(packet); 210 // account for the LDS bank conflict overhead 211 int busLength = (dynInst->isLoad()) ? parent->loadBusLength() : 212 (dynInst->isStore()) ? parent->storeBusLength() : 213 parent->loadBusLength(); 214 // delay for accessing the LDS 215 Tick processingTime = 216 parent->shader->ticks(bankConflicts * bankConflictPenalty) + 217 parent->shader->ticks(busLength); 218 // choose (delay + last packet in queue) or (now + delay) as the time to 219 // return this 220 Tick doneAt = earliestReturnTime() + processingTime; 221 // then store it for processing 222 return returnQueuePush(std::make_pair(doneAt, packet)); 223} 224 225/** 226 * add this to the queue of packets to be returned 227 / 228bool 229LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) 230{ 231* // TODO add time limits (e.g. one packet per cycle) and queue size limits 232 // and implement flow control 233 returnQueue.push(thePair); 234 235 // if there is no set wakeup time, look through the queue 236 if (!tickEvent.scheduled()) { 237 process(); 238 } 239 240 return true; 241} 242 243/** 244 * receive a packet in functional mode 245 / 246void 247LdsState::CuSidePort::recvFunctional(PacketPtr pkt) 248{ 249* fatal("not implemented"); 250} 251 252/** 253 * receive a retry for a response 254 / 255void 256LdsState::CuSidePort::recvRespRetry() 257{ 258* // TODO verify that this is the right way to do this 259 assert(ownerLds->isRetryResp()); 260 ownerLds->setRetryResp(false); 261 ownerLds->process(); 262} 263 264/** 265 * receive a retry 266 / 267void 268LdsState::CuSidePort::recvRetry() 269{ 270* fatal("not implemented"); 271} 272 273/** 274 * look for packets to return at this time 275 / 276bool 277LdsState::process() 278{ 279* Tick now = clockEdge(); 280 281 // send back completed packets 282 while (!returnQueue.empty() && returnQueue.front().first <= now) { 283 PacketPtr packet = returnQueue.front().second; 284 285 ComputeUnit::LDSPort::SenderState ss = 286* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 287* packet->senderState); 288 289 GPUDynInstPtr gpuDynInst = ss->getMemInst(); 290 291 gpuDynInst->initiateAcc(gpuDynInst); 292 293 packet->makeTimingResponse(); 294 295 returnQueue.pop(); 296 297 bool success = cuPort.sendTimingResp(packet); 298 299 if (!success) { 300 retryResp = true; 301 panic("have not handled timing responses being NACK'd when sent" 302 "back"); 303 } 304 } 305 306 // determine the next wakeup time 307 if (!returnQueue.empty()) { 308 309 Tick next = returnQueue.front().first; 310 311 if (tickEvent.scheduled()) { 312 313 if (next < tickEvent.when()) { 314 315 tickEvent.deschedule(); 316 tickEvent.schedule(next); 317 } 318 } else { 319 tickEvent.schedule(next); 320 } 321 } 322 323 return true; 324} 325 326/** 327 * wake up at this time and perform specified actions 328 / 329void 330LdsState::TickEvent::process() 331{ 332* ldsState->process(); 333}