Cross Reference: /gem5/src/gpu-compute/lds

Deleted Added

sdiff udiff text old ( 11308:7d8836fd043d ) new ( 11523:81332eb10367 )

full compact

lds_state.cc (11308:7d8836fd043d)	lds_state.cc (11523:81332eb10367)
1/* 2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: John Kalamatianos, Joe Gross 34 / 35 36#include "gpu-compute/lds_state.hh" 37 38#include <array> 39#include <cstdio> 40#include <cstdlib> 41 42#include "gpu-compute/compute_unit.hh" 43#include "gpu-compute/gpu_dyn_inst.hh" 44#include "gpu-compute/shader.hh" 45 46/* 47 * the default constructor that works with SWIG 48 / 49LdsState::LdsState(const Params params) : 50 MemObject(params), 51 tickEvent(this), 52 cuPort(name() + ".port", this), 53 maximumSize(params->size), 54 range(params->range), 55 bankConflictPenalty(params->bankConflictPenalty), 56 banks(params->banks) 57{ 58 fatal_if(params->banks <= 0, 59 "Number of LDS banks should be positive number"); 60 fatal_if((params->banks & (params->banks - 1)) != 0, 61 "Number of LDS banks should be a power of 2"); 62 fatal_if(params->size <= 0, 63 "cannot allocate an LDS with a size less than 1"); 64 fatal_if(params->size % 2, 65 "the LDS should be an even number"); 66} 67 68/** 69 * Needed by the SWIG compiler 70 / 71LdsState 72LdsStateParams::create() 73{ 74 return new LdsState(this); 75} 76 77/** 78 * set the parent and name based on the parent 79 / 80void 81LdsState::setParent(ComputeUnit x_parent) 82{ 83 // check that this gets assigned to the same thing each time 84 fatal_if(!x_parent, "x_parent should not be nullptr"); 85 fatal_if(x_parent == parent, 86 "should not be setting the parent twice"); 87 88 parent = x_parent; 89 _name = x_parent->name() + ".LdsState"; 90} 91 92/** 93 * derive the gpu mem packet from the packet and then count the bank conflicts 94 / 95unsigned 96LdsState::countBankConflicts(PacketPtr packet, unsigned bankAccesses) 97{ 98 Packet::SenderState baseSenderState = packet->senderState; 99 while (baseSenderState->predecessor) { 100* baseSenderState = baseSenderState->predecessor; 101 } 102 const ComputeUnit::LDSPort::SenderState senderState = 103* dynamic_cast<ComputeUnit::LDSPort::SenderState >(baseSenderState); 104* 105 fatal_if(!senderState, 106 "did not get the right sort of sender state"); 107 108 GPUDynInstPtr gpuDynInst = senderState->getMemInst(); 109 110 return countBankConflicts(gpuDynInst, bankAccesses); 111} 112 113// Count the total number of bank conflicts for the local memory packet 114unsigned 115LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, 116 unsigned numBankAccesses) 117{ 118* int bank_conflicts = 0; 119 std::vector<int> bank; 120 // the number of LDS banks being touched by the memory instruction 121 int numBanks = std::min(parent->wfSize(), banks); 122 // if the wavefront size is larger than the number of LDS banks, we 123 // need to iterate over all work items to calculate the total 124 // number of bank conflicts 125 int groups = (parent->wfSize() > numBanks) ? 126 (parent->wfSize() / numBanks) : 1; 127 for (int i = 0; i < groups; i++) { 128 // Address Array holding all the work item addresses of an instruction 129 std::vector<Addr> addr_array; 130 addr_array.resize(numBanks, 0); 131 bank.clear(); 132 bank.resize(banks, 0); 133 int max_bank = 0; 134 135 // populate the address array for all active work items 136 for (int j = 0; j < numBanks; j++) { 137 if (gpuDynInst->exec_mask[(inumBanks)+j]) { 138* addr_array[j] = gpuDynInst->addr[(inumBanks)+j]; 139* } else { 140 addr_array[j] = std::numeric_limits<Addr>::max(); 141 } 142 } 143 144 if (gpuDynInst->m_op == Enums::MO_LD \|\| 145 gpuDynInst->m_op == Enums::MO_ST) { 146 // mask identical addresses 147 for (int j = 0; j < numBanks; ++j) { 148 for (int j0 = 0; j0 < j; j0++) { 149 if (addr_array[j] != std::numeric_limits<Addr>::max() 150 && addr_array[j] == addr_array[j0]) { 151 addr_array[j] = std::numeric_limits<Addr>::max(); 152 } 153 } 154 } 155 } 156 // calculate bank conflicts 157 for (int j = 0; j < numBanks; ++j) { 158 if (addr_array[j] != std::numeric_limits<Addr>::max()) { 159 int bankId = addr_array[j] % banks; 160 bank[bankId]++; 161 max_bank = std::max(max_bank, bank[bankId]); 162 // Count the number of LDS banks accessed. 163 // Since we have masked identical addresses all remaining 164 // accesses will need to be serialized if they access 165 // the same bank (bank conflict). 166 (numBankAccesses)++; 167* } 168 } 169 bank_conflicts += max_bank; 170 } 171 panic_if(bank_conflicts > parent->wfSize(), 172 "Max bank conflicts should match num of work items per instr"); 173 return bank_conflicts; 174} 175 176/** 177 * receive the packet from the CU 178 / 179bool 180LdsState::CuSidePort::recvTimingReq(PacketPtr packet) 181{ 182* return ownerLds->processPacket(packet); 183} 184 185GPUDynInstPtr 186LdsState::getDynInstr(PacketPtr packet) 187{ 188 ComputeUnit::LDSPort::SenderState ss = 189* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 190* packet->senderState); 191 return ss->getMemInst(); 192} 193 194/** 195 * process an incoming packet, add it to the return queue 196 / 197bool 198LdsState::processPacket(PacketPtr packet) 199{ 200* unsigned bankAccesses = 0; 201 // the number of conflicts this packet will have when accessing the LDS 202 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); 203 // count the total number of physical LDS bank accessed 204 parent->ldsBankAccesses += bankAccesses; 205 // count the LDS bank conflicts. A number set to 1 indicates one 206 // access per bank maximum so there are no bank conflicts 207 parent->ldsBankConflictDist.sample(bankConflicts-1); 208 209 GPUDynInstPtr dynInst = getDynInstr(packet); 210 // account for the LDS bank conflict overhead 211 int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : 212 (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : 213 parent->loadBusLength(); 214 // delay for accessing the LDS 215 Tick processingTime = 216 parent->shader->ticks(bankConflicts * bankConflictPenalty) + 217 parent->shader->ticks(busLength); 218 // choose (delay + last packet in queue) or (now + delay) as the time to 219 // return this 220 Tick doneAt = earliestReturnTime() + processingTime; 221 // then store it for processing 222 return returnQueuePush(std::make_pair(doneAt, packet)); 223} 224 225/** 226 * add this to the queue of packets to be returned 227 / 228bool 229LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) 230{ 231* // TODO add time limits (e.g. one packet per cycle) and queue size limits 232 // and implement flow control 233 returnQueue.push(thePair); 234 235 // if there is no set wakeup time, look through the queue 236 if (!tickEvent.scheduled()) { 237 process(); 238 } 239 240 return true; 241} 242 243/** 244 * receive a packet in functional mode 245 / 246void 247LdsState::CuSidePort::recvFunctional(PacketPtr pkt) 248{ 249* fatal("not implemented"); 250} 251 252/** 253 * receive a retry for a response 254 / 255void 256LdsState::CuSidePort::recvRespRetry() 257{ 258* // TODO verify that this is the right way to do this 259 assert(ownerLds->isRetryResp()); 260 ownerLds->setRetryResp(false); 261 ownerLds->process(); 262} 263 264/** 265 * receive a retry 266 / 267void 268LdsState::CuSidePort::recvRetry() 269{ 270* fatal("not implemented"); 271} 272 273/** 274 * look for packets to return at this time 275 / 276bool 277LdsState::process() 278{ 279* Tick now = clockEdge(); 280 281 // send back completed packets 282 while (!returnQueue.empty() && returnQueue.front().first <= now) { 283 PacketPtr packet = returnQueue.front().second; 284 285 ComputeUnit::LDSPort::SenderState ss = 286* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 287* packet->senderState); 288 289 GPUDynInstPtr gpuDynInst = ss->getMemInst(); 290 291 gpuDynInst->initiateAcc(gpuDynInst); 292 293 packet->makeTimingResponse(); 294 295 returnQueue.pop(); 296 297 bool success = cuPort.sendTimingResp(packet); 298 299 if (!success) { 300 retryResp = true; 301 panic("have not handled timing responses being NACK'd when sent" 302 "back"); 303 } 304 } 305 306 // determine the next wakeup time 307 if (!returnQueue.empty()) { 308 309 Tick next = returnQueue.front().first; 310 311 if (tickEvent.scheduled()) { 312 313 if (next < tickEvent.when()) { 314 315 tickEvent.deschedule(); 316 tickEvent.schedule(next); 317 } 318 } else { 319 tickEvent.schedule(next); 320 } 321 } 322 323 return true; 324} 325 326/** 327 * wake up at this time and perform specified actions 328 / 329void 330LdsState::TickEvent::process() 331{ 332* ldsState->process(); 333}	1/* 2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: John Kalamatianos, Joe Gross 34 / 35 36#include "gpu-compute/lds_state.hh" 37 38#include <array> 39#include <cstdio> 40#include <cstdlib> 41 42#include "gpu-compute/compute_unit.hh" 43#include "gpu-compute/gpu_dyn_inst.hh" 44#include "gpu-compute/shader.hh" 45 46/* 47 * the default constructor that works with SWIG 48 / 49LdsState::LdsState(const Params params) : 50 MemObject(params), 51 tickEvent(this), 52 cuPort(name() + ".port", this), 53 maximumSize(params->size), 54 range(params->range), 55 bankConflictPenalty(params->bankConflictPenalty), 56 banks(params->banks) 57{ 58 fatal_if(params->banks <= 0, 59 "Number of LDS banks should be positive number"); 60 fatal_if((params->banks & (params->banks - 1)) != 0, 61 "Number of LDS banks should be a power of 2"); 62 fatal_if(params->size <= 0, 63 "cannot allocate an LDS with a size less than 1"); 64 fatal_if(params->size % 2, 65 "the LDS should be an even number"); 66} 67 68/** 69 * Needed by the SWIG compiler 70 / 71LdsState 72LdsStateParams::create() 73{ 74 return new LdsState(this); 75} 76 77/** 78 * set the parent and name based on the parent 79 / 80void 81LdsState::setParent(ComputeUnit x_parent) 82{ 83 // check that this gets assigned to the same thing each time 84 fatal_if(!x_parent, "x_parent should not be nullptr"); 85 fatal_if(x_parent == parent, 86 "should not be setting the parent twice"); 87 88 parent = x_parent; 89 _name = x_parent->name() + ".LdsState"; 90} 91 92/** 93 * derive the gpu mem packet from the packet and then count the bank conflicts 94 / 95unsigned 96LdsState::countBankConflicts(PacketPtr packet, unsigned bankAccesses) 97{ 98 Packet::SenderState baseSenderState = packet->senderState; 99 while (baseSenderState->predecessor) { 100* baseSenderState = baseSenderState->predecessor; 101 } 102 const ComputeUnit::LDSPort::SenderState senderState = 103* dynamic_cast<ComputeUnit::LDSPort::SenderState >(baseSenderState); 104* 105 fatal_if(!senderState, 106 "did not get the right sort of sender state"); 107 108 GPUDynInstPtr gpuDynInst = senderState->getMemInst(); 109 110 return countBankConflicts(gpuDynInst, bankAccesses); 111} 112 113// Count the total number of bank conflicts for the local memory packet 114unsigned 115LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, 116 unsigned numBankAccesses) 117{ 118* int bank_conflicts = 0; 119 std::vector<int> bank; 120 // the number of LDS banks being touched by the memory instruction 121 int numBanks = std::min(parent->wfSize(), banks); 122 // if the wavefront size is larger than the number of LDS banks, we 123 // need to iterate over all work items to calculate the total 124 // number of bank conflicts 125 int groups = (parent->wfSize() > numBanks) ? 126 (parent->wfSize() / numBanks) : 1; 127 for (int i = 0; i < groups; i++) { 128 // Address Array holding all the work item addresses of an instruction 129 std::vector<Addr> addr_array; 130 addr_array.resize(numBanks, 0); 131 bank.clear(); 132 bank.resize(banks, 0); 133 int max_bank = 0; 134 135 // populate the address array for all active work items 136 for (int j = 0; j < numBanks; j++) { 137 if (gpuDynInst->exec_mask[(inumBanks)+j]) { 138* addr_array[j] = gpuDynInst->addr[(inumBanks)+j]; 139* } else { 140 addr_array[j] = std::numeric_limits<Addr>::max(); 141 } 142 } 143 144 if (gpuDynInst->m_op == Enums::MO_LD \|\| 145 gpuDynInst->m_op == Enums::MO_ST) { 146 // mask identical addresses 147 for (int j = 0; j < numBanks; ++j) { 148 for (int j0 = 0; j0 < j; j0++) { 149 if (addr_array[j] != std::numeric_limits<Addr>::max() 150 && addr_array[j] == addr_array[j0]) { 151 addr_array[j] = std::numeric_limits<Addr>::max(); 152 } 153 } 154 } 155 } 156 // calculate bank conflicts 157 for (int j = 0; j < numBanks; ++j) { 158 if (addr_array[j] != std::numeric_limits<Addr>::max()) { 159 int bankId = addr_array[j] % banks; 160 bank[bankId]++; 161 max_bank = std::max(max_bank, bank[bankId]); 162 // Count the number of LDS banks accessed. 163 // Since we have masked identical addresses all remaining 164 // accesses will need to be serialized if they access 165 // the same bank (bank conflict). 166 (numBankAccesses)++; 167* } 168 } 169 bank_conflicts += max_bank; 170 } 171 panic_if(bank_conflicts > parent->wfSize(), 172 "Max bank conflicts should match num of work items per instr"); 173 return bank_conflicts; 174} 175 176/** 177 * receive the packet from the CU 178 / 179bool 180LdsState::CuSidePort::recvTimingReq(PacketPtr packet) 181{ 182* return ownerLds->processPacket(packet); 183} 184 185GPUDynInstPtr 186LdsState::getDynInstr(PacketPtr packet) 187{ 188 ComputeUnit::LDSPort::SenderState ss = 189* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 190* packet->senderState); 191 return ss->getMemInst(); 192} 193 194/** 195 * process an incoming packet, add it to the return queue 196 / 197bool 198LdsState::processPacket(PacketPtr packet) 199{ 200* unsigned bankAccesses = 0; 201 // the number of conflicts this packet will have when accessing the LDS 202 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); 203 // count the total number of physical LDS bank accessed 204 parent->ldsBankAccesses += bankAccesses; 205 // count the LDS bank conflicts. A number set to 1 indicates one 206 // access per bank maximum so there are no bank conflicts 207 parent->ldsBankConflictDist.sample(bankConflicts-1); 208 209 GPUDynInstPtr dynInst = getDynInstr(packet); 210 // account for the LDS bank conflict overhead 211 int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : 212 (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : 213 parent->loadBusLength(); 214 // delay for accessing the LDS 215 Tick processingTime = 216 parent->shader->ticks(bankConflicts * bankConflictPenalty) + 217 parent->shader->ticks(busLength); 218 // choose (delay + last packet in queue) or (now + delay) as the time to 219 // return this 220 Tick doneAt = earliestReturnTime() + processingTime; 221 // then store it for processing 222 return returnQueuePush(std::make_pair(doneAt, packet)); 223} 224 225/** 226 * add this to the queue of packets to be returned 227 / 228bool 229LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) 230{ 231* // TODO add time limits (e.g. one packet per cycle) and queue size limits 232 // and implement flow control 233 returnQueue.push(thePair); 234 235 // if there is no set wakeup time, look through the queue 236 if (!tickEvent.scheduled()) { 237 process(); 238 } 239 240 return true; 241} 242 243/** 244 * receive a packet in functional mode 245 / 246void 247LdsState::CuSidePort::recvFunctional(PacketPtr pkt) 248{ 249* fatal("not implemented"); 250} 251 252/** 253 * receive a retry for a response 254 / 255void 256LdsState::CuSidePort::recvRespRetry() 257{ 258* // TODO verify that this is the right way to do this 259 assert(ownerLds->isRetryResp()); 260 ownerLds->setRetryResp(false); 261 ownerLds->process(); 262} 263 264/** 265 * receive a retry 266 / 267void 268LdsState::CuSidePort::recvRetry() 269{ 270* fatal("not implemented"); 271} 272 273/** 274 * look for packets to return at this time 275 / 276bool 277LdsState::process() 278{ 279* Tick now = clockEdge(); 280 281 // send back completed packets 282 while (!returnQueue.empty() && returnQueue.front().first <= now) { 283 PacketPtr packet = returnQueue.front().second; 284 285 ComputeUnit::LDSPort::SenderState ss = 286* dynamic_cast<ComputeUnit::LDSPort::SenderState >( 287* packet->senderState); 288 289 GPUDynInstPtr gpuDynInst = ss->getMemInst(); 290 291 gpuDynInst->initiateAcc(gpuDynInst); 292 293 packet->makeTimingResponse(); 294 295 returnQueue.pop(); 296 297 bool success = cuPort.sendTimingResp(packet); 298 299 if (!success) { 300 retryResp = true; 301 panic("have not handled timing responses being NACK'd when sent" 302 "back"); 303 } 304 } 305 306 // determine the next wakeup time 307 if (!returnQueue.empty()) { 308 309 Tick next = returnQueue.front().first; 310 311 if (tickEvent.scheduled()) { 312 313 if (next < tickEvent.when()) { 314 315 tickEvent.deschedule(); 316 tickEvent.schedule(next); 317 } 318 } else { 319 tickEvent.schedule(next); 320 } 321 } 322 323 return true; 324} 325 326/** 327 * wake up at this time and perform specified actions 328 / 329void 330LdsState::TickEvent::process() 331{ 332* ldsState->process(); 333}
334 335/** 336 * 337 / 338void 339LdsState::regStats() 340{ 341*}