lds_state.cc (11308:7d8836fd043d) lds_state.cc (11523:81332eb10367)
1/*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: John Kalamatianos, Joe Gross
34 */
35
36#include "gpu-compute/lds_state.hh"
37
38#include <array>
39#include <cstdio>
40#include <cstdlib>
41
42#include "gpu-compute/compute_unit.hh"
43#include "gpu-compute/gpu_dyn_inst.hh"
44#include "gpu-compute/shader.hh"
45
46/**
47 * the default constructor that works with SWIG
48 */
49LdsState::LdsState(const Params *params) :
50 MemObject(params),
51 tickEvent(this),
52 cuPort(name() + ".port", this),
53 maximumSize(params->size),
54 range(params->range),
55 bankConflictPenalty(params->bankConflictPenalty),
56 banks(params->banks)
57{
58 fatal_if(params->banks <= 0,
59 "Number of LDS banks should be positive number");
60 fatal_if((params->banks & (params->banks - 1)) != 0,
61 "Number of LDS banks should be a power of 2");
62 fatal_if(params->size <= 0,
63 "cannot allocate an LDS with a size less than 1");
64 fatal_if(params->size % 2,
65 "the LDS should be an even number");
66}
67
68/**
69 * Needed by the SWIG compiler
70 */
71LdsState *
72LdsStateParams::create()
73{
74 return new LdsState(this);
75}
76
77/**
78 * set the parent and name based on the parent
79 */
80void
81LdsState::setParent(ComputeUnit *x_parent)
82{
83 // check that this gets assigned to the same thing each time
84 fatal_if(!x_parent, "x_parent should not be nullptr");
85 fatal_if(x_parent == parent,
86 "should not be setting the parent twice");
87
88 parent = x_parent;
89 _name = x_parent->name() + ".LdsState";
90}
91
92/**
93 * derive the gpu mem packet from the packet and then count the bank conflicts
94 */
95unsigned
96LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
97{
98 Packet::SenderState *baseSenderState = packet->senderState;
99 while (baseSenderState->predecessor) {
100 baseSenderState = baseSenderState->predecessor;
101 }
102 const ComputeUnit::LDSPort::SenderState *senderState =
103 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
104
105 fatal_if(!senderState,
106 "did not get the right sort of sender state");
107
108 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
109
110 return countBankConflicts(gpuDynInst, bankAccesses);
111}
112
113// Count the total number of bank conflicts for the local memory packet
114unsigned
115LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
116 unsigned *numBankAccesses)
117{
118 int bank_conflicts = 0;
119 std::vector<int> bank;
120 // the number of LDS banks being touched by the memory instruction
121 int numBanks = std::min(parent->wfSize(), banks);
122 // if the wavefront size is larger than the number of LDS banks, we
123 // need to iterate over all work items to calculate the total
124 // number of bank conflicts
125 int groups = (parent->wfSize() > numBanks) ?
126 (parent->wfSize() / numBanks) : 1;
127 for (int i = 0; i < groups; i++) {
128 // Address Array holding all the work item addresses of an instruction
129 std::vector<Addr> addr_array;
130 addr_array.resize(numBanks, 0);
131 bank.clear();
132 bank.resize(banks, 0);
133 int max_bank = 0;
134
135 // populate the address array for all active work items
136 for (int j = 0; j < numBanks; j++) {
137 if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
138 addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
139 } else {
140 addr_array[j] = std::numeric_limits<Addr>::max();
141 }
142 }
143
144 if (gpuDynInst->m_op == Enums::MO_LD ||
145 gpuDynInst->m_op == Enums::MO_ST) {
146 // mask identical addresses
147 for (int j = 0; j < numBanks; ++j) {
148 for (int j0 = 0; j0 < j; j0++) {
149 if (addr_array[j] != std::numeric_limits<Addr>::max()
150 && addr_array[j] == addr_array[j0]) {
151 addr_array[j] = std::numeric_limits<Addr>::max();
152 }
153 }
154 }
155 }
156 // calculate bank conflicts
157 for (int j = 0; j < numBanks; ++j) {
158 if (addr_array[j] != std::numeric_limits<Addr>::max()) {
159 int bankId = addr_array[j] % banks;
160 bank[bankId]++;
161 max_bank = std::max(max_bank, bank[bankId]);
162 // Count the number of LDS banks accessed.
163 // Since we have masked identical addresses all remaining
164 // accesses will need to be serialized if they access
165 // the same bank (bank conflict).
166 (*numBankAccesses)++;
167 }
168 }
169 bank_conflicts += max_bank;
170 }
171 panic_if(bank_conflicts > parent->wfSize(),
172 "Max bank conflicts should match num of work items per instr");
173 return bank_conflicts;
174}
175
176/**
177 * receive the packet from the CU
178 */
179bool
180LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
181{
182 return ownerLds->processPacket(packet);
183}
184
185GPUDynInstPtr
186LdsState::getDynInstr(PacketPtr packet)
187{
188 ComputeUnit::LDSPort::SenderState *ss =
189 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
190 packet->senderState);
191 return ss->getMemInst();
192}
193
194/**
195 * process an incoming packet, add it to the return queue
196 */
197bool
198LdsState::processPacket(PacketPtr packet)
199{
200 unsigned bankAccesses = 0;
201 // the number of conflicts this packet will have when accessing the LDS
202 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
203 // count the total number of physical LDS bank accessed
204 parent->ldsBankAccesses += bankAccesses;
205 // count the LDS bank conflicts. A number set to 1 indicates one
206 // access per bank maximum so there are no bank conflicts
207 parent->ldsBankConflictDist.sample(bankConflicts-1);
208
209 GPUDynInstPtr dynInst = getDynInstr(packet);
210 // account for the LDS bank conflict overhead
211 int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
212 (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
213 parent->loadBusLength();
214 // delay for accessing the LDS
215 Tick processingTime =
216 parent->shader->ticks(bankConflicts * bankConflictPenalty) +
217 parent->shader->ticks(busLength);
218 // choose (delay + last packet in queue) or (now + delay) as the time to
219 // return this
220 Tick doneAt = earliestReturnTime() + processingTime;
221 // then store it for processing
222 return returnQueuePush(std::make_pair(doneAt, packet));
223}
224
225/**
226 * add this to the queue of packets to be returned
227 */
228bool
229LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
230{
231 // TODO add time limits (e.g. one packet per cycle) and queue size limits
232 // and implement flow control
233 returnQueue.push(thePair);
234
235 // if there is no set wakeup time, look through the queue
236 if (!tickEvent.scheduled()) {
237 process();
238 }
239
240 return true;
241}
242
243/**
244 * receive a packet in functional mode
245 */
246void
247LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
248{
249 fatal("not implemented");
250}
251
252/**
253 * receive a retry for a response
254 */
255void
256LdsState::CuSidePort::recvRespRetry()
257{
258 // TODO verify that this is the right way to do this
259 assert(ownerLds->isRetryResp());
260 ownerLds->setRetryResp(false);
261 ownerLds->process();
262}
263
264/**
265 * receive a retry
266 */
267void
268LdsState::CuSidePort::recvRetry()
269{
270 fatal("not implemented");
271}
272
273/**
274 * look for packets to return at this time
275 */
276bool
277LdsState::process()
278{
279 Tick now = clockEdge();
280
281 // send back completed packets
282 while (!returnQueue.empty() && returnQueue.front().first <= now) {
283 PacketPtr packet = returnQueue.front().second;
284
285 ComputeUnit::LDSPort::SenderState *ss =
286 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
287 packet->senderState);
288
289 GPUDynInstPtr gpuDynInst = ss->getMemInst();
290
291 gpuDynInst->initiateAcc(gpuDynInst);
292
293 packet->makeTimingResponse();
294
295 returnQueue.pop();
296
297 bool success = cuPort.sendTimingResp(packet);
298
299 if (!success) {
300 retryResp = true;
301 panic("have not handled timing responses being NACK'd when sent"
302 "back");
303 }
304 }
305
306 // determine the next wakeup time
307 if (!returnQueue.empty()) {
308
309 Tick next = returnQueue.front().first;
310
311 if (tickEvent.scheduled()) {
312
313 if (next < tickEvent.when()) {
314
315 tickEvent.deschedule();
316 tickEvent.schedule(next);
317 }
318 } else {
319 tickEvent.schedule(next);
320 }
321 }
322
323 return true;
324}
325
326/**
327 * wake up at this time and perform specified actions
328 */
329void
330LdsState::TickEvent::process()
331{
332 ldsState->process();
333}
1/*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: John Kalamatianos, Joe Gross
34 */
35
36#include "gpu-compute/lds_state.hh"
37
38#include <array>
39#include <cstdio>
40#include <cstdlib>
41
42#include "gpu-compute/compute_unit.hh"
43#include "gpu-compute/gpu_dyn_inst.hh"
44#include "gpu-compute/shader.hh"
45
46/**
47 * the default constructor that works with SWIG
48 */
49LdsState::LdsState(const Params *params) :
50 MemObject(params),
51 tickEvent(this),
52 cuPort(name() + ".port", this),
53 maximumSize(params->size),
54 range(params->range),
55 bankConflictPenalty(params->bankConflictPenalty),
56 banks(params->banks)
57{
58 fatal_if(params->banks <= 0,
59 "Number of LDS banks should be positive number");
60 fatal_if((params->banks & (params->banks - 1)) != 0,
61 "Number of LDS banks should be a power of 2");
62 fatal_if(params->size <= 0,
63 "cannot allocate an LDS with a size less than 1");
64 fatal_if(params->size % 2,
65 "the LDS should be an even number");
66}
67
68/**
69 * Needed by the SWIG compiler
70 */
71LdsState *
72LdsStateParams::create()
73{
74 return new LdsState(this);
75}
76
77/**
78 * set the parent and name based on the parent
79 */
80void
81LdsState::setParent(ComputeUnit *x_parent)
82{
83 // check that this gets assigned to the same thing each time
84 fatal_if(!x_parent, "x_parent should not be nullptr");
85 fatal_if(x_parent == parent,
86 "should not be setting the parent twice");
87
88 parent = x_parent;
89 _name = x_parent->name() + ".LdsState";
90}
91
92/**
93 * derive the gpu mem packet from the packet and then count the bank conflicts
94 */
95unsigned
96LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
97{
98 Packet::SenderState *baseSenderState = packet->senderState;
99 while (baseSenderState->predecessor) {
100 baseSenderState = baseSenderState->predecessor;
101 }
102 const ComputeUnit::LDSPort::SenderState *senderState =
103 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
104
105 fatal_if(!senderState,
106 "did not get the right sort of sender state");
107
108 GPUDynInstPtr gpuDynInst = senderState->getMemInst();
109
110 return countBankConflicts(gpuDynInst, bankAccesses);
111}
112
113// Count the total number of bank conflicts for the local memory packet
114unsigned
115LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
116 unsigned *numBankAccesses)
117{
118 int bank_conflicts = 0;
119 std::vector<int> bank;
120 // the number of LDS banks being touched by the memory instruction
121 int numBanks = std::min(parent->wfSize(), banks);
122 // if the wavefront size is larger than the number of LDS banks, we
123 // need to iterate over all work items to calculate the total
124 // number of bank conflicts
125 int groups = (parent->wfSize() > numBanks) ?
126 (parent->wfSize() / numBanks) : 1;
127 for (int i = 0; i < groups; i++) {
128 // Address Array holding all the work item addresses of an instruction
129 std::vector<Addr> addr_array;
130 addr_array.resize(numBanks, 0);
131 bank.clear();
132 bank.resize(banks, 0);
133 int max_bank = 0;
134
135 // populate the address array for all active work items
136 for (int j = 0; j < numBanks; j++) {
137 if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
138 addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
139 } else {
140 addr_array[j] = std::numeric_limits<Addr>::max();
141 }
142 }
143
144 if (gpuDynInst->m_op == Enums::MO_LD ||
145 gpuDynInst->m_op == Enums::MO_ST) {
146 // mask identical addresses
147 for (int j = 0; j < numBanks; ++j) {
148 for (int j0 = 0; j0 < j; j0++) {
149 if (addr_array[j] != std::numeric_limits<Addr>::max()
150 && addr_array[j] == addr_array[j0]) {
151 addr_array[j] = std::numeric_limits<Addr>::max();
152 }
153 }
154 }
155 }
156 // calculate bank conflicts
157 for (int j = 0; j < numBanks; ++j) {
158 if (addr_array[j] != std::numeric_limits<Addr>::max()) {
159 int bankId = addr_array[j] % banks;
160 bank[bankId]++;
161 max_bank = std::max(max_bank, bank[bankId]);
162 // Count the number of LDS banks accessed.
163 // Since we have masked identical addresses all remaining
164 // accesses will need to be serialized if they access
165 // the same bank (bank conflict).
166 (*numBankAccesses)++;
167 }
168 }
169 bank_conflicts += max_bank;
170 }
171 panic_if(bank_conflicts > parent->wfSize(),
172 "Max bank conflicts should match num of work items per instr");
173 return bank_conflicts;
174}
175
176/**
177 * receive the packet from the CU
178 */
179bool
180LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
181{
182 return ownerLds->processPacket(packet);
183}
184
185GPUDynInstPtr
186LdsState::getDynInstr(PacketPtr packet)
187{
188 ComputeUnit::LDSPort::SenderState *ss =
189 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
190 packet->senderState);
191 return ss->getMemInst();
192}
193
194/**
195 * process an incoming packet, add it to the return queue
196 */
197bool
198LdsState::processPacket(PacketPtr packet)
199{
200 unsigned bankAccesses = 0;
201 // the number of conflicts this packet will have when accessing the LDS
202 unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
203 // count the total number of physical LDS bank accessed
204 parent->ldsBankAccesses += bankAccesses;
205 // count the LDS bank conflicts. A number set to 1 indicates one
206 // access per bank maximum so there are no bank conflicts
207 parent->ldsBankConflictDist.sample(bankConflicts-1);
208
209 GPUDynInstPtr dynInst = getDynInstr(packet);
210 // account for the LDS bank conflict overhead
211 int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
212 (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
213 parent->loadBusLength();
214 // delay for accessing the LDS
215 Tick processingTime =
216 parent->shader->ticks(bankConflicts * bankConflictPenalty) +
217 parent->shader->ticks(busLength);
218 // choose (delay + last packet in queue) or (now + delay) as the time to
219 // return this
220 Tick doneAt = earliestReturnTime() + processingTime;
221 // then store it for processing
222 return returnQueuePush(std::make_pair(doneAt, packet));
223}
224
225/**
226 * add this to the queue of packets to be returned
227 */
228bool
229LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
230{
231 // TODO add time limits (e.g. one packet per cycle) and queue size limits
232 // and implement flow control
233 returnQueue.push(thePair);
234
235 // if there is no set wakeup time, look through the queue
236 if (!tickEvent.scheduled()) {
237 process();
238 }
239
240 return true;
241}
242
243/**
244 * receive a packet in functional mode
245 */
246void
247LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
248{
249 fatal("not implemented");
250}
251
252/**
253 * receive a retry for a response
254 */
255void
256LdsState::CuSidePort::recvRespRetry()
257{
258 // TODO verify that this is the right way to do this
259 assert(ownerLds->isRetryResp());
260 ownerLds->setRetryResp(false);
261 ownerLds->process();
262}
263
264/**
265 * receive a retry
266 */
267void
268LdsState::CuSidePort::recvRetry()
269{
270 fatal("not implemented");
271}
272
273/**
274 * look for packets to return at this time
275 */
276bool
277LdsState::process()
278{
279 Tick now = clockEdge();
280
281 // send back completed packets
282 while (!returnQueue.empty() && returnQueue.front().first <= now) {
283 PacketPtr packet = returnQueue.front().second;
284
285 ComputeUnit::LDSPort::SenderState *ss =
286 dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
287 packet->senderState);
288
289 GPUDynInstPtr gpuDynInst = ss->getMemInst();
290
291 gpuDynInst->initiateAcc(gpuDynInst);
292
293 packet->makeTimingResponse();
294
295 returnQueue.pop();
296
297 bool success = cuPort.sendTimingResp(packet);
298
299 if (!success) {
300 retryResp = true;
301 panic("have not handled timing responses being NACK'd when sent"
302 "back");
303 }
304 }
305
306 // determine the next wakeup time
307 if (!returnQueue.empty()) {
308
309 Tick next = returnQueue.front().first;
310
311 if (tickEvent.scheduled()) {
312
313 if (next < tickEvent.when()) {
314
315 tickEvent.deschedule();
316 tickEvent.schedule(next);
317 }
318 } else {
319 tickEvent.schedule(next);
320 }
321 }
322
323 return true;
324}
325
326/**
327 * wake up at this time and perform specified actions
328 */
329void
330LdsState::TickEvent::process()
331{
332 ldsState->process();
333}
334
335/**
336 *
337 */
338void
339LdsState::regStats()
340{
341}