lds_state.cc revision 11692
1/*
2 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: John Kalamatianos, Joe Gross
34 */
35
36#include "gpu-compute/lds_state.hh"
37
38#include <array>
39#include <cstdio>
40#include <cstdlib>
41
42#include "gpu-compute/compute_unit.hh"
43#include "gpu-compute/gpu_dyn_inst.hh"
44#include "gpu-compute/shader.hh"
45
46/**
47 * the default constructor that works with SWIG
48 */
49LdsState::LdsState(const Params *params) :
50    MemObject(params),
51    tickEvent(this),
52    cuPort(name() + ".port", this),
53    maximumSize(params->size),
54    range(params->range),
55    bankConflictPenalty(params->bankConflictPenalty),
56    banks(params->banks)
57{
58    fatal_if(params->banks <= 0,
59             "Number of LDS banks should be positive number");
60    fatal_if((params->banks & (params->banks - 1)) != 0,
61             "Number of LDS banks should be a power of 2");
62    fatal_if(params->size <= 0,
63             "cannot allocate an LDS with a size less than 1");
64    fatal_if(params->size % 2,
65          "the LDS should be an even number");
66}
67
68/**
69 * Needed by the SWIG compiler
70 */
71LdsState *
72LdsStateParams::create()
73{
74    return new LdsState(this);
75}
76
77/**
78 * set the parent and name based on the parent
79 */
80void
81LdsState::setParent(ComputeUnit *x_parent)
82{
83    // check that this gets assigned to the same thing each time
84    fatal_if(!x_parent, "x_parent should not be nullptr");
85    fatal_if(x_parent == parent,
86             "should not be setting the parent twice");
87
88    parent = x_parent;
89    _name = x_parent->name() + ".LdsState";
90}
91
92/**
93 * derive the gpu mem packet from the packet and then count the bank conflicts
94 */
95unsigned
96LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
97{
98    Packet::SenderState *baseSenderState = packet->senderState;
99    while (baseSenderState->predecessor) {
100        baseSenderState = baseSenderState->predecessor;
101    }
102    const ComputeUnit::LDSPort::SenderState *senderState =
103            dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
104
105    fatal_if(!senderState,
106             "did not get the right sort of sender state");
107
108    GPUDynInstPtr gpuDynInst = senderState->getMemInst();
109
110    return countBankConflicts(gpuDynInst, bankAccesses);
111}
112
113// Count the total number of bank conflicts for the local memory packet
114unsigned
115LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
116                             unsigned *numBankAccesses)
117{
118    int bank_conflicts = 0;
119    std::vector<int> bank;
120    // the number of LDS banks being touched by the memory instruction
121    int numBanks = std::min(parent->wfSize(), banks);
122    // if the wavefront size is larger than the number of LDS banks, we
123    // need to iterate over all work items to calculate the total
124    // number of bank conflicts
125    int groups = (parent->wfSize() > numBanks) ?
126        (parent->wfSize() / numBanks) : 1;
127    for (int i = 0; i < groups; i++) {
128        // Address Array holding all the work item addresses of an instruction
129        std::vector<Addr> addr_array;
130        addr_array.resize(numBanks, 0);
131        bank.clear();
132        bank.resize(banks, 0);
133        int max_bank = 0;
134
135        // populate the address array for all active work items
136        for (int j = 0; j < numBanks; j++) {
137            if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
138                addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
139            } else {
140                addr_array[j] = std::numeric_limits<Addr>::max();
141            }
142        }
143
144        if (gpuDynInst->isLoad() || gpuDynInst->isStore()) {
145            // mask identical addresses
146            for (int j = 0; j < numBanks; ++j) {
147                for (int j0 = 0; j0 < j; j0++) {
148                    if (addr_array[j] != std::numeric_limits<Addr>::max()
149                                    && addr_array[j] == addr_array[j0]) {
150                        addr_array[j] = std::numeric_limits<Addr>::max();
151                    }
152                }
153            }
154        }
155        // calculate bank conflicts
156        for (int j = 0; j < numBanks; ++j) {
157            if (addr_array[j] != std::numeric_limits<Addr>::max()) {
158                int bankId = addr_array[j] % banks;
159                bank[bankId]++;
160                max_bank = std::max(max_bank, bank[bankId]);
161                // Count the number of LDS banks accessed.
162                // Since we have masked identical addresses all remaining
163                // accesses will need to be serialized if they access
164                // the same bank (bank conflict).
165                (*numBankAccesses)++;
166            }
167        }
168        bank_conflicts += max_bank;
169    }
170    panic_if(bank_conflicts > parent->wfSize(),
171             "Max bank conflicts should match num of work items per instr");
172    return bank_conflicts;
173}
174
175/**
176 * receive the packet from the CU
177 */
178bool
179LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
180{
181    return ownerLds->processPacket(packet);
182}
183
184GPUDynInstPtr
185LdsState::getDynInstr(PacketPtr packet)
186{
187    ComputeUnit::LDSPort::SenderState *ss =
188        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
189                     packet->senderState);
190    return ss->getMemInst();
191}
192
193/**
194 * process an incoming packet, add it to the return queue
195 */
196bool
197LdsState::processPacket(PacketPtr packet)
198{
199    unsigned bankAccesses = 0;
200    // the number of conflicts this packet will have when accessing the LDS
201    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
202    // count the total number of physical LDS bank accessed
203    parent->ldsBankAccesses += bankAccesses;
204    // count the LDS bank conflicts. A number set to 1 indicates one
205    // access per bank maximum so there are no bank conflicts
206    parent->ldsBankConflictDist.sample(bankConflicts-1);
207
208    GPUDynInstPtr dynInst = getDynInstr(packet);
209    // account for the LDS bank conflict overhead
210    int busLength = (dynInst->isLoad()) ? parent->loadBusLength() :
211        (dynInst->isStore()) ? parent->storeBusLength() :
212        parent->loadBusLength();
213    // delay for accessing the LDS
214    Tick processingTime =
215        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
216        parent->shader->ticks(busLength);
217    // choose (delay + last packet in queue) or (now + delay) as the time to
218    // return this
219    Tick doneAt = earliestReturnTime() + processingTime;
220    // then store it for processing
221    return returnQueuePush(std::make_pair(doneAt, packet));
222}
223
224/**
225 * add this to the queue of packets to be returned
226 */
227bool
228LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
229{
230    // TODO add time limits (e.g. one packet per cycle) and queue size limits
231    // and implement flow control
232    returnQueue.push(thePair);
233
234    // if there is no set wakeup time, look through the queue
235    if (!tickEvent.scheduled()) {
236        process();
237    }
238
239    return true;
240}
241
242/**
243 * receive a packet in functional mode
244 */
245void
246LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
247{
248    fatal("not implemented");
249}
250
251/**
252 * receive a retry for a response
253 */
254void
255LdsState::CuSidePort::recvRespRetry()
256{
257    // TODO verify that this is the right way to do this
258    assert(ownerLds->isRetryResp());
259    ownerLds->setRetryResp(false);
260    ownerLds->process();
261}
262
263/**
264 * receive a retry
265 */
266void
267LdsState::CuSidePort::recvRetry()
268{
269    fatal("not implemented");
270}
271
272/**
273 * look for packets to return at this time
274 */
275bool
276LdsState::process()
277{
278    Tick now = clockEdge();
279
280    // send back completed packets
281    while (!returnQueue.empty() && returnQueue.front().first <= now) {
282        PacketPtr packet = returnQueue.front().second;
283
284        ComputeUnit::LDSPort::SenderState *ss =
285            dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
286                            packet->senderState);
287
288        GPUDynInstPtr gpuDynInst = ss->getMemInst();
289
290        gpuDynInst->initiateAcc(gpuDynInst);
291
292        packet->makeTimingResponse();
293
294        returnQueue.pop();
295
296        bool success = cuPort.sendTimingResp(packet);
297
298        if (!success) {
299            retryResp = true;
300            panic("have not handled timing responses being NACK'd when sent"
301                            "back");
302        }
303    }
304
305    // determine the next wakeup time
306    if (!returnQueue.empty()) {
307
308        Tick next = returnQueue.front().first;
309
310        if (tickEvent.scheduled()) {
311
312            if (next < tickEvent.when()) {
313
314                tickEvent.deschedule();
315                tickEvent.schedule(next);
316            }
317        } else {
318            tickEvent.schedule(next);
319        }
320    }
321
322    return true;
323}
324
325/**
326 * wake up at this time and perform specified actions
327 */
328void
329LdsState::TickEvent::process()
330{
331    ldsState->process();
332}
333