// wavefront.hh revision 11643
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#ifndef __WAVEFRONT_HH__
37#define __WAVEFRONT_HH__
38
39#include <cassert>
40#include <deque>
41#include <memory>
42#include <stack>
43#include <vector>
44
45#include "base/misc.hh"
46#include "base/types.hh"
47#include "gpu-compute/condition_register_state.hh"
48#include "gpu-compute/lds_state.hh"
49#include "gpu-compute/misc.hh"
50#include "params/Wavefront.hh"
51#include "sim/sim_object.hh"
52
// Maximum number of instructions buffered per wavefront. constexpr makes
// this a guaranteed compile-time constant (usable for array bounds) while
// keeping the same internal linkage and value as the old static const.
static constexpr int MAX_NUM_INSTS_PER_WF = 12;
54
55/**
56 * A reconvergence stack entry conveys the necessary state to implement
57 * control flow divergence.
58 */
59struct ReconvergenceStackEntry {
60    /**
61     * PC of current instruction.
62     */
63    uint32_t pc;
64    /**
65     * PC of the immediate post-dominator instruction, i.e., the value of
66     * @a pc for the first instruction that will be executed by the wavefront
67     * when a reconvergence point is reached.
68     */
69    uint32_t rpc;
70    /**
71     * Execution mask.
72     */
73    VectorMask execMask;
74};
75
76/*
77 * Arguments for the hsail opcode call, are user defined and variable length.
78 * The hardware/finalizer can support arguments in hardware or use memory to
79 * pass arguments. For now, let's assume that an unlimited number of arguments
 * are supported in hardware (the compiler inlines functions whenever it can
81 * anyways, so unless someone is interested in the implications of linking/
82 * library functions, I think this is a reasonable assumption given the typical
83 * size of an OpenCL kernel).
84 *
85 * Note that call args are different than kernel arguments:
 *   * All work-items in a kernel refer to the same set of kernel arguments
 *   * Each work-item has its own set of call args. So a call argument at
 *     address 0x4 is different for work-item 0 and work-item 1.
89 *
90 * Ok, the table below shows an example of how we organize the call arguments in
91 * the CallArgMem class.
92 *
93 * int foo(int arg1, double arg2)
94 *  ___________________________________________________
95 * | 0: return.0 | 4: return.1 | ... | 252: return.63  |
96 * |---------------------------------------------------|
97 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63    |
98 * |---------------------------------------------------|
99 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63   |
100 *  ___________________________________________________
101 */
/**
 * Per-wavefront backing store for HSAIL call arguments. The buffer is
 * laid out argument-major: all wfSize lanes of the first value come
 * first, then all lanes of the next argument, and so on (see the layout
 * example above). Owns its buffer; copying is disabled because a copy
 * would alias @a mem and cause a double delete.
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments (owned)
    uint8_t *mem;
    // number of lanes (work-items) per wavefront
    int wfSize;
    // size of function args for a single work-item
    int funcArgsSizePerItem;

    /**
     * Byte offset into @a mem for lane @a lane of the value whose
     * per-item offset is @a addr: each per-item address is replicated
     * wfSize times, one sizeof(CType)-wide slot per lane.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // new[] throws std::bad_alloc on failure, unlike the previously
        // unchecked malloc(), so mem can never be left null here.
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // Non-copyable: this class owns mem and frees it in the destructor;
    // the implicitly-generated copies would double-delete the buffer.
    CallArgMem(const CallArgMem &) = delete;
    CallArgMem &operator=(const CallArgMem &) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /** Address of lane @a lane's slot for the value at offset @a addr. */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /** Store @a val into lane @a lane's slot for the value at @a addr. */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
143
class Wavefront : public SimObject
{
  public:
    // Instruction categories used by ready() to check resource availability.
    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
    // Wavefront execution status.
    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t basePtr;

    // Barrier bookkeeping; the protocol is implemented in the .cc file.
    uint32_t oldBarrierCnt;
    uint32_t barrierCnt;
    uint32_t barrierId;
    uint32_t barrierSlots;
    // current execution status of this wavefront
    status_e status;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // id of the kernel this WF belongs to
    int kernId;
    // SIMD unit where the WV has been scheduled
    int simdId;
    // pointer to parent CU
    ComputeUnit *computeUnit;

    // fetched instructions waiting to execute, oldest at the front
    std::deque<GPUDynInstPtr> instructionBuffer;

    // fetch handshake flags; presumably dropFetch marks an in-flight
    // fetch whose result must be discarded (see discardFetch()) -- TODO
    // confirm against the fetch stage implementation
    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    // classify an instruction as global/local memory
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    // predicates on the oldest instruction in instructionBuffer
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to DDInstGPU
    std::vector<Addr> lastAddr;
    // per-lane work-item ids for each of the three grid dimensions
    std::vector<uint32_t> workItemId[3];
    // per-lane flattened (1-D) work-item ids
    std::vector<uint32_t> workItemFlatId;
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    // flattened workgroup id and size
    uint32_t wgId;
    uint32_t wgSz;
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // outstanding global+local memory requests
    uint32_t outstandingReqs;
    // memory requests between scoreboard
    // and execute stage not yet executed
    uint32_t memReqsInPipe;
    // outstanding global memory write requests
    uint32_t outstandingReqsWrGm;
    // outstanding local memory write requests
    uint32_t outstandingReqsWrLm;
    // outstanding global memory read requests
    uint32_t outstandingReqsRdGm;
    // outstanding local memory read requests
    uint32_t outstandingReqsRdLm;
    // in-pipe counterparts of the four counters above (read/write,
    // local/global)
    uint32_t rdLmReqsInPipe;
    uint32_t rdGmReqsInPipe;
    uint32_t wrLmReqsInPipe;
    uint32_t wrGmReqsInPipe;

    // memory-trace state (for trace output)
    int memTraceBusy;
    uint64_t lastTrace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination gpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of destination gpr (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination gpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_dgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;

    // number of barriers this WF has joined
    std::vector<int> barCnt;
    int maxBarCnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer to the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for hsail call instruction
    CallArgMem *callArgMem;
    // Allocate the call-argument buffer. NOTE(review): overwrites any
    // previous callArgMem pointer without freeing it -- verify callers
    // only invoke this once per allocation.
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    // Read lane @a lane's call-arg value at per-item offset @a addr.
    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    // Write lane @a lane's call-arg value at per-item offset @a addr.
    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    // register the owning compute unit (set at CU construction time)
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    // predicate: lanes active both at WF start and on the current
    // reconvergence-stack top
    VectorMask getPred() { return execMask() & initMask; }

    bool waitingAtBarrier(int lane);

    // push/pop divergence state; see reconvergenceStack below
    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    // accessors for the top reconvergence-stack entry's state
    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

    /**
     * Returns the size of the static hardware context of a particular wavefront
     * This should be updated everytime the context is changed
     */
    uint32_t getStaticContextSize() const;

  private:
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};
367
368#endif // __WAVEFRONT_HH__
369