// wavefront.hh revision 11308
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */
35
36#ifndef __WAVEFRONT_HH__
37#define __WAVEFRONT_HH__
38
39#include <cassert>
40#include <deque>
41#include <memory>
42#include <stack>
43#include <vector>
44
45#include "base/misc.hh"
46#include "base/types.hh"
47#include "gpu-compute/condition_register_state.hh"
48#include "gpu-compute/lds_state.hh"
49#include "gpu-compute/misc.hh"
50#include "params/Wavefront.hh"
51#include "sim/sim_object.hh"
52
53static const int MAX_NUM_INSTS_PER_WF = 12;
54
55/*
56 * Arguments for the hsail opcode call, are user defined and variable length.
57 * The hardware/finalizer can support arguments in hardware or use memory to
58 * pass arguments. For now, let's assume that an unlimited number of arguments
59 * are supported in hardware (the compiler inlines functions whenver it can
60 * anyways, so unless someone is interested in the implications of linking/
61 * library functions, I think this is a reasonable assumption given the typical
62 * size of an OpenCL kernel).
63 *
64 * Note that call args are different than kernel arguments:
65 *   * All work-items in a kernel refer the same set of kernel arguments
66 *   * Each work-item has it's on set of call args. So a call argument at
67 *     address 0x4 is different for work-item 0 and work-item 1.
68 *
69 * Ok, the table below shows an example of how we organize the call arguments in
70 * the CallArgMem class.
71 *
72 * int foo(int arg1, double arg2)
73 *  ___________________________________________________
74 * | 0: return.0 | 4: return.1 | ... | 252: return.63  |
75 * |---------------------------------------------------|
76 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63    |
77 * |---------------------------------------------------|
78 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63   |
79 *  ___________________________________________________
80 */
81class CallArgMem
82{
83  public:
84    // pointer to buffer for storing function arguments
85    uint8_t *mem;
86    // size of function args
87    int funcArgsSizePerItem;
88
89    template<typename CType>
90    int
91    getLaneOffset(int lane, int addr)
92    {
93        return addr * VSZ + sizeof(CType) * lane;
94    }
95
96    CallArgMem(int func_args_size_per_item)
97      : funcArgsSizePerItem(func_args_size_per_item)
98    {
99        mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
100    }
101
102    ~CallArgMem()
103    {
104        free(mem);
105    }
106
107    template<typename CType>
108    uint8_t*
109    getLaneAddr(int lane, int addr)
110    {
111        return mem + getLaneOffset<CType>(lane, addr);
112    }
113
114    template<typename CType>
115    void
116    setLaneAddr(int lane, int addr, CType val)
117    {
118        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
119    }
120};
121
122/**
123 * A reconvergence stack entry conveys the necessary state to implement
124 * control flow divergence.
125 */
126class ReconvergenceStackEntry {
127
128  public:
129    ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc,
130                            VectorMask new_mask) : pc(new_pc), rpc(new_rpc),
131                            execMask(new_mask) {
132    }
133
134    /**
135     * PC of current instruction.
136     */
137    uint32_t pc;
138    /**
139     * PC of the immediate post-dominator instruction, i.e., the value of
140     * @a pc for the first instruction that will be executed by the wavefront
141     * when a reconvergence point is reached.
142     */
143    uint32_t rpc;
144    /**
145     * Execution mask.
146     */
147    VectorMask execMask;
148};
149
150class Wavefront : public SimObject
151{
152  public:
153    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
154    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
155
156    // Base pointer for array of instruction pointers
157    uint64_t base_ptr;
158
159    uint32_t old_barrier_cnt;
160    uint32_t barrier_cnt;
161    uint32_t barrier_id;
162    uint32_t barrier_slots;
163    status_e status;
164    // HW slot id where the WF is mapped to inside a SIMD unit
165    int wfSlotId;
166    int kern_id;
167    // SIMD unit where the WV has been scheduled
168    int simdId;
169    // pointer to parent CU
170    ComputeUnit *computeUnit;
171
172    std::deque<GPUDynInstPtr> instructionBuffer;
173
174    bool pendingFetch;
175    bool dropFetch;
176
177    // Condition Register State (for HSAIL simulations only)
178    class ConditionRegisterState *condRegState;
179    // number of single precision VGPRs required by WF
180    uint32_t maxSpVgprs;
181    // number of double precision VGPRs required by WF
182    uint32_t maxDpVgprs;
183    // map virtual to physical vector register
184    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
185    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
186    bool isGmInstruction(GPUDynInstPtr ii);
187    bool isLmInstruction(GPUDynInstPtr ii);
188    bool isOldestInstGMem();
189    bool isOldestInstLMem();
190    bool isOldestInstPrivMem();
191    bool isOldestInstFlatMem();
192    bool isOldestInstALU();
193    bool isOldestInstBarrier();
194    // used for passing spill address to DDInstGPU
195    uint64_t last_addr[VSZ];
196    uint32_t workitemid[3][VSZ];
197    uint32_t workitemFlatId[VSZ];
198    uint32_t workgroupid[3];
199    uint32_t workgroupsz[3];
200    uint32_t gridsz[3];
201    uint32_t wg_id;
202    uint32_t wg_sz;
203    uint32_t dynwaveid;
204    uint32_t maxdynwaveid;
205    uint32_t dispatchid;
206    // outstanding global+local memory requests
207    uint32_t outstanding_reqs;
208    // memory requests between scoreboard
209    // and execute stage not yet executed
210    uint32_t mem_reqs_in_pipe;
211    // outstanding global memory write requests
212    uint32_t outstanding_reqs_wr_gm;
213    // outstanding local memory write requests
214    uint32_t outstanding_reqs_wr_lm;
215    // outstanding global memory read requests
216    uint32_t outstanding_reqs_rd_gm;
217    // outstanding local memory read requests
218    uint32_t outstanding_reqs_rd_lm;
219    uint32_t rd_lm_reqs_in_pipe;
220    uint32_t rd_gm_reqs_in_pipe;
221    uint32_t wr_lm_reqs_in_pipe;
222    uint32_t wr_gm_reqs_in_pipe;
223
224    int mem_trace_busy;
225    uint64_t last_trace;
226    // number of vector registers reserved by WF
227    int reservedVectorRegs;
228    // Index into the Vector Register File's namespace where the WF's registers
229    // will live while the WF is executed
230    uint32_t startVgprIndex;
231
232    // Old value of destination gpr (for trace)
233    uint32_t old_vgpr[VSZ];
234    // Id of destination gpr (for trace)
235    uint32_t old_vgpr_id;
236    // Tick count of last old_vgpr copy
237    uint64_t old_vgpr_tcnt;
238
239    // Old value of destination gpr (for trace)
240    uint64_t old_dgpr[VSZ];
241    // Id of destination gpr (for trace)
242    uint32_t old_dgpr_id;
243    // Tick count of last old_vgpr copy
244    uint64_t old_dgpr_tcnt;
245
246    // Execution mask at wavefront start
247    VectorMask init_mask;
248
249    // number of barriers this WF has joined
250    int bar_cnt[VSZ];
251    int max_bar_cnt;
252    // Flag to stall a wave on barrier
253    bool stalledAtBarrier;
254
255    // a pointer to the fraction of the LDS allocated
256    // to this workgroup (thus this wavefront)
257    LdsChunk *ldsChunk;
258
259    // A pointer to the spill area
260    Addr spillBase;
261    // The size of the spill area
262    uint32_t spillSizePerItem;
263    // The vector width of the spill area
264    uint32_t spillWidth;
265
266    // A pointer to the private memory area
267    Addr privBase;
268    // The size of the private memory area
269    uint32_t privSizePerItem;
270
271    // A pointer ot the read-only memory area
272    Addr roBase;
273    // size of the read-only memory area
274    uint32_t roSize;
275
276    // pointer to buffer for storing kernel arguments
277    uint8_t *kernelArgs;
278    // unique WF id over all WFs executed across all CUs
279    uint64_t wfDynId;
280
281    // number of times instruction issue for this wavefront is blocked
282    // due to VRF port availability
283    Stats::Scalar numTimesBlockedDueVrfPortAvail;
284    // number of times an instruction of a WF is blocked from being issued
285    // due to WAR and WAW dependencies
286    Stats::Scalar numTimesBlockedDueWAXDependencies;
287    // number of times an instruction of a WF is blocked from being issued
288    // due to WAR and WAW dependencies
289    Stats::Scalar numTimesBlockedDueRAWDependencies;
290    // distribution of executed instructions based on their register
291    // operands; this is used to highlight the load on the VRF
292    Stats::Distribution srcRegOpDist;
293    Stats::Distribution dstRegOpDist;
294
295    // Functions to operate on call argument memory
296    // argument memory for hsail call instruction
297    CallArgMem *callArgMem;
298    void
299    initCallArgMem(int func_args_size_per_item)
300    {
301        callArgMem = new CallArgMem(func_args_size_per_item);
302    }
303
304    template<typename CType>
305    CType
306    readCallArgMem(int lane, int addr)
307    {
308        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
309    }
310
311    template<typename CType>
312    void
313    writeCallArgMem(int lane, int addr, CType val)
314    {
315        callArgMem->setLaneAddr<CType>(lane, addr, val);
316    }
317
318    typedef WavefrontParams Params;
319    Wavefront(const Params *p);
320    ~Wavefront();
321    virtual void init();
322
323    void
324    setParent(ComputeUnit *cu)
325    {
326        computeUnit = cu;
327    }
328
329    void start(uint64_t _wfDynId, uint64_t _base_ptr);
330
331    void exec();
332    void updateResources();
333    int ready(itype_e type);
334    bool instructionBufferHasBranch();
335    void regStats();
336    VectorMask get_pred() { return execMask() & init_mask; }
337
338    bool waitingAtBarrier(int lane);
339
340    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
341                                  const VectorMask& exec_mask);
342
343    void popFromReconvergenceStack();
344
345    uint32_t pc() const;
346
347    uint32_t rpc() const;
348
349    VectorMask execMask() const;
350
351    bool execMask(int lane) const;
352
353    void pc(uint32_t new_pc);
354
355    void discardFetch();
356
357  private:
358    /**
359     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
360     * to be visited by the wavefront, and the associated execution masks. The
361     * reconvergence stack grows every time the wavefront reaches a divergence
362     * point (branch instruction), and shrinks every time the wavefront
363     * reaches a reconvergence point (immediate post-dominator instruction).
364     */
365    std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
366};
367
368#endif // __WAVEFRONT_HH__
369