/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__

#include <cassert>
#include <cstdlib>
#include <deque>
#include <memory>
#include <stack>
#include <vector>

#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"

static const int MAX_NUM_INSTS_PER_WF = 12;

/**
 * A reconvergence stack entry conveys the necessary state to implement
 * control flow divergence.
 */
struct ReconvergenceStackEntry {
    /**
     * PC of current instruction.
     */
    uint32_t pc;
    /**
     * PC of the immediate post-dominator instruction, i.e., the value of
     * @a pc for the first instruction that will be executed by the wavefront
     * when a reconvergence point is reached.
     */
    uint32_t rpc;
    /**
     * Execution mask.
     */
    VectorMask execMask;
};
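
/*
 * Illustrative example (not part of the model): for a wavefront that
 * diverges at a branch where only lanes 0-3 are active on the taken path,
 * the entry pushed for that path might look like:
 *
 *     ReconvergenceStackEntry e;
 *     e.pc = taken_pc;      // first instruction of the taken path
 *     e.rpc = post_dom_pc;  // immediate post-dominator of the branch
 *     e.execMask = 0xf;     // only lanes 0-3 enabled
 *
 * taken_pc and post_dom_pc are placeholder names. Once pc reaches rpc, the
 * entry is popped and execution continues under the parent entry's mask.
 */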

/*
 * Arguments for the HSAIL call opcode are user defined and of variable
 * length. The hardware/finalizer can support arguments in hardware or use
 * memory to pass arguments. For now, we assume that an unlimited number of
 * arguments are supported in hardware (the compiler inlines functions
 * whenever it can anyway, so unless someone is interested in the implications
 * of linking/library functions, this is a reasonable assumption given the
 * typical size of an OpenCL kernel).
 *
 * Note that call args are different from kernel arguments:
 *   * All work-items in a kernel refer to the same set of kernel arguments.
 *   * Each work-item has its own set of call args, so a call argument at
 *     address 0x4 is different for work-item 0 and work-item 1.
 *
 * The table below shows an example of how the call arguments are organized
 * in the CallArgMem class.
 *
 * int foo(int arg1, double arg2)
 *  ___________________________________________________
 * | 0: return.0 | 4: return.1 | ... | 252: return.63  |
 * |---------------------------------------------------|
 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63    |
 * |---------------------------------------------------|
 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63   |
 *  ___________________________________________________
 */
class CallArgMem
{
  public:
    // pointer to the buffer for storing function call arguments
    uint8_t *mem;
    // wavefront size (number of lanes)
    int wfSize;
    // size of the call args for one work-item
    int funcArgsSizePerItem;

    // byte offset of (lane, addr) within the per-wavefront argument buffer
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
    }

    ~CallArgMem()
    {
        free(mem);
    }

    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
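
/*
 * Worked example (illustrative only, assuming the foo(int, double) layout
 * above and wfSize == 64): the int return value occupies per-item offset 0,
 * arg1 offset 4, and arg2 offset 8, so funcArgsSizePerItem == 16. For arg2,
 * getLaneOffset<double>(lane, 8) yields 8 * 64 + 8 * lane, i.e. byte 512
 * for lane 0 and byte 520 for lane 1, matching the table.
 *
 *     CallArgMem args(16, 64);              // 16 bytes per item, 64 lanes
 *     args.setLaneAddr<double>(1, 8, 3.14); // write arg2 for lane 1
 *     double *p = (double*)args.getLaneAddr<double>(1, 8);
 */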

class Wavefront : public SimObject
{
  public:
    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t basePtr;

    uint32_t oldBarrierCnt;
    uint32_t barrierCnt;
    uint32_t barrierId;
    uint32_t barrierSlots;
    status_e status;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    int kernId;
    // SIMD unit where the WF has been scheduled
    int simdId;
    // pointer to parent CU
    ComputeUnit *computeUnit;

    std::deque<GPUDynInstPtr> instructionBuffer;

    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing the spill address to the GPUDynInst
    std::vector<Addr> lastAddr;
    std::vector<uint32_t> workItemId[3];
    std::vector<uint32_t> workItemFlatId;
    /* kernel launch parameters */
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    uint32_t wgId;
    uint32_t wgSz;
    /* the actual WG size can differ from the maximum size */
    uint32_t actualWgSz[3];
    uint32_t actualWgSzTotal;
    void computeActualWgSz(NDRange *ndr);
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // outstanding global+local memory requests
    uint32_t outstandingReqs;
    // memory requests between the scoreboard
    // and the execute stage, not yet executed
    uint32_t memReqsInPipe;
    // outstanding global memory write requests
    uint32_t outstandingReqsWrGm;
    // outstanding local memory write requests
    uint32_t outstandingReqsWrLm;
    // outstanding global memory read requests
    uint32_t outstandingReqsRdGm;
    // outstanding local memory read requests
    uint32_t outstandingReqsRdLm;
    uint32_t rdLmReqsInPipe;
    uint32_t rdGmReqsInPipe;
    uint32_t wrLmReqsInPipe;
    uint32_t wrGmReqsInPipe;

    int memTraceBusy;
    uint64_t lastTrace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's
    // registers will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination gpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of destination gpr (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination gpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_dgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;

    // number of barriers this WF has joined
    std::vector<int> barCnt;
    int maxBarCnt;
    // Flag to stall a wave on a barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer to the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // argument memory for the HSAIL call instruction, and the functions
    // that operate on it
    CallArgMem *callArgMem;
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }
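
    /*
     * Usage sketch (illustrative only, assuming the foo(int, double)
     * layout shown above CallArgMem and a 64-lane wavefront): size the
     * buffer once, write each lane's arguments before the call, and read
     * each lane's return value afterwards.
     *
     *     wf->initCallArgMem(16, 64);
     *     wf->writeCallArgMem<int>(lane, 4, x);       // arg1 for this lane
     *     wf->writeCallArgMem<double>(lane, 8, y);    // arg2 for this lane
     *     int ret = wf->readCallArgMem<int>(lane, 0); // return value
     */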

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    VectorMask getPred() { return execMask() & initMask; }

    bool waitingAtBarrier(int lane);

    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

    /**
     * Returns the size of the static hardware context of a particular
     * wavefront. This should be updated every time the context changes.
     */
    uint32_t getStaticContextSize() const;

    /**
     * Returns the hardware context as a stream of bytes.
     * This method is designed for HSAIL execution.
     */
    void getContext(const void *out);

    /**
     * Sets the hardware context from a stream of bytes.
     * This method is designed for HSAIL execution.
     */
    void setContext(const void *in);

    TheGpuISA::GPUISA&
    gpuISA()
    {
        return _gpuISA;
    }

  private:
    TheGpuISA::GPUISA _gpuISA;
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
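
    /*
     * Sketch of the intended use (illustrative only; branch_pc, post_dom_pc
     * and taken_mask are placeholder names): on a divergent branch, one
     * entry is pushed per control-flow path, e.g.
     *
     *     pushToReconvergenceStack(branch_pc, post_dom_pc, taken_mask);
     *
     * and popFromReconvergenceStack() is called once pc() == rpc(), after
     * which pc() and execMask() again reflect the new top-of-stack entry.
     */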
};

#endif // __WAVEFRONT_HH__