/* Review artifact (hg web diff viewer header) preserved as a comment:
 * old changeset 11534:7106f550afad, new changeset 11639:2e8d4bd8108d.
 * The raw viewer text ("Deleted Added", "sdiff udiff", "full compact")
 * is not valid C++ and has been commented out. */
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#ifndef __WAVEFRONT_HH__
37#define __WAVEFRONT_HH__
38
39#include <cassert>
40#include <deque>
41#include <memory>
42#include <stack>
43#include <vector>
44
45#include "base/misc.hh"
46#include "base/types.hh"
47#include "gpu-compute/condition_register_state.hh"
48#include "gpu-compute/lds_state.hh"
49#include "gpu-compute/misc.hh"
50#include "params/Wavefront.hh"
51#include "sim/sim_object.hh"
52
53static const int MAX_NUM_INSTS_PER_WF = 12;
54
55/*
56 * Arguments for the hsail opcode call, are user defined and variable length.
57 * The hardware/finalizer can support arguments in hardware or use memory to
58 * pass arguments. For now, let's assume that an unlimited number of arguments
 * are supported in hardware (the compiler inlines functions whenever it can
60 * anyways, so unless someone is interested in the implications of linking/
61 * library functions, I think this is a reasonable assumption given the typical
62 * size of an OpenCL kernel).
63 *
64 * Note that call args are different than kernel arguments:
65 * * All work-items in a kernel refer the same set of kernel arguments
 *   * Each work-item has its own set of call args. So a call argument at
67 * address 0x4 is different for work-item 0 and work-item 1.
68 *
69 * Ok, the table below shows an example of how we organize the call arguments in
70 * the CallArgMem class.
71 *
72 * int foo(int arg1, double arg2)
73 * ___________________________________________________
74 * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
75 * |---------------------------------------------------|
76 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
77 * |---------------------------------------------------|
78 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
79 * ___________________________________________________
80 */
class CallArgMem
{
  public:
    // Pointer to the buffer backing per-lane function arguments.
    // Uniquely owned by this object (released in the destructor).
    uint8_t *mem;
    // Number of lanes (work-items) in the wavefront.
    int wfSize;
    // Size (in bytes) of the call arguments of a single work-item.
    int funcArgsSizePerItem;

    /**
     * Byte offset of lane @p lane's copy of the argument located at
     * per-item address @p addr. Storage is lane-interleaved: all lanes'
     * copies of a given argument slot are contiguous, hence
     * addr * wfSize selects the slot and sizeof(CType) * lane selects
     * the lane within it.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // new[] throws std::bad_alloc on failure; the previous code used
        // an unchecked malloc() and would have dereferenced nullptr on
        // allocation failure.
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // The argument buffer is uniquely owned; a shallow copy would lead
    // to a double delete in the destructor, so copying is forbidden
    // (Rule of Three).
    CallArgMem(const CallArgMem&) = delete;
    CallArgMem& operator=(const CallArgMem&) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /** Address of lane @p lane's copy of the argument at @p addr. */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /** Store @p val as lane @p lane's copy of the argument at @p addr. */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
122
123/**
124 * A reconvergence stack entry conveys the necessary state to implement
125 * control flow divergence.
126 */
127class ReconvergenceStackEntry {
128
129 public:
130 ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc,
131 VectorMask new_mask) : pc(new_pc), rpc(new_rpc),
132 execMask(new_mask) {
133 }
134
135 /**
136 * PC of current instruction.
137 */
138 uint32_t pc;
139 /**
140 * PC of the immediate post-dominator instruction, i.e., the value of
141 * @a pc for the first instruction that will be executed by the wavefront
142 * when a reconvergence point is reached.
143 */
144 uint32_t rpc;
145 /**
146 * Execution mask.
147 */
148 VectorMask execMask;
149};
150
class Wavefront : public SimObject
{
  public:
    // Instruction categories used when querying readiness (see ready()).
    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
    // Coarse execution state of the wavefront.
    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t base_ptr;

    // Barrier bookkeeping: previous/current barrier counts, the barrier
    // this WF belongs to, and its slot count.
    // NOTE(review): exact protocol lives in the compute unit's barrier
    // logic -- confirm against ComputeUnit before relying on details.
    uint32_t old_barrier_cnt;
    uint32_t barrier_cnt;
    uint32_t barrier_id;
    uint32_t barrier_slots;
    // current execution status (see status_e)
    status_e status;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    // id of the kernel this WF is executing
    int kern_id;
    // SIMD unit where the WV has been scheduled
    int simdId;
    // pointer to parent CU
    ComputeUnit *computeUnit;

    // Fetched instructions awaiting execution by this WF.
    std::deque<GPUDynInstPtr> instructionBuffer;

    // presumably set while a fetch for this WF is in flight, and
    // dropFetch marks an in-flight fetch to be discarded (see
    // discardFetch()) -- NOTE(review): confirm against fetch stage
    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    // resize condition/single/double register files to the given counts
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    // instruction classification helpers (global vs. local memory)
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    // classification of the oldest instruction in instructionBuffer
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to DDInstGPU
    std::vector<Addr> last_addr;
    // per-lane work-item ids along each of the three grid dimensions,
    // plus the flattened per-lane work-item id
    std::vector<uint32_t> workitemid[3];
    std::vector<uint32_t> workitemFlatId;
    // 3-D work-group id, work-group size, and grid size for this WF
    uint32_t workgroupid[3];
    uint32_t workgroupsz[3];
    uint32_t gridsz[3];
    // flattened work-group id and size
    uint32_t wg_id;
    uint32_t wg_sz;
    // dynamic wave id and its upper bound
    uint32_t dynwaveid;
    uint32_t maxdynwaveid;
    // id of the dispatch this WF belongs to
    uint32_t dispatchid;
    // outstanding global+local memory requests
    uint32_t outstanding_reqs;
    // memory requests between scoreboard
    // and execute stage not yet executed
    uint32_t mem_reqs_in_pipe;
    // outstanding global memory write requests
    uint32_t outstanding_reqs_wr_gm;
    // outstanding local memory write requests
    uint32_t outstanding_reqs_wr_lm;
    // outstanding global memory read requests
    uint32_t outstanding_reqs_rd_gm;
    // outstanding local memory read requests
    uint32_t outstanding_reqs_rd_lm;
    // read/write requests (local/global) issued to the pipeline but not
    // yet executed, broken out by type
    uint32_t rd_lm_reqs_in_pipe;
    uint32_t rd_gm_reqs_in_pipe;
    uint32_t wr_lm_reqs_in_pipe;
    uint32_t wr_gm_reqs_in_pipe;

    // memory-trace state -- NOTE(review): semantics not visible here;
    // confirm against the trace-generation code
    int mem_trace_busy;
    uint64_t last_trace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> old_vgpr;
    // Id of destination gpr (for trace)
    uint32_t old_vgpr_id;
    // Tick count of last old_vgpr copy
    uint64_t old_vgpr_tcnt;

    // Old value of destination gpr (for trace)
    std::vector<uint64_t> old_dgpr;
    // Id of destination gpr (for trace)
    uint32_t old_dgpr_id;
    // Tick count of last old_dgpr copy
    uint64_t old_dgpr_tcnt;

    // Execution mask at wavefront start
    VectorMask init_mask;

    // number of barriers this WF has joined
    std::vector<int> bar_cnt;
    int max_bar_cnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer to the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies (original comment said "WAR and WAW",
    // which contradicts the stat's name)
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for hsail call instruction
    CallArgMem *callArgMem;

    // Allocate the per-WF call-argument buffer (owned via callArgMem).
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    // Read lane @p lane's copy of the call argument at @p addr.
    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    // Write @p val as lane @p lane's copy of the call argument at @p addr.
    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    // Attach this WF to its parent compute unit.
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    // Begin execution with the given dynamic WF id and code base pointer.
    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    // readiness check for an instruction of the given category
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    // Effective predicate: current execution mask restricted to the
    // lanes that were active at wavefront start.
    VectorMask get_pred() { return execMask() & init_mask; }

    bool waitingAtBarrier(int lane);

    // Push a new divergence frame (pc, reconvergence pc, active mask)
    // onto the reconvergence stack.
    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    // Pop the top frame when the reconvergence point is reached.
    void popFromReconvergenceStack();

    // Accessors for the state of the top reconvergence-stack frame.
    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

  private:
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};
367
368#endif // __WAVEFRONT_HH__