wavefront.hh (11644:d426728892fe) wavefront.hh (11657:5fad5a37d6fc)
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#ifndef __WAVEFRONT_HH__
37#define __WAVEFRONT_HH__
38
39#include <cassert>
40#include <deque>
41#include <memory>
42#include <stack>
43#include <vector>
44
45#include "base/misc.hh"
46#include "base/types.hh"
47#include "gpu-compute/condition_register_state.hh"
48#include "gpu-compute/lds_state.hh"
49#include "gpu-compute/misc.hh"
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36#ifndef __WAVEFRONT_HH__
37#define __WAVEFRONT_HH__
38
39#include <cassert>
40#include <deque>
41#include <memory>
42#include <stack>
43#include <vector>
44
45#include "base/misc.hh"
46#include "base/types.hh"
47#include "gpu-compute/condition_register_state.hh"
48#include "gpu-compute/lds_state.hh"
49#include "gpu-compute/misc.hh"
50#include "gpu-compute/ndrange.hh"
50#include "params/Wavefront.hh"
51#include "sim/sim_object.hh"
52
53static const int MAX_NUM_INSTS_PER_WF = 12;
54
55/**
56 * A reconvergence stack entry conveys the necessary state to implement
57 * control flow divergence.
58 */
struct ReconvergenceStackEntry {
    /**
     * PC of current instruction.
     */
    uint32_t pc;
    /**
     * PC of the immediate post-dominator instruction, i.e., the value of
     * @a pc for the first instruction that will be executed by the wavefront
     * when a reconvergence point is reached.
     */
    uint32_t rpc;
    /**
     * Execution mask: which lanes are active on the control-flow path this
     * entry represents (VectorMask comes from gpu-compute/misc.hh;
     * presumably one bit per lane -- confirm there).
     */
    VectorMask execMask;
};
75
76/*
77 * Arguments for the hsail opcode call, are user defined and variable length.
78 * The hardware/finalizer can support arguments in hardware or use memory to
79 * pass arguments. For now, let's assume that an unlimited number of arguments
 80 * are supported in hardware (the compiler inlines functions whenever it can
81 * anyways, so unless someone is interested in the implications of linking/
82 * library functions, I think this is a reasonable assumption given the typical
83 * size of an OpenCL kernel).
84 *
85 * Note that call args are different than kernel arguments:
 86 * * All work-items in a kernel refer to the same set of kernel arguments
 87 * * Each work-item has its own set of call args. So a call argument at
88 * address 0x4 is different for work-item 0 and work-item 1.
89 *
90 * Ok, the table below shows an example of how we organize the call arguments in
91 * the CallArgMem class.
92 *
93 * int foo(int arg1, double arg2)
94 * ___________________________________________________
95 * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
96 * |---------------------------------------------------|
97 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
98 * |---------------------------------------------------|
99 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
100 * ___________________________________________________
101 */
/**
 * Backing store for HSAIL call-instruction arguments. One flat buffer
 * holds every lane's copy of the callee's return value and arguments,
 * laid out argument-major (all lanes' copies of an argument are
 * contiguous; see the layout table above).
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments; owned by this
    // object (allocated in the ctor, released in the dtor)
    uint8_t *mem;
    // number of lanes in the wavefront
    int wfSize;
    // size of function args (bytes per work-item)
    int funcArgsSizePerItem;

    /**
     * Byte offset into mem of @a lane's copy of the argument located at
     * per-item address @a addr. Arguments are interleaved across lanes,
     * so the per-item address is scaled by the wavefront size before the
     * lane's element offset is added.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // use new[] rather than an unchecked malloc(): allocation failure
        // throws std::bad_alloc instead of returning a null pointer that
        // getLaneAddr()/setLaneAddr() would later dereference
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // non-copyable: the destructor releases mem, so a shallow copy would
    // lead to a double free (rule of three)
    CallArgMem(const CallArgMem &) = delete;
    CallArgMem &operator=(const CallArgMem &) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /**
     * Address of @a lane's copy of the argument at per-item address
     * @a addr; the caller performs the typed access.
     */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /**
     * Write @a val into @a lane's copy of the argument at per-item
     * address @a addr.
     */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
143
144class Wavefront : public SimObject
145{
146 public:
147 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
148 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
149
150 // Base pointer for array of instruction pointers
151 uint64_t basePtr;
152
153 uint32_t oldBarrierCnt;
154 uint32_t barrierCnt;
155 uint32_t barrierId;
156 uint32_t barrierSlots;
157 status_e status;
158 // HW slot id where the WF is mapped to inside a SIMD unit
159 int wfSlotId;
160 int kernId;
161 // SIMD unit where the WV has been scheduled
162 int simdId;
163 // pointer to parent CU
164 ComputeUnit *computeUnit;
165
166 std::deque<GPUDynInstPtr> instructionBuffer;
167
168 bool pendingFetch;
169 bool dropFetch;
170
171 // Condition Register State (for HSAIL simulations only)
172 class ConditionRegisterState *condRegState;
173 // number of single precision VGPRs required by WF
174 uint32_t maxSpVgprs;
175 // number of double precision VGPRs required by WF
176 uint32_t maxDpVgprs;
177 // map virtual to physical vector register
178 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
179 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
180 bool isGmInstruction(GPUDynInstPtr ii);
181 bool isLmInstruction(GPUDynInstPtr ii);
182 bool isOldestInstGMem();
183 bool isOldestInstLMem();
184 bool isOldestInstPrivMem();
185 bool isOldestInstFlatMem();
186 bool isOldestInstALU();
187 bool isOldestInstBarrier();
188 // used for passing spill address to DDInstGPU
189 std::vector<Addr> lastAddr;
190 std::vector<uint32_t> workItemId[3];
191 std::vector<uint32_t> workItemFlatId;
51#include "params/Wavefront.hh"
52#include "sim/sim_object.hh"
53
54static const int MAX_NUM_INSTS_PER_WF = 12;
55
56/**
57 * A reconvergence stack entry conveys the necessary state to implement
58 * control flow divergence.
59 */
struct ReconvergenceStackEntry {
    /**
     * PC of current instruction.
     */
    uint32_t pc;
    /**
     * PC of the immediate post-dominator instruction, i.e., the value of
     * @a pc for the first instruction that will be executed by the wavefront
     * when a reconvergence point is reached.
     */
    uint32_t rpc;
    /**
     * Execution mask: which lanes are active on the control-flow path this
     * entry represents (VectorMask comes from gpu-compute/misc.hh;
     * presumably one bit per lane -- confirm there).
     */
    VectorMask execMask;
};
76
77/*
78 * Arguments for the hsail opcode call, are user defined and variable length.
79 * The hardware/finalizer can support arguments in hardware or use memory to
80 * pass arguments. For now, let's assume that an unlimited number of arguments
 81 * are supported in hardware (the compiler inlines functions whenever it
82 * anyways, so unless someone is interested in the implications of linking/
83 * library functions, I think this is a reasonable assumption given the typical
84 * size of an OpenCL kernel).
85 *
86 * Note that call args are different than kernel arguments:
 87 * * All work-items in a kernel refer to the same set of kernel arguments
 88 * Each work-item has its own set of call args. So a call argument at
89 * address 0x4 is different for work-item 0 and work-item 1.
90 *
91 * Ok, the table below shows an example of how we organize the call arguments in
92 * the CallArgMem class.
93 *
94 * int foo(int arg1, double arg2)
95 * ___________________________________________________
96 * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
97 * |---------------------------------------------------|
98 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
99 * |---------------------------------------------------|
100 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
101 * ___________________________________________________
102 */
/**
 * Backing store for HSAIL call-instruction arguments. One flat buffer
 * holds every lane's copy of the callee's return value and arguments,
 * laid out argument-major (all lanes' copies of an argument are
 * contiguous; see the layout table above).
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments; owned by this
    // object (allocated in the ctor, released in the dtor)
    uint8_t *mem;
    // number of lanes in the wavefront
    int wfSize;
    // size of function args (bytes per work-item)
    int funcArgsSizePerItem;

    /**
     * Byte offset into mem of @a lane's copy of the argument located at
     * per-item address @a addr. Arguments are interleaved across lanes,
     * so the per-item address is scaled by the wavefront size before the
     * lane's element offset is added.
     */
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        // use new[] rather than an unchecked malloc(): allocation failure
        // throws std::bad_alloc instead of returning a null pointer that
        // getLaneAddr()/setLaneAddr() would later dereference
        mem = new uint8_t[funcArgsSizePerItem * wfSize];
    }

    // non-copyable: the destructor releases mem, so a shallow copy would
    // lead to a double free (rule of three)
    CallArgMem(const CallArgMem &) = delete;
    CallArgMem &operator=(const CallArgMem &) = delete;

    ~CallArgMem()
    {
        delete[] mem;
    }

    /**
     * Address of @a lane's copy of the argument at per-item address
     * @a addr; the caller performs the typed access.
     */
    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    /**
     * Write @a val into @a lane's copy of the argument at per-item
     * address @a addr.
     */
    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
144
145class Wavefront : public SimObject
146{
147 public:
148 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
149 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
150
151 // Base pointer for array of instruction pointers
152 uint64_t basePtr;
153
154 uint32_t oldBarrierCnt;
155 uint32_t barrierCnt;
156 uint32_t barrierId;
157 uint32_t barrierSlots;
158 status_e status;
159 // HW slot id where the WF is mapped to inside a SIMD unit
160 int wfSlotId;
161 int kernId;
162 // SIMD unit where the WV has been scheduled
163 int simdId;
164 // pointer to parent CU
165 ComputeUnit *computeUnit;
166
167 std::deque<GPUDynInstPtr> instructionBuffer;
168
169 bool pendingFetch;
170 bool dropFetch;
171
172 // Condition Register State (for HSAIL simulations only)
173 class ConditionRegisterState *condRegState;
174 // number of single precision VGPRs required by WF
175 uint32_t maxSpVgprs;
176 // number of double precision VGPRs required by WF
177 uint32_t maxDpVgprs;
178 // map virtual to physical vector register
179 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
180 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
181 bool isGmInstruction(GPUDynInstPtr ii);
182 bool isLmInstruction(GPUDynInstPtr ii);
183 bool isOldestInstGMem();
184 bool isOldestInstLMem();
185 bool isOldestInstPrivMem();
186 bool isOldestInstFlatMem();
187 bool isOldestInstALU();
188 bool isOldestInstBarrier();
189 // used for passing spill address to DDInstGPU
190 std::vector<Addr> lastAddr;
191 std::vector<uint32_t> workItemId[3];
192 std::vector<uint32_t> workItemFlatId;
193 /* kernel launch parameters */
192 uint32_t workGroupId[3];
193 uint32_t workGroupSz[3];
194 uint32_t gridSz[3];
195 uint32_t wgId;
196 uint32_t wgSz;
194 uint32_t workGroupId[3];
195 uint32_t workGroupSz[3];
196 uint32_t gridSz[3];
197 uint32_t wgId;
198 uint32_t wgSz;
199 /* the actual WG size can differ than the maximum size */
200 uint32_t actualWgSz[3];
201 uint32_t actualWgSzTotal;
202 void computeActualWgSz(NDRange *ndr);
197 // wavefront id within a workgroup
198 uint32_t wfId;
199 uint32_t maxDynWaveId;
200 uint32_t dispatchId;
201 // outstanding global+local memory requests
202 uint32_t outstandingReqs;
203 // memory requests between scoreboard
204 // and execute stage not yet executed
205 uint32_t memReqsInPipe;
206 // outstanding global memory write requests
207 uint32_t outstandingReqsWrGm;
208 // outstanding local memory write requests
209 uint32_t outstandingReqsWrLm;
210 // outstanding global memory read requests
211 uint32_t outstandingReqsRdGm;
212 // outstanding local memory read requests
213 uint32_t outstandingReqsRdLm;
214 uint32_t rdLmReqsInPipe;
215 uint32_t rdGmReqsInPipe;
216 uint32_t wrLmReqsInPipe;
217 uint32_t wrGmReqsInPipe;
218
219 int memTraceBusy;
220 uint64_t lastTrace;
221 // number of vector registers reserved by WF
222 int reservedVectorRegs;
223 // Index into the Vector Register File's namespace where the WF's registers
224 // will live while the WF is executed
225 uint32_t startVgprIndex;
226
227 // Old value of destination gpr (for trace)
228 std::vector<uint32_t> oldVgpr;
229 // Id of destination gpr (for trace)
230 uint32_t oldVgprId;
231 // Tick count of last old_vgpr copy
232 uint64_t oldVgprTcnt;
233
234 // Old value of destination gpr (for trace)
235 std::vector<uint64_t> oldDgpr;
236 // Id of destination gpr (for trace)
237 uint32_t oldDgprId;
238 // Tick count of last old_vgpr copy
239 uint64_t oldDgprTcnt;
240
241 // Execution mask at wavefront start
242 VectorMask initMask;
243
244 // number of barriers this WF has joined
245 std::vector<int> barCnt;
246 int maxBarCnt;
247 // Flag to stall a wave on barrier
248 bool stalledAtBarrier;
249
250 // a pointer to the fraction of the LDS allocated
251 // to this workgroup (thus this wavefront)
252 LdsChunk *ldsChunk;
253
254 // A pointer to the spill area
255 Addr spillBase;
256 // The size of the spill area
257 uint32_t spillSizePerItem;
258 // The vector width of the spill area
259 uint32_t spillWidth;
260
261 // A pointer to the private memory area
262 Addr privBase;
263 // The size of the private memory area
264 uint32_t privSizePerItem;
265
 266 // A pointer to the read-only memory area
267 Addr roBase;
268 // size of the read-only memory area
269 uint32_t roSize;
270
271 // pointer to buffer for storing kernel arguments
272 uint8_t *kernelArgs;
273 // unique WF id over all WFs executed across all CUs
274 uint64_t wfDynId;
275
276 // number of times instruction issue for this wavefront is blocked
277 // due to VRF port availability
278 Stats::Scalar numTimesBlockedDueVrfPortAvail;
279 // number of times an instruction of a WF is blocked from being issued
280 // due to WAR and WAW dependencies
281 Stats::Scalar numTimesBlockedDueWAXDependencies;
282 // number of times an instruction of a WF is blocked from being issued
283 // due to WAR and WAW dependencies
284 Stats::Scalar numTimesBlockedDueRAWDependencies;
285 // distribution of executed instructions based on their register
286 // operands; this is used to highlight the load on the VRF
287 Stats::Distribution srcRegOpDist;
288 Stats::Distribution dstRegOpDist;
289
290 // Functions to operate on call argument memory
291 // argument memory for hsail call instruction
292 CallArgMem *callArgMem;
    // allocate the call-argument backing store for this wavefront
    // NOTE(review): any previously assigned callArgMem is not freed here;
    // presumably this is invoked at most once per dispatch -- confirm
    // with callers
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }
298
    // read this lane's copy of the call argument located at per-item
    // address addr from the wavefront's argument memory
    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }
305
    // write val into this lane's copy of the call argument located at
    // per-item address addr in the wavefront's argument memory
    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }
312
313 typedef WavefrontParams Params;
314 Wavefront(const Params *p);
315 ~Wavefront();
316 virtual void init();
317
    // record the parent ComputeUnit this wavefront executes on
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }
323
324 void start(uint64_t _wfDynId, uint64_t _base_ptr);
325 void exec();
326 void updateResources();
327 int ready(itype_e type);
328 bool instructionBufferHasBranch();
329 void regStats();
    // lanes that are both active in the current exec mask and were enabled
    // at wavefront start (initMask)
    VectorMask getPred() { return execMask() & initMask; }
331
332 bool waitingAtBarrier(int lane);
333
334 void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
335 const VectorMask& exec_mask);
336
337 void popFromReconvergenceStack();
338
339 uint32_t pc() const;
340
341 uint32_t rpc() const;
342
343 VectorMask execMask() const;
344
345 bool execMask(int lane) const;
346
347 void pc(uint32_t new_pc);
348
349 void discardFetch();
350
351 /**
352 * Returns the size of the static hardware context of a particular wavefront
353 * This should be updated everytime the context is changed
354 */
355 uint32_t getStaticContextSize() const;
356
357 /**
358 * Returns the hardware context as a stream of bytes
359 * This method is designed for HSAIL execution
360 */
361 void getContext(const void *out);
362
363 /**
 364 * Sets the hardware context from a stream of bytes
365 * This method is designed for HSAIL execution
366 */
367 void setContext(const void *in);
368
369 private:
370 /**
371 * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
372 * to be visited by the wavefront, and the associated execution masks. The
373 * reconvergence stack grows every time the wavefront reaches a divergence
374 * point (branch instruction), and shrinks every time the wavefront
375 * reaches a reconvergence point (immediate post-dominator instruction).
376 */
377 std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
378};
379
380#endif // __WAVEFRONT_HH__
203 // wavefront id within a workgroup
204 uint32_t wfId;
205 uint32_t maxDynWaveId;
206 uint32_t dispatchId;
207 // outstanding global+local memory requests
208 uint32_t outstandingReqs;
209 // memory requests between scoreboard
210 // and execute stage not yet executed
211 uint32_t memReqsInPipe;
212 // outstanding global memory write requests
213 uint32_t outstandingReqsWrGm;
214 // outstanding local memory write requests
215 uint32_t outstandingReqsWrLm;
216 // outstanding global memory read requests
217 uint32_t outstandingReqsRdGm;
218 // outstanding local memory read requests
219 uint32_t outstandingReqsRdLm;
220 uint32_t rdLmReqsInPipe;
221 uint32_t rdGmReqsInPipe;
222 uint32_t wrLmReqsInPipe;
223 uint32_t wrGmReqsInPipe;
224
225 int memTraceBusy;
226 uint64_t lastTrace;
227 // number of vector registers reserved by WF
228 int reservedVectorRegs;
229 // Index into the Vector Register File's namespace where the WF's registers
230 // will live while the WF is executed
231 uint32_t startVgprIndex;
232
233 // Old value of destination gpr (for trace)
234 std::vector<uint32_t> oldVgpr;
235 // Id of destination gpr (for trace)
236 uint32_t oldVgprId;
237 // Tick count of last old_vgpr copy
238 uint64_t oldVgprTcnt;
239
240 // Old value of destination gpr (for trace)
241 std::vector<uint64_t> oldDgpr;
242 // Id of destination gpr (for trace)
243 uint32_t oldDgprId;
244 // Tick count of last old_vgpr copy
245 uint64_t oldDgprTcnt;
246
247 // Execution mask at wavefront start
248 VectorMask initMask;
249
250 // number of barriers this WF has joined
251 std::vector<int> barCnt;
252 int maxBarCnt;
253 // Flag to stall a wave on barrier
254 bool stalledAtBarrier;
255
256 // a pointer to the fraction of the LDS allocated
257 // to this workgroup (thus this wavefront)
258 LdsChunk *ldsChunk;
259
260 // A pointer to the spill area
261 Addr spillBase;
262 // The size of the spill area
263 uint32_t spillSizePerItem;
264 // The vector width of the spill area
265 uint32_t spillWidth;
266
267 // A pointer to the private memory area
268 Addr privBase;
269 // The size of the private memory area
270 uint32_t privSizePerItem;
271
 272 // A pointer to the read-only memory area
273 Addr roBase;
274 // size of the read-only memory area
275 uint32_t roSize;
276
277 // pointer to buffer for storing kernel arguments
278 uint8_t *kernelArgs;
279 // unique WF id over all WFs executed across all CUs
280 uint64_t wfDynId;
281
282 // number of times instruction issue for this wavefront is blocked
283 // due to VRF port availability
284 Stats::Scalar numTimesBlockedDueVrfPortAvail;
285 // number of times an instruction of a WF is blocked from being issued
286 // due to WAR and WAW dependencies
287 Stats::Scalar numTimesBlockedDueWAXDependencies;
288 // number of times an instruction of a WF is blocked from being issued
289 // due to WAR and WAW dependencies
290 Stats::Scalar numTimesBlockedDueRAWDependencies;
291 // distribution of executed instructions based on their register
292 // operands; this is used to highlight the load on the VRF
293 Stats::Distribution srcRegOpDist;
294 Stats::Distribution dstRegOpDist;
295
296 // Functions to operate on call argument memory
297 // argument memory for hsail call instruction
298 CallArgMem *callArgMem;
    // allocate the call-argument backing store for this wavefront
    // NOTE(review): any previously assigned callArgMem is not freed here;
    // presumably this is invoked at most once per dispatch -- confirm
    // with callers
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }
304
    // read this lane's copy of the call argument located at per-item
    // address addr from the wavefront's argument memory
    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }
311
    // write val into this lane's copy of the call argument located at
    // per-item address addr in the wavefront's argument memory
    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }
318
319 typedef WavefrontParams Params;
320 Wavefront(const Params *p);
321 ~Wavefront();
322 virtual void init();
323
    // record the parent ComputeUnit this wavefront executes on
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }
329
330 void start(uint64_t _wfDynId, uint64_t _base_ptr);
331 void exec();
332 void updateResources();
333 int ready(itype_e type);
334 bool instructionBufferHasBranch();
335 void regStats();
    // lanes that are both active in the current exec mask and were enabled
    // at wavefront start (initMask)
    VectorMask getPred() { return execMask() & initMask; }
337
338 bool waitingAtBarrier(int lane);
339
340 void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
341 const VectorMask& exec_mask);
342
343 void popFromReconvergenceStack();
344
345 uint32_t pc() const;
346
347 uint32_t rpc() const;
348
349 VectorMask execMask() const;
350
351 bool execMask(int lane) const;
352
353 void pc(uint32_t new_pc);
354
355 void discardFetch();
356
357 /**
358 * Returns the size of the static hardware context of a particular wavefront
359 * This should be updated everytime the context is changed
360 */
361 uint32_t getStaticContextSize() const;
362
363 /**
364 * Returns the hardware context as a stream of bytes
365 * This method is designed for HSAIL execution
366 */
367 void getContext(const void *out);
368
369 /**
 370 * Sets the hardware context from a stream of bytes
371 * This method is designed for HSAIL execution
372 */
373 void setContext(const void *in);
374
375 private:
376 /**
377 * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
378 * to be visited by the wavefront, and the associated execution masks. The
379 * reconvergence stack grows every time the wavefront reaches a divergence
380 * point (branch instruction), and shrinks every time the wavefront
381 * reaches a reconvergence point (immediate post-dominator instruction).
382 */
383 std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
384};
385
386#endif // __WAVEFRONT_HH__