/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Lisa Hsu
 */

#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__

#include <cassert>
#include <deque>
#include <memory>
#include <stack>
#include <vector>

#include "base/misc.hh"
#include "base/types.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"

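// maximum number of instructions buffered per wavefront (presumably the
// capacity bound on Wavefront::instructionBuffer below)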
static const int MAX_NUM_INSTS_PER_WF = 12;

/**
 * A reconvergence stack entry conveys the necessary state to implement
 * control flow divergence.
 */
struct ReconvergenceStackEntry {
    /**
     * PC of current instruction.
     */
    uint32_t pc;
    /**
     * PC of the immediate post-dominator instruction, i.e., the value of
     * @a pc for the first instruction that will be executed by the wavefront
     * when a reconvergence point is reached.
     */
    uint32_t rpc;
    /**
     * Execution mask.
     */
    VectorMask execMask;
};
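
/*
 * Illustrative sketch (an assumption about intended use, not part of the
 * interface): on a divergent branch a wavefront can push one entry per side
 * of the branch, e.g.
 *
 *     wf->pushToReconvergenceStack(taken_pc, postdom_pc, taken_mask);
 *
 * and pop an entry via popFromReconvergenceStack() once pc() reaches rpc(),
 * at which point execMask() reflects the mask stored in the new top entry.
 */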

/*
 * Arguments for the HSAIL call opcode are user defined and of variable
 * length. The hardware/finalizer can support arguments in hardware or use
 * memory to pass arguments. For now, let's assume that an unlimited number
 * of arguments is supported in hardware (the compiler inlines functions
 * whenever it can anyway, so unless someone is interested in the
 * implications of linking/library functions, I think this is a reasonable
 * assumption given the typical size of an OpenCL kernel).
 *
 * Note that call args are different from kernel arguments:
 * * All work-items in a kernel refer to the same set of kernel arguments.
 * * Each work-item has its own set of call args, so a call argument at
 *   address 0x4 is different for work-item 0 and work-item 1.
 *
 * The table below shows an example of how we organize the call arguments in
 * the CallArgMem class.
 *
 * int foo(int arg1, double arg2)
 *  ___________________________________________________
 * | 0: return.0 | 4: return.1 | ... | 252: return.63  |
 * |---------------------------------------------------|
 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63    |
 * |---------------------------------------------------|
 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63   |
 * |___________________________________________________|
 */
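
/*
 * Worked example of the layout above (wfSize = 64): arg1 lives at
 * per-work-item offset 4 (after the 4-byte return slot), so lane 1's copy
 * sits at getLaneOffset<int>(1, 4) = 4 * 64 + sizeof(int) * 1 = 260,
 * matching the "260: arg1.1" cell in the table.
 */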
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments
    uint8_t *mem;
    int wfSize;
    // size of function args
    int funcArgsSizePerItem;

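    // Compute the byte offset of one lane's copy of an argument: addr is
    // the argument's per-work-item offset, and the wfSize copies of a given
    // argument are stored contiguously, one per lane.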
    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
    }

    ~CallArgMem()
    {
        free(mem);
    }

    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};

class Wavefront : public SimObject
{
  public:
    enum itype_e {I_ALU, I_GLOBAL, I_SHARED, I_FLAT, I_PRIVATE};
    enum status_e {S_STOPPED, S_RETURNING, S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t basePtr;

    uint32_t oldBarrierCnt;
    uint32_t barrierCnt;
    uint32_t barrierId;
    uint32_t barrierSlots;
    status_e status;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    int kernId;
    // SIMD unit where the WF has been scheduled
    int simdId;
    // pointer to parent CU
    ComputeUnit *computeUnit;

    std::deque<GPUDynInstPtr> instructionBuffer;

    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to GPUDynInst
    std::vector<Addr> lastAddr;
    std::vector<uint32_t> workItemId[3];
    std::vector<uint32_t> workItemFlatId;
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    uint32_t wgId;
    uint32_t wgSz;
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // outstanding global+local memory requests
    uint32_t outstandingReqs;
    // memory requests between scoreboard
    // and execute stage not yet executed
    uint32_t memReqsInPipe;
    // outstanding global memory write requests
    uint32_t outstandingReqsWrGm;
    // outstanding local memory write requests
    uint32_t outstandingReqsWrLm;
    // outstanding global memory read requests
    uint32_t outstandingReqsRdGm;
    // outstanding local memory read requests
    uint32_t outstandingReqsRdLm;
    uint32_t rdLmReqsInPipe;
    uint32_t rdGmReqsInPipe;
    uint32_t wrLmReqsInPipe;
    uint32_t wrGmReqsInPipe;

    int memTraceBusy;
    uint64_t lastTrace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination vgpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination vgpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of destination dgpr (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination dgpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_dgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;

    // number of barriers this WF has joined
    std::vector<int> barCnt;
    int maxBarCnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // The base address of the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // The base address of the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // The base address of the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for the HSAIL call instruction
    CallArgMem *callArgMem;
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }
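
    // Illustrative use (hypothetical sizes/values): after
    // initCallArgMem(func_args_size_per_item, wf_size),
    // writeCallArgMem<int>(0, 4, 42) stores 42 as lane 0's copy of the
    // argument at per-work-item offset 4, and readCallArgMem<int>(0, 4)
    // reads it back.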

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
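    // predicate: lanes that are currently active (execMask()) and were also
    // active in the wavefront's initial mask (initMask)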
    VectorMask getPred() { return execMask() & initMask; }

    bool waitingAtBarrier(int lane);

    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

    /**
     * Returns the size of the static hardware context of a particular
     * wavefront. This should be updated every time the context is changed.
     */
    uint32_t getStaticContextSize() const;

  private:
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};

#endif // __WAVEFRONT_HH__