1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Lisa Hsu 34 */ 35 36#ifndef __WAVEFRONT_HH__ 37#define __WAVEFRONT_HH__ 38 39#include <cassert> 40#include <deque> 41#include <memory> 42#include <stack> 43#include <vector> 44 45#include "base/misc.hh" 46#include "base/types.hh" 47#include "gpu-compute/condition_register_state.hh" 48#include "gpu-compute/lds_state.hh" 49#include "gpu-compute/misc.hh"
| 1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Author: Lisa Hsu 34 */ 35 36#ifndef __WAVEFRONT_HH__ 37#define __WAVEFRONT_HH__ 38 39#include <cassert> 40#include <deque> 41#include <memory> 42#include <stack> 43#include <vector> 44 45#include "base/misc.hh" 46#include "base/types.hh" 47#include "gpu-compute/condition_register_state.hh" 48#include "gpu-compute/lds_state.hh" 49#include "gpu-compute/misc.hh"
|
| 50#include "gpu-compute/ndrange.hh"
|
50#include "params/Wavefront.hh" 51#include "sim/sim_object.hh" 52 53static const int MAX_NUM_INSTS_PER_WF = 12; 54 55/** 56 * A reconvergence stack entry conveys the necessary state to implement 57 * control flow divergence. 58 */ 59struct ReconvergenceStackEntry { 60 /** 61 * PC of current instruction. 62 */ 63 uint32_t pc; 64 /** 65 * PC of the immediate post-dominator instruction, i.e., the value of 66 * @a pc for the first instruction that will be executed by the wavefront 67 * when a reconvergence point is reached. 68 */ 69 uint32_t rpc; 70 /** 71 * Execution mask. 72 */ 73 VectorMask execMask; 74}; 75 76/* 77 * Arguments for the hsail opcode call, are user defined and variable length. 78 * The hardware/finalizer can support arguments in hardware or use memory to 79 * pass arguments. For now, let's assume that an unlimited number of arguments 80 * are supported in hardware (the compiler inlines functions whenver it can 81 * anyways, so unless someone is interested in the implications of linking/ 82 * library functions, I think this is a reasonable assumption given the typical 83 * size of an OpenCL kernel). 84 * 85 * Note that call args are different than kernel arguments: 86 * * All work-items in a kernel refer the same set of kernel arguments 87 * * Each work-item has it's on set of call args. So a call argument at 88 * address 0x4 is different for work-item 0 and work-item 1. 89 * 90 * Ok, the table below shows an example of how we organize the call arguments in 91 * the CallArgMem class. 92 * 93 * int foo(int arg1, double arg2) 94 * ___________________________________________________ 95 * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 96 * |---------------------------------------------------| 97 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 98 * |---------------------------------------------------| 99 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | 100 * ___________________________________________________ 101 */ 102class CallArgMem 103{ 104 public: 105 // pointer to buffer for storing function arguments 106 uint8_t *mem; 107 int wfSize; 108 // size of function args 109 int funcArgsSizePerItem; 110 111 template<typename CType> 112 int 113 getLaneOffset(int lane, int addr) 114 { 115 return addr * wfSize + sizeof(CType) * lane; 116 } 117 118 CallArgMem(int func_args_size_per_item, int wf_size) 119 : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) 120 { 121 mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); 122 } 123 124 ~CallArgMem() 125 { 126 free(mem); 127 } 128 129 template<typename CType> 130 uint8_t* 131 getLaneAddr(int lane, int addr) 132 { 133 return mem + getLaneOffset<CType>(lane, addr); 134 } 135 136 template<typename CType> 137 void 138 setLaneAddr(int lane, int addr, CType val) 139 { 140 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 141 } 142}; 143 144class Wavefront : public SimObject 145{ 146 public: 147 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 148 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 149 150 // Base pointer for array of instruction pointers 151 uint64_t basePtr; 152 153 uint32_t oldBarrierCnt; 154 uint32_t barrierCnt; 155 uint32_t barrierId; 156 uint32_t barrierSlots; 157 status_e status; 158 // HW slot id where the WF is mapped to inside a SIMD unit 159 int wfSlotId; 160 int kernId; 161 // SIMD unit where the WV has been scheduled 162 int simdId; 163 // pointer to parent CU 164 ComputeUnit *computeUnit; 165 166 std::deque<GPUDynInstPtr> instructionBuffer; 167 168 bool pendingFetch; 169 bool dropFetch; 170 171 // Condition Register State (for HSAIL simulations only) 172 class ConditionRegisterState *condRegState; 173 // number of single precision VGPRs required by WF 174 uint32_t maxSpVgprs; 175 // number of double precision VGPRs required by WF 176 uint32_t maxDpVgprs; 177 // map virtual to physical vector register 178 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); 179 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 180 bool isGmInstruction(GPUDynInstPtr ii); 181 bool isLmInstruction(GPUDynInstPtr ii); 182 bool isOldestInstGMem(); 183 bool isOldestInstLMem(); 184 bool isOldestInstPrivMem(); 185 bool isOldestInstFlatMem(); 186 bool isOldestInstALU(); 187 bool isOldestInstBarrier(); 188 // used for passing spill address to DDInstGPU 189 std::vector<Addr> lastAddr; 190 std::vector<uint32_t> workItemId[3]; 191 std::vector<uint32_t> workItemFlatId;
| 51#include "params/Wavefront.hh" 52#include "sim/sim_object.hh" 53 54static const int MAX_NUM_INSTS_PER_WF = 12; 55 56/** 57 * A reconvergence stack entry conveys the necessary state to implement 58 * control flow divergence. 59 */ 60struct ReconvergenceStackEntry { 61 /** 62 * PC of current instruction. 63 */ 64 uint32_t pc; 65 /** 66 * PC of the immediate post-dominator instruction, i.e., the value of 67 * @a pc for the first instruction that will be executed by the wavefront 68 * when a reconvergence point is reached. 69 */ 70 uint32_t rpc; 71 /** 72 * Execution mask. 73 */ 74 VectorMask execMask; 75}; 76 77/* 78 * Arguments for the hsail opcode call, are user defined and variable length. 79 * The hardware/finalizer can support arguments in hardware or use memory to 80 * pass arguments. For now, let's assume that an unlimited number of arguments 81 * are supported in hardware (the compiler inlines functions whenver it can 82 * anyways, so unless someone is interested in the implications of linking/ 83 * library functions, I think this is a reasonable assumption given the typical 84 * size of an OpenCL kernel). 85 * 86 * Note that call args are different than kernel arguments: 87 * * All work-items in a kernel refer the same set of kernel arguments 88 * * Each work-item has it's on set of call args. So a call argument at 89 * address 0x4 is different for work-item 0 and work-item 1. 90 * 91 * Ok, the table below shows an example of how we organize the call arguments in 92 * the CallArgMem class. 93 * 94 * int foo(int arg1, double arg2) 95 * ___________________________________________________ 96 * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 97 * |---------------------------------------------------| 98 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 99 * |---------------------------------------------------| 100 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | 101 * ___________________________________________________ 102 */ 103class CallArgMem 104{ 105 public: 106 // pointer to buffer for storing function arguments 107 uint8_t *mem; 108 int wfSize; 109 // size of function args 110 int funcArgsSizePerItem; 111 112 template<typename CType> 113 int 114 getLaneOffset(int lane, int addr) 115 { 116 return addr * wfSize + sizeof(CType) * lane; 117 } 118 119 CallArgMem(int func_args_size_per_item, int wf_size) 120 : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) 121 { 122 mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); 123 } 124 125 ~CallArgMem() 126 { 127 free(mem); 128 } 129 130 template<typename CType> 131 uint8_t* 132 getLaneAddr(int lane, int addr) 133 { 134 return mem + getLaneOffset<CType>(lane, addr); 135 } 136 137 template<typename CType> 138 void 139 setLaneAddr(int lane, int addr, CType val) 140 { 141 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 142 } 143}; 144 145class Wavefront : public SimObject 146{ 147 public: 148 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 149 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 150 151 // Base pointer for array of instruction pointers 152 uint64_t basePtr; 153 154 uint32_t oldBarrierCnt; 155 uint32_t barrierCnt; 156 uint32_t barrierId; 157 uint32_t barrierSlots; 158 status_e status; 159 // HW slot id where the WF is mapped to inside a SIMD unit 160 int wfSlotId; 161 int kernId; 162 // SIMD unit where the WV has been scheduled 163 int simdId; 164 // pointer to parent CU 165 ComputeUnit *computeUnit; 166 167 std::deque<GPUDynInstPtr> instructionBuffer; 168 169 bool pendingFetch; 170 bool dropFetch; 171 172 // Condition Register State (for HSAIL simulations only) 173 class ConditionRegisterState *condRegState; 174 // number of single precision VGPRs required by WF 175 uint32_t maxSpVgprs; 176 // number of double precision VGPRs required by WF 177 uint32_t maxDpVgprs; 178 // map virtual to physical vector register 179 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); 180 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 181 bool isGmInstruction(GPUDynInstPtr ii); 182 bool isLmInstruction(GPUDynInstPtr ii); 183 bool isOldestInstGMem(); 184 bool isOldestInstLMem(); 185 bool isOldestInstPrivMem(); 186 bool isOldestInstFlatMem(); 187 bool isOldestInstALU(); 188 bool isOldestInstBarrier(); 189 // used for passing spill address to DDInstGPU 190 std::vector<Addr> lastAddr; 191 std::vector<uint32_t> workItemId[3]; 192 std::vector<uint32_t> workItemFlatId;
|
| 193 /* kernel launch parameters */
|
192 uint32_t workGroupId[3]; 193 uint32_t workGroupSz[3]; 194 uint32_t gridSz[3]; 195 uint32_t wgId; 196 uint32_t wgSz;
| 194 uint32_t workGroupId[3]; 195 uint32_t workGroupSz[3]; 196 uint32_t gridSz[3]; 197 uint32_t wgId; 198 uint32_t wgSz;
|
| 199 /* the actual WG size can differ than the maximum size */ 200 uint32_t actualWgSz[3]; 201 uint32_t actualWgSzTotal; 202 void computeActualWgSz(NDRange *ndr);
|
197 // wavefront id within a workgroup 198 uint32_t wfId; 199 uint32_t maxDynWaveId; 200 uint32_t dispatchId; 201 // outstanding global+local memory requests 202 uint32_t outstandingReqs; 203 // memory requests between scoreboard 204 // and execute stage not yet executed 205 uint32_t memReqsInPipe; 206 // outstanding global memory write requests 207 uint32_t outstandingReqsWrGm; 208 // outstanding local memory write requests 209 uint32_t outstandingReqsWrLm; 210 // outstanding global memory read requests 211 uint32_t outstandingReqsRdGm; 212 // outstanding local memory read requests 213 uint32_t outstandingReqsRdLm; 214 uint32_t rdLmReqsInPipe; 215 uint32_t rdGmReqsInPipe; 216 uint32_t wrLmReqsInPipe; 217 uint32_t wrGmReqsInPipe; 218 219 int memTraceBusy; 220 uint64_t lastTrace; 221 // number of vector registers reserved by WF 222 int reservedVectorRegs; 223 // Index into the Vector Register File's namespace where the WF's registers 224 // will live while the WF is executed 225 uint32_t startVgprIndex; 226 227 // Old value of destination gpr (for trace) 228 std::vector<uint32_t> oldVgpr; 229 // Id of destination gpr (for trace) 230 uint32_t oldVgprId; 231 // Tick count of last old_vgpr copy 232 uint64_t oldVgprTcnt; 233 234 // Old value of destination gpr (for trace) 235 std::vector<uint64_t> oldDgpr; 236 // Id of destination gpr (for trace) 237 uint32_t oldDgprId; 238 // Tick count of last old_vgpr copy 239 uint64_t oldDgprTcnt; 240 241 // Execution mask at wavefront start 242 VectorMask initMask; 243 244 // number of barriers this WF has joined 245 std::vector<int> barCnt; 246 int maxBarCnt; 247 // Flag to stall a wave on barrier 248 bool stalledAtBarrier; 249 250 // a pointer to the fraction of the LDS allocated 251 // to this workgroup (thus this wavefront) 252 LdsChunk *ldsChunk; 253 254 // A pointer to the spill area 255 Addr spillBase; 256 // The size of the spill area 257 uint32_t spillSizePerItem; 258 // The vector width of the spill area 259 uint32_t spillWidth; 260 261 // A pointer to the private memory area 262 Addr privBase; 263 // The size of the private memory area 264 uint32_t privSizePerItem; 265 266 // A pointer ot the read-only memory area 267 Addr roBase; 268 // size of the read-only memory area 269 uint32_t roSize; 270 271 // pointer to buffer for storing kernel arguments 272 uint8_t *kernelArgs; 273 // unique WF id over all WFs executed across all CUs 274 uint64_t wfDynId; 275 276 // number of times instruction issue for this wavefront is blocked 277 // due to VRF port availability 278 Stats::Scalar numTimesBlockedDueVrfPortAvail; 279 // number of times an instruction of a WF is blocked from being issued 280 // due to WAR and WAW dependencies 281 Stats::Scalar numTimesBlockedDueWAXDependencies; 282 // number of times an instruction of a WF is blocked from being issued 283 // due to WAR and WAW dependencies 284 Stats::Scalar numTimesBlockedDueRAWDependencies; 285 // distribution of executed instructions based on their register 286 // operands; this is used to highlight the load on the VRF 287 Stats::Distribution srcRegOpDist; 288 Stats::Distribution dstRegOpDist; 289 290 // Functions to operate on call argument memory 291 // argument memory for hsail call instruction 292 CallArgMem *callArgMem; 293 void 294 initCallArgMem(int func_args_size_per_item, int wf_size) 295 { 296 callArgMem = new CallArgMem(func_args_size_per_item, wf_size); 297 } 298 299 template<typename CType> 300 CType 301 readCallArgMem(int lane, int addr) 302 { 303 return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); 304 } 305 306 template<typename CType> 307 void 308 writeCallArgMem(int lane, int addr, CType val) 309 { 310 callArgMem->setLaneAddr<CType>(lane, addr, val); 311 } 312 313 typedef WavefrontParams Params; 314 Wavefront(const Params *p); 315 ~Wavefront(); 316 virtual void init(); 317 318 void 319 setParent(ComputeUnit *cu) 320 { 321 computeUnit = cu; 322 } 323 324 void start(uint64_t _wfDynId, uint64_t _base_ptr); 325 void exec(); 326 void updateResources(); 327 int ready(itype_e type); 328 bool instructionBufferHasBranch(); 329 void regStats(); 330 VectorMask getPred() { return execMask() & initMask; } 331 332 bool waitingAtBarrier(int lane); 333 334 void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 335 const VectorMask& exec_mask); 336 337 void popFromReconvergenceStack(); 338 339 uint32_t pc() const; 340 341 uint32_t rpc() const; 342 343 VectorMask execMask() const; 344 345 bool execMask(int lane) const; 346 347 void pc(uint32_t new_pc); 348 349 void discardFetch(); 350 351 /** 352 * Returns the size of the static hardware context of a particular wavefront 353 * This should be updated everytime the context is changed 354 */ 355 uint32_t getStaticContextSize() const; 356 357 /** 358 * Returns the hardware context as a stream of bytes 359 * This method is designed for HSAIL execution 360 */ 361 void getContext(const void *out); 362 363 /** 364 * Sets the hardware context fromt a stream of bytes 365 * This method is designed for HSAIL execution 366 */ 367 void setContext(const void *in); 368 369 private: 370 /** 371 * Stack containing Control Flow Graph nodes (i.e., kernel instructions) 372 * to be visited by the wavefront, and the associated execution masks. The 373 * reconvergence stack grows every time the wavefront reaches a divergence 374 * point (branch instruction), and shrinks every time the wavefront 375 * reaches a reconvergence point (immediate post-dominator instruction). 376 */ 377 std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; 378}; 379 380#endif // __WAVEFRONT_HH__
| 203 // wavefront id within a workgroup 204 uint32_t wfId; 205 uint32_t maxDynWaveId; 206 uint32_t dispatchId; 207 // outstanding global+local memory requests 208 uint32_t outstandingReqs; 209 // memory requests between scoreboard 210 // and execute stage not yet executed 211 uint32_t memReqsInPipe; 212 // outstanding global memory write requests 213 uint32_t outstandingReqsWrGm; 214 // outstanding local memory write requests 215 uint32_t outstandingReqsWrLm; 216 // outstanding global memory read requests 217 uint32_t outstandingReqsRdGm; 218 // outstanding local memory read requests 219 uint32_t outstandingReqsRdLm; 220 uint32_t rdLmReqsInPipe; 221 uint32_t rdGmReqsInPipe; 222 uint32_t wrLmReqsInPipe; 223 uint32_t wrGmReqsInPipe; 224 225 int memTraceBusy; 226 uint64_t lastTrace; 227 // number of vector registers reserved by WF 228 int reservedVectorRegs; 229 // Index into the Vector Register File's namespace where the WF's registers 230 // will live while the WF is executed 231 uint32_t startVgprIndex; 232 233 // Old value of destination gpr (for trace) 234 std::vector<uint32_t> oldVgpr; 235 // Id of destination gpr (for trace) 236 uint32_t oldVgprId; 237 // Tick count of last old_vgpr copy 238 uint64_t oldVgprTcnt; 239 240 // Old value of destination gpr (for trace) 241 std::vector<uint64_t> oldDgpr; 242 // Id of destination gpr (for trace) 243 uint32_t oldDgprId; 244 // Tick count of last old_vgpr copy 245 uint64_t oldDgprTcnt; 246 247 // Execution mask at wavefront start 248 VectorMask initMask; 249 250 // number of barriers this WF has joined 251 std::vector<int> barCnt; 252 int maxBarCnt; 253 // Flag to stall a wave on barrier 254 bool stalledAtBarrier; 255 256 // a pointer to the fraction of the LDS allocated 257 // to this workgroup (thus this wavefront) 258 LdsChunk *ldsChunk; 259 260 // A pointer to the spill area 261 Addr spillBase; 262 // The size of the spill area 263 uint32_t spillSizePerItem; 264 // The vector width of the spill area 265 uint32_t spillWidth; 266 267 // A pointer to the private memory area 268 Addr privBase; 269 // The size of the private memory area 270 uint32_t privSizePerItem; 271 272 // A pointer ot the read-only memory area 273 Addr roBase; 274 // size of the read-only memory area 275 uint32_t roSize; 276 277 // pointer to buffer for storing kernel arguments 278 uint8_t *kernelArgs; 279 // unique WF id over all WFs executed across all CUs 280 uint64_t wfDynId; 281 282 // number of times instruction issue for this wavefront is blocked 283 // due to VRF port availability 284 Stats::Scalar numTimesBlockedDueVrfPortAvail; 285 // number of times an instruction of a WF is blocked from being issued 286 // due to WAR and WAW dependencies 287 Stats::Scalar numTimesBlockedDueWAXDependencies; 288 // number of times an instruction of a WF is blocked from being issued 289 // due to WAR and WAW dependencies 290 Stats::Scalar numTimesBlockedDueRAWDependencies; 291 // distribution of executed instructions based on their register 292 // operands; this is used to highlight the load on the VRF 293 Stats::Distribution srcRegOpDist; 294 Stats::Distribution dstRegOpDist; 295 296 // Functions to operate on call argument memory 297 // argument memory for hsail call instruction 298 CallArgMem *callArgMem; 299 void 300 initCallArgMem(int func_args_size_per_item, int wf_size) 301 { 302 callArgMem = new CallArgMem(func_args_size_per_item, wf_size); 303 } 304 305 template<typename CType> 306 CType 307 readCallArgMem(int lane, int addr) 308 { 309 return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); 310 } 311 312 template<typename CType> 313 void 314 writeCallArgMem(int lane, int addr, CType val) 315 { 316 callArgMem->setLaneAddr<CType>(lane, addr, val); 317 } 318 319 typedef WavefrontParams Params; 320 Wavefront(const Params *p); 321 ~Wavefront(); 322 virtual void init(); 323 324 void 325 setParent(ComputeUnit *cu) 326 { 327 computeUnit = cu; 328 } 329 330 void start(uint64_t _wfDynId, uint64_t _base_ptr); 331 void exec(); 332 void updateResources(); 333 int ready(itype_e type); 334 bool instructionBufferHasBranch(); 335 void regStats(); 336 VectorMask getPred() { return execMask() & initMask; } 337 338 bool waitingAtBarrier(int lane); 339 340 void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 341 const VectorMask& exec_mask); 342 343 void popFromReconvergenceStack(); 344 345 uint32_t pc() const; 346 347 uint32_t rpc() const; 348 349 VectorMask execMask() const; 350 351 bool execMask(int lane) const; 352 353 void pc(uint32_t new_pc); 354 355 void discardFetch(); 356 357 /** 358 * Returns the size of the static hardware context of a particular wavefront 359 * This should be updated everytime the context is changed 360 */ 361 uint32_t getStaticContextSize() const; 362 363 /** 364 * Returns the hardware context as a stream of bytes 365 * This method is designed for HSAIL execution 366 */ 367 void getContext(const void *out); 368 369 /** 370 * Sets the hardware context fromt a stream of bytes 371 * This method is designed for HSAIL execution 372 */ 373 void setContext(const void *in); 374 375 private: 376 /** 377 * Stack containing Control Flow Graph nodes (i.e., kernel instructions) 378 * to be visited by the wavefront, and the associated execution masks. The 379 * reconvergence stack grows every time the wavefront reaches a divergence 380 * point (branch instruction), and shrinks every time the wavefront 381 * reaches a reconvergence point (immediate post-dominator instruction). 382 */ 383 std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; 384}; 385 386#endif // __WAVEFRONT_HH__
|