// wavefront.hh revision 11657
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
 *
 * Author: Lisa Hsu
 */

#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__

#include <cassert>
#include <deque>
#include <memory>
#include <stack>
#include <vector>

#include "base/misc.hh"
#include "base/types.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"

// Maximum number of instructions buffered per wavefront
// (sizes the per-WF instruction buffer).
static const int MAX_NUM_INSTS_PER_WF = 12;

/**
 * A reconvergence stack entry conveys the necessary state to implement
 * control flow divergence.
 */
struct ReconvergenceStackEntry {
    /**
     * PC of current instruction.
     */
    uint32_t pc;
    /**
     * PC of the immediate post-dominator instruction, i.e., the value of
     * @a pc for the first instruction that will be executed by the wavefront
     * when a reconvergence point is reached.
     */
    uint32_t rpc;
    /**
     * Execution mask.
     */
    VectorMask execMask;
};

/*
 * Arguments for the hsail opcode call, are user defined and variable length.
 * The hardware/finalizer can support arguments in hardware or use memory to
 * pass arguments. For now, let's assume that an unlimited number of arguments
 * are supported in hardware (the compiler inlines functions whenever it can
 * anyways, so unless someone is interested in the implications of linking/
 * library functions, I think this is a reasonable assumption given the typical
 * size of an OpenCL kernel).
 *
 * Note that call args are different than kernel arguments:
 *   * All work-items in a kernel refer the same set of kernel arguments
 *   * Each work-item has its own set of call args. So a call argument at
 *     address 0x4 is different for work-item 0 and work-item 1.
 *
 * Ok, the table below shows an example of how we organize the call arguments
 * in the CallArgMem class.
93 * 94 * int foo(int arg1, double arg2) 95 * ___________________________________________________ 96 * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 97 * |---------------------------------------------------| 98 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 99 * |---------------------------------------------------| 100 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | 101 * ___________________________________________________ 102 */ 103class CallArgMem 104{ 105 public: 106 // pointer to buffer for storing function arguments 107 uint8_t *mem; 108 int wfSize; 109 // size of function args 110 int funcArgsSizePerItem; 111 112 template<typename CType> 113 int 114 getLaneOffset(int lane, int addr) 115 { 116 return addr * wfSize + sizeof(CType) * lane; 117 } 118 119 CallArgMem(int func_args_size_per_item, int wf_size) 120 : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) 121 { 122 mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); 123 } 124 125 ~CallArgMem() 126 { 127 free(mem); 128 } 129 130 template<typename CType> 131 uint8_t* 132 getLaneAddr(int lane, int addr) 133 { 134 return mem + getLaneOffset<CType>(lane, addr); 135 } 136 137 template<typename CType> 138 void 139 setLaneAddr(int lane, int addr, CType val) 140 { 141 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 142 } 143}; 144 145class Wavefront : public SimObject 146{ 147 public: 148 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 149 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 150 151 // Base pointer for array of instruction pointers 152 uint64_t basePtr; 153 154 uint32_t oldBarrierCnt; 155 uint32_t barrierCnt; 156 uint32_t barrierId; 157 uint32_t barrierSlots; 158 status_e status; 159 // HW slot id where the WF is mapped to inside a SIMD unit 160 int wfSlotId; 161 int kernId; 162 // SIMD unit where the WV has been scheduled 163 int simdId; 164 // pointer to parent CU 165 ComputeUnit *computeUnit; 166 167 std::deque<GPUDynInstPtr> instructionBuffer; 168 
    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to DDInstGPU
    std::vector<Addr> lastAddr;
    // per-lane 3D work-item ids (one vector per dimension)
    std::vector<uint32_t> workItemId[3];
    // per-lane flattened work-item ids
    std::vector<uint32_t> workItemFlatId;
    /* kernel launch parameters */
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    uint32_t wgId;
    uint32_t wgSz;
    /* the actual WG size can differ than the maximum size */
    uint32_t actualWgSz[3];
    uint32_t actualWgSzTotal;
    void computeActualWgSz(NDRange *ndr);
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // outstanding global+local memory requests
    uint32_t outstandingReqs;
    // memory requests between scoreboard
    // and execute stage not yet executed
    uint32_t memReqsInPipe;
    // outstanding global memory write requests
    uint32_t outstandingReqsWrGm;
    // outstanding local memory write requests
    uint32_t outstandingReqsWrLm;
    // outstanding global memory read requests
    uint32_t outstandingReqsRdGm;
    // outstanding local memory read requests
    uint32_t outstandingReqsRdLm;
    // NOTE(review): the four counters below appear to be the per-type
    // (local/global x read/write) breakdown of memReqsInPipe — confirm
    // against the .cc implementation.
    uint32_t rdLmReqsInPipe;
    uint32_t rdGmReqsInPipe;
    uint32_t wrLmReqsInPipe;
    uint32_t wrGmReqsInPipe;
    int memTraceBusy;
    uint64_t lastTrace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's registers
    // will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination gpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of destination gpr (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination gpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_dgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;

    // number of barriers this WF has joined
    std::vector<int> barCnt;
    int maxBarCnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer to the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar
    numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for hsail call instruction
    CallArgMem *callArgMem;

    // Allocate the per-lane call-argument buffer for this wavefront.
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    // Read lane @a lane's call argument of type CType at @a addr.
    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    // Write lane @a lane's call argument of type CType at @a addr.
    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    // Record the compute unit this wavefront belongs to.
    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    // lanes enabled by the current exec mask AND active at wavefront start
    VectorMask getPred() { return execMask() & initMask; }

    bool waitingAtBarrier(int lane);

    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

    /**
     * Returns the size of the static hardware context of a particular
     * wavefront. This should be updated every time the context is changed.
     */
    uint32_t getStaticContextSize() const;

    /**
     * Returns the hardware context as a stream of bytes
     * This method is designed for HSAIL execution
     */
    void getContext(const void *out);

    /**
     * Sets the hardware context from a stream of bytes
     * This method is designed for HSAIL execution
     */
    void setContext(const void *in);

  private:
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};

#endif // __WAVEFRONT_HH__