// wavefront.hh revision 11641:a9f0711e7230
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: Lisa Hsu 34 */ 35 36#ifndef __WAVEFRONT_HH__ 37#define __WAVEFRONT_HH__ 38 39#include <cassert> 40#include <deque> 41#include <memory> 42#include <stack> 43#include <vector> 44 45#include "base/misc.hh" 46#include "base/types.hh" 47#include "gpu-compute/condition_register_state.hh" 48#include "gpu-compute/lds_state.hh" 49#include "gpu-compute/misc.hh" 50#include "params/Wavefront.hh" 51#include "sim/sim_object.hh" 52 53static const int MAX_NUM_INSTS_PER_WF = 12; 54 55/** 56 * A reconvergence stack entry conveys the necessary state to implement 57 * control flow divergence. 58 */ 59struct ReconvergenceStackEntry { 60 /** 61 * PC of current instruction. 62 */ 63 uint32_t pc; 64 /** 65 * PC of the immediate post-dominator instruction, i.e., the value of 66 * @a pc for the first instruction that will be executed by the wavefront 67 * when a reconvergence point is reached. 68 */ 69 uint32_t rpc; 70 /** 71 * Execution mask. 72 */ 73 VectorMask execMask; 74}; 75 76/* 77 * Arguments for the hsail opcode call, are user defined and variable length. 78 * The hardware/finalizer can support arguments in hardware or use memory to 79 * pass arguments. For now, let's assume that an unlimited number of arguments 80 * are supported in hardware (the compiler inlines functions whenver it can 81 * anyways, so unless someone is interested in the implications of linking/ 82 * library functions, I think this is a reasonable assumption given the typical 83 * size of an OpenCL kernel). 84 * 85 * Note that call args are different than kernel arguments: 86 * * All work-items in a kernel refer the same set of kernel arguments 87 * * Each work-item has it's on set of call args. So a call argument at 88 * address 0x4 is different for work-item 0 and work-item 1. 89 * 90 * Ok, the table below shows an example of how we organize the call arguments in 91 * the CallArgMem class. 
92 * 93 * int foo(int arg1, double arg2) 94 * ___________________________________________________ 95 * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 96 * |---------------------------------------------------| 97 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 98 * |---------------------------------------------------| 99 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | 100 * ___________________________________________________ 101 */ 102class CallArgMem 103{ 104 public: 105 // pointer to buffer for storing function arguments 106 uint8_t *mem; 107 int wfSize; 108 // size of function args 109 int funcArgsSizePerItem; 110 111 template<typename CType> 112 int 113 getLaneOffset(int lane, int addr) 114 { 115 return addr * wfSize + sizeof(CType) * lane; 116 } 117 118 CallArgMem(int func_args_size_per_item, int wf_size) 119 : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) 120 { 121 mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); 122 } 123 124 ~CallArgMem() 125 { 126 free(mem); 127 } 128 129 template<typename CType> 130 uint8_t* 131 getLaneAddr(int lane, int addr) 132 { 133 return mem + getLaneOffset<CType>(lane, addr); 134 } 135 136 template<typename CType> 137 void 138 setLaneAddr(int lane, int addr, CType val) 139 { 140 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 141 } 142}; 143 144class Wavefront : public SimObject 145{ 146 public: 147 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 148 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 149 150 // Base pointer for array of instruction pointers 151 uint64_t basePtr; 152 153 uint32_t oldBarrierCnt; 154 uint32_t barrierCnt; 155 uint32_t barrierId; 156 uint32_t barrierSlots; 157 status_e status; 158 // HW slot id where the WF is mapped to inside a SIMD unit 159 int wfSlotId; 160 int kernId; 161 // SIMD unit where the WV has been scheduled 162 int simdId; 163 // pointer to parent CU 164 ComputeUnit *computeUnit; 165 166 std::deque<GPUDynInstPtr> instructionBuffer; 167 
168 bool pendingFetch; 169 bool dropFetch; 170 171 // Condition Register State (for HSAIL simulations only) 172 class ConditionRegisterState *condRegState; 173 // number of single precision VGPRs required by WF 174 uint32_t maxSpVgprs; 175 // number of double precision VGPRs required by WF 176 uint32_t maxDpVgprs; 177 // map virtual to physical vector register 178 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); 179 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 180 bool isGmInstruction(GPUDynInstPtr ii); 181 bool isLmInstruction(GPUDynInstPtr ii); 182 bool isOldestInstGMem(); 183 bool isOldestInstLMem(); 184 bool isOldestInstPrivMem(); 185 bool isOldestInstFlatMem(); 186 bool isOldestInstALU(); 187 bool isOldestInstBarrier(); 188 // used for passing spill address to DDInstGPU 189 std::vector<Addr> lastAddr; 190 std::vector<uint32_t> workItemId[3]; 191 std::vector<uint32_t> workItemFlatId; 192 uint32_t workGroupId[3]; 193 uint32_t workGroupSz[3]; 194 uint32_t gridSz[3]; 195 uint32_t wgId; 196 uint32_t wgSz; 197 uint32_t dynWaveId; 198 uint32_t maxDynWaveId; 199 uint32_t dispatchId; 200 // outstanding global+local memory requests 201 uint32_t outstandingReqs; 202 // memory requests between scoreboard 203 // and execute stage not yet executed 204 uint32_t memReqsInPipe; 205 // outstanding global memory write requests 206 uint32_t outstandingReqsWrGm; 207 // outstanding local memory write requests 208 uint32_t outstandingReqsWrLm; 209 // outstanding global memory read requests 210 uint32_t outstandingReqsRdGm; 211 // outstanding local memory read requests 212 uint32_t outstandingReqsRdLm; 213 uint32_t rdLmReqsInPipe; 214 uint32_t rdGmReqsInPipe; 215 uint32_t wrLmReqsInPipe; 216 uint32_t wrGmReqsInPipe; 217 218 int memTraceBusy; 219 uint64_t lastTrace; 220 // number of vector registers reserved by WF 221 int reservedVectorRegs; 222 // Index into the Vector Register File's namespace where the WF's registers 223 // will live while the 
WF is executed 224 uint32_t startVgprIndex; 225 226 // Old value of destination gpr (for trace) 227 std::vector<uint32_t> oldVgpr; 228 // Id of destination gpr (for trace) 229 uint32_t oldVgprId; 230 // Tick count of last old_vgpr copy 231 uint64_t oldVgprTcnt; 232 233 // Old value of destination gpr (for trace) 234 std::vector<uint64_t> oldDgpr; 235 // Id of destination gpr (for trace) 236 uint32_t oldDgprId; 237 // Tick count of last old_vgpr copy 238 uint64_t oldDgprTcnt; 239 240 // Execution mask at wavefront start 241 VectorMask initMask; 242 243 // number of barriers this WF has joined 244 std::vector<int> barCnt; 245 int maxBarCnt; 246 // Flag to stall a wave on barrier 247 bool stalledAtBarrier; 248 249 // a pointer to the fraction of the LDS allocated 250 // to this workgroup (thus this wavefront) 251 LdsChunk *ldsChunk; 252 253 // A pointer to the spill area 254 Addr spillBase; 255 // The size of the spill area 256 uint32_t spillSizePerItem; 257 // The vector width of the spill area 258 uint32_t spillWidth; 259 260 // A pointer to the private memory area 261 Addr privBase; 262 // The size of the private memory area 263 uint32_t privSizePerItem; 264 265 // A pointer ot the read-only memory area 266 Addr roBase; 267 // size of the read-only memory area 268 uint32_t roSize; 269 270 // pointer to buffer for storing kernel arguments 271 uint8_t *kernelArgs; 272 // unique WF id over all WFs executed across all CUs 273 uint64_t wfDynId; 274 275 // number of times instruction issue for this wavefront is blocked 276 // due to VRF port availability 277 Stats::Scalar numTimesBlockedDueVrfPortAvail; 278 // number of times an instruction of a WF is blocked from being issued 279 // due to WAR and WAW dependencies 280 Stats::Scalar numTimesBlockedDueWAXDependencies; 281 // number of times an instruction of a WF is blocked from being issued 282 // due to WAR and WAW dependencies 283 Stats::Scalar numTimesBlockedDueRAWDependencies; 284 // distribution of executed 
instructions based on their register 285 // operands; this is used to highlight the load on the VRF 286 Stats::Distribution srcRegOpDist; 287 Stats::Distribution dstRegOpDist; 288 289 // Functions to operate on call argument memory 290 // argument memory for hsail call instruction 291 CallArgMem *callArgMem; 292 void 293 initCallArgMem(int func_args_size_per_item, int wf_size) 294 { 295 callArgMem = new CallArgMem(func_args_size_per_item, wf_size); 296 } 297 298 template<typename CType> 299 CType 300 readCallArgMem(int lane, int addr) 301 { 302 return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); 303 } 304 305 template<typename CType> 306 void 307 writeCallArgMem(int lane, int addr, CType val) 308 { 309 callArgMem->setLaneAddr<CType>(lane, addr, val); 310 } 311 312 typedef WavefrontParams Params; 313 Wavefront(const Params *p); 314 ~Wavefront(); 315 virtual void init(); 316 317 void 318 setParent(ComputeUnit *cu) 319 { 320 computeUnit = cu; 321 } 322 323 void start(uint64_t _wfDynId, uint64_t _base_ptr); 324 void exec(); 325 void updateResources(); 326 int ready(itype_e type); 327 bool instructionBufferHasBranch(); 328 void regStats(); 329 VectorMask getPred() { return execMask() & initMask; } 330 331 bool waitingAtBarrier(int lane); 332 333 void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 334 const VectorMask& exec_mask); 335 336 void popFromReconvergenceStack(); 337 338 uint32_t pc() const; 339 340 uint32_t rpc() const; 341 342 VectorMask execMask() const; 343 344 bool execMask(int lane) const; 345 346 void pc(uint32_t new_pc); 347 348 void discardFetch(); 349 350 /** 351 * Returns the size of the static hardware context of a particular wavefront 352 * This should be updated everytime the context is changed 353 */ 354 uint32_t getStaticContextSize() const; 355 356 private: 357 /** 358 * Stack containing Control Flow Graph nodes (i.e., kernel instructions) 359 * to be visited by the wavefront, and the associated execution masks. 
The 360 * reconvergence stack grows every time the wavefront reaches a divergence 361 * point (branch instruction), and shrinks every time the wavefront 362 * reaches a reconvergence point (immediate post-dominator instruction). 363 */ 364 std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; 365}; 366 367#endif // __WAVEFRONT_HH__ 368