// wavefront.hh, revision 11643
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
32 * 33 * Author: Lisa Hsu 34 */ 35 36#ifndef __WAVEFRONT_HH__ 37#define __WAVEFRONT_HH__ 38 39#include <cassert> 40#include <deque> 41#include <memory> 42#include <stack> 43#include <vector> 44 45#include "base/misc.hh" 46#include "base/types.hh" 47#include "gpu-compute/condition_register_state.hh" 48#include "gpu-compute/lds_state.hh" 49#include "gpu-compute/misc.hh" 50#include "params/Wavefront.hh" 51#include "sim/sim_object.hh" 52 53static const int MAX_NUM_INSTS_PER_WF = 12; 54 55/** 56 * A reconvergence stack entry conveys the necessary state to implement 57 * control flow divergence. 58 */ 59struct ReconvergenceStackEntry { 60 /** 61 * PC of current instruction. 62 */ 63 uint32_t pc; 64 /** 65 * PC of the immediate post-dominator instruction, i.e., the value of 66 * @a pc for the first instruction that will be executed by the wavefront 67 * when a reconvergence point is reached. 68 */ 69 uint32_t rpc; 70 /** 71 * Execution mask. 72 */ 73 VectorMask execMask; 74}; 75 76/* 77 * Arguments for the hsail opcode call, are user defined and variable length. 78 * The hardware/finalizer can support arguments in hardware or use memory to 79 * pass arguments. For now, let's assume that an unlimited number of arguments 80 * are supported in hardware (the compiler inlines functions whenver it can 81 * anyways, so unless someone is interested in the implications of linking/ 82 * library functions, I think this is a reasonable assumption given the typical 83 * size of an OpenCL kernel). 84 * 85 * Note that call args are different than kernel arguments: 86 * * All work-items in a kernel refer the same set of kernel arguments 87 * * Each work-item has it's on set of call args. So a call argument at 88 * address 0x4 is different for work-item 0 and work-item 1. 89 * 90 * Ok, the table below shows an example of how we organize the call arguments in 91 * the CallArgMem class. 
92 * 93 * int foo(int arg1, double arg2) 94 * ___________________________________________________ 95 * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 96 * |---------------------------------------------------| 97 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 98 * |---------------------------------------------------| 99 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 | 100 * ___________________________________________________ 101 */ 102class CallArgMem 103{ 104 public: 105 // pointer to buffer for storing function arguments 106 uint8_t *mem; 107 int wfSize; 108 // size of function args 109 int funcArgsSizePerItem; 110 111 template<typename CType> 112 int 113 getLaneOffset(int lane, int addr) 114 { 115 return addr * wfSize + sizeof(CType) * lane; 116 } 117 118 CallArgMem(int func_args_size_per_item, int wf_size) 119 : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) 120 { 121 mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); 122 } 123 124 ~CallArgMem() 125 { 126 free(mem); 127 } 128 129 template<typename CType> 130 uint8_t* 131 getLaneAddr(int lane, int addr) 132 { 133 return mem + getLaneOffset<CType>(lane, addr); 134 } 135 136 template<typename CType> 137 void 138 setLaneAddr(int lane, int addr, CType val) 139 { 140 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 141 } 142}; 143 144class Wavefront : public SimObject 145{ 146 public: 147 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 148 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 149 150 // Base pointer for array of instruction pointers 151 uint64_t basePtr; 152 153 uint32_t oldBarrierCnt; 154 uint32_t barrierCnt; 155 uint32_t barrierId; 156 uint32_t barrierSlots; 157 status_e status; 158 // HW slot id where the WF is mapped to inside a SIMD unit 159 int wfSlotId; 160 int kernId; 161 // SIMD unit where the WV has been scheduled 162 int simdId; 163 // pointer to parent CU 164 ComputeUnit *computeUnit; 165 166 std::deque<GPUDynInstPtr> instructionBuffer; 167 
168 bool pendingFetch; 169 bool dropFetch; 170 171 // Condition Register State (for HSAIL simulations only) 172 class ConditionRegisterState *condRegState; 173 // number of single precision VGPRs required by WF 174 uint32_t maxSpVgprs; 175 // number of double precision VGPRs required by WF 176 uint32_t maxDpVgprs; 177 // map virtual to physical vector register 178 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); 179 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 180 bool isGmInstruction(GPUDynInstPtr ii); 181 bool isLmInstruction(GPUDynInstPtr ii); 182 bool isOldestInstGMem(); 183 bool isOldestInstLMem(); 184 bool isOldestInstPrivMem(); 185 bool isOldestInstFlatMem(); 186 bool isOldestInstALU(); 187 bool isOldestInstBarrier(); 188 // used for passing spill address to DDInstGPU 189 std::vector<Addr> lastAddr; 190 std::vector<uint32_t> workItemId[3]; 191 std::vector<uint32_t> workItemFlatId; 192 uint32_t workGroupId[3]; 193 uint32_t workGroupSz[3]; 194 uint32_t gridSz[3]; 195 uint32_t wgId; 196 uint32_t wgSz; 197 // wavefront id within a workgroup 198 uint32_t wfId; 199 uint32_t maxDynWaveId; 200 uint32_t dispatchId; 201 // outstanding global+local memory requests 202 uint32_t outstandingReqs; 203 // memory requests between scoreboard 204 // and execute stage not yet executed 205 uint32_t memReqsInPipe; 206 // outstanding global memory write requests 207 uint32_t outstandingReqsWrGm; 208 // outstanding local memory write requests 209 uint32_t outstandingReqsWrLm; 210 // outstanding global memory read requests 211 uint32_t outstandingReqsRdGm; 212 // outstanding local memory read requests 213 uint32_t outstandingReqsRdLm; 214 uint32_t rdLmReqsInPipe; 215 uint32_t rdGmReqsInPipe; 216 uint32_t wrLmReqsInPipe; 217 uint32_t wrGmReqsInPipe; 218 219 int memTraceBusy; 220 uint64_t lastTrace; 221 // number of vector registers reserved by WF 222 int reservedVectorRegs; 223 // Index into the Vector Register File's namespace where the WF's 
registers 224 // will live while the WF is executed 225 uint32_t startVgprIndex; 226 227 // Old value of destination gpr (for trace) 228 std::vector<uint32_t> oldVgpr; 229 // Id of destination gpr (for trace) 230 uint32_t oldVgprId; 231 // Tick count of last old_vgpr copy 232 uint64_t oldVgprTcnt; 233 234 // Old value of destination gpr (for trace) 235 std::vector<uint64_t> oldDgpr; 236 // Id of destination gpr (for trace) 237 uint32_t oldDgprId; 238 // Tick count of last old_vgpr copy 239 uint64_t oldDgprTcnt; 240 241 // Execution mask at wavefront start 242 VectorMask initMask; 243 244 // number of barriers this WF has joined 245 std::vector<int> barCnt; 246 int maxBarCnt; 247 // Flag to stall a wave on barrier 248 bool stalledAtBarrier; 249 250 // a pointer to the fraction of the LDS allocated 251 // to this workgroup (thus this wavefront) 252 LdsChunk *ldsChunk; 253 254 // A pointer to the spill area 255 Addr spillBase; 256 // The size of the spill area 257 uint32_t spillSizePerItem; 258 // The vector width of the spill area 259 uint32_t spillWidth; 260 261 // A pointer to the private memory area 262 Addr privBase; 263 // The size of the private memory area 264 uint32_t privSizePerItem; 265 266 // A pointer ot the read-only memory area 267 Addr roBase; 268 // size of the read-only memory area 269 uint32_t roSize; 270 271 // pointer to buffer for storing kernel arguments 272 uint8_t *kernelArgs; 273 // unique WF id over all WFs executed across all CUs 274 uint64_t wfDynId; 275 276 // number of times instruction issue for this wavefront is blocked 277 // due to VRF port availability 278 Stats::Scalar numTimesBlockedDueVrfPortAvail; 279 // number of times an instruction of a WF is blocked from being issued 280 // due to WAR and WAW dependencies 281 Stats::Scalar numTimesBlockedDueWAXDependencies; 282 // number of times an instruction of a WF is blocked from being issued 283 // due to WAR and WAW dependencies 284 Stats::Scalar numTimesBlockedDueRAWDependencies; 285 
// distribution of executed instructions based on their register 286 // operands; this is used to highlight the load on the VRF 287 Stats::Distribution srcRegOpDist; 288 Stats::Distribution dstRegOpDist; 289 290 // Functions to operate on call argument memory 291 // argument memory for hsail call instruction 292 CallArgMem *callArgMem; 293 void 294 initCallArgMem(int func_args_size_per_item, int wf_size) 295 { 296 callArgMem = new CallArgMem(func_args_size_per_item, wf_size); 297 } 298 299 template<typename CType> 300 CType 301 readCallArgMem(int lane, int addr) 302 { 303 return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); 304 } 305 306 template<typename CType> 307 void 308 writeCallArgMem(int lane, int addr, CType val) 309 { 310 callArgMem->setLaneAddr<CType>(lane, addr, val); 311 } 312 313 typedef WavefrontParams Params; 314 Wavefront(const Params *p); 315 ~Wavefront(); 316 virtual void init(); 317 318 void 319 setParent(ComputeUnit *cu) 320 { 321 computeUnit = cu; 322 } 323 324 void start(uint64_t _wfDynId, uint64_t _base_ptr); 325 void exec(); 326 void updateResources(); 327 int ready(itype_e type); 328 bool instructionBufferHasBranch(); 329 void regStats(); 330 VectorMask getPred() { return execMask() & initMask; } 331 332 bool waitingAtBarrier(int lane); 333 334 void pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 335 const VectorMask& exec_mask); 336 337 void popFromReconvergenceStack(); 338 339 uint32_t pc() const; 340 341 uint32_t rpc() const; 342 343 VectorMask execMask() const; 344 345 bool execMask(int lane) const; 346 347 void pc(uint32_t new_pc); 348 349 void discardFetch(); 350 351 /** 352 * Returns the size of the static hardware context of a particular wavefront 353 * This should be updated everytime the context is changed 354 */ 355 uint32_t getStaticContextSize() const; 356 357 private: 358 /** 359 * Stack containing Control Flow Graph nodes (i.e., kernel instructions) 360 * to be visited by the wavefront, and the associated 
execution masks. The 361 * reconvergence stack grows every time the wavefront reaches a divergence 362 * point (branch instruction), and shrinks every time the wavefront 363 * reaches a reconvergence point (immediate post-dominator instruction). 364 */ 365 std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; 366}; 367 368#endif // __WAVEFRONT_HH__ 369