// wavefront.hh revision 11640:aa846ec8cd8d
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: Lisa Hsu 34 */ 35 36#ifndef __WAVEFRONT_HH__ 37#define __WAVEFRONT_HH__ 38 39#include <cassert> 40#include <deque> 41#include <memory> 42#include <stack> 43#include <vector> 44 45#include "base/misc.hh" 46#include "base/types.hh" 47#include "gpu-compute/condition_register_state.hh" 48#include "gpu-compute/lds_state.hh" 49#include "gpu-compute/misc.hh" 50#include "params/Wavefront.hh" 51#include "sim/sim_object.hh" 52 53static const int MAX_NUM_INSTS_PER_WF = 12; 54 55/* 56 * Arguments for the hsail opcode call, are user defined and variable length. 57 * The hardware/finalizer can support arguments in hardware or use memory to 58 * pass arguments. For now, let's assume that an unlimited number of arguments 59 * are supported in hardware (the compiler inlines functions whenver it can 60 * anyways, so unless someone is interested in the implications of linking/ 61 * library functions, I think this is a reasonable assumption given the typical 62 * size of an OpenCL kernel). 63 * 64 * Note that call args are different than kernel arguments: 65 * * All work-items in a kernel refer the same set of kernel arguments 66 * * Each work-item has it's on set of call args. So a call argument at 67 * address 0x4 is different for work-item 0 and work-item 1. 68 * 69 * Ok, the table below shows an example of how we organize the call arguments in 70 * the CallArgMem class. 71 * 72 * int foo(int arg1, double arg2) 73 * ___________________________________________________ 74 * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 75 * |---------------------------------------------------| 76 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 77 * |---------------------------------------------------| 78 * | 512: arg2.0 | 520: arg2.1 | ... 
| 1016: arg2.63 | 79 * ___________________________________________________ 80 */ 81class CallArgMem 82{ 83 public: 84 // pointer to buffer for storing function arguments 85 uint8_t *mem; 86 int wfSize; 87 // size of function args 88 int funcArgsSizePerItem; 89 90 template<typename CType> 91 int 92 getLaneOffset(int lane, int addr) 93 { 94 return addr * wfSize + sizeof(CType) * lane; 95 } 96 97 CallArgMem(int func_args_size_per_item, int wf_size) 98 : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item) 99 { 100 mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize); 101 } 102 103 ~CallArgMem() 104 { 105 free(mem); 106 } 107 108 template<typename CType> 109 uint8_t* 110 getLaneAddr(int lane, int addr) 111 { 112 return mem + getLaneOffset<CType>(lane, addr); 113 } 114 115 template<typename CType> 116 void 117 setLaneAddr(int lane, int addr, CType val) 118 { 119 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 120 } 121}; 122 123/** 124 * A reconvergence stack entry conveys the necessary state to implement 125 * control flow divergence. 126 */ 127class ReconvergenceStackEntry { 128 129 public: 130 ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, 131 VectorMask new_mask) : pc(new_pc), rpc(new_rpc), 132 execMask(new_mask) { 133 } 134 135 /** 136 * PC of current instruction. 137 */ 138 uint32_t pc; 139 /** 140 * PC of the immediate post-dominator instruction, i.e., the value of 141 * @a pc for the first instruction that will be executed by the wavefront 142 * when a reconvergence point is reached. 143 */ 144 uint32_t rpc; 145 /** 146 * Execution mask. 
147 */ 148 VectorMask execMask; 149}; 150 151class Wavefront : public SimObject 152{ 153 public: 154 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 155 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 156 157 // Base pointer for array of instruction pointers 158 uint64_t basePtr; 159 160 uint32_t oldBarrierCnt; 161 uint32_t barrierCnt; 162 uint32_t barrierId; 163 uint32_t barrierSlots; 164 status_e status; 165 // HW slot id where the WF is mapped to inside a SIMD unit 166 int wfSlotId; 167 int kernId; 168 // SIMD unit where the WV has been scheduled 169 int simdId; 170 // pointer to parent CU 171 ComputeUnit *computeUnit; 172 173 std::deque<GPUDynInstPtr> instructionBuffer; 174 175 bool pendingFetch; 176 bool dropFetch; 177 178 // Condition Register State (for HSAIL simulations only) 179 class ConditionRegisterState *condRegState; 180 // number of single precision VGPRs required by WF 181 uint32_t maxSpVgprs; 182 // number of double precision VGPRs required by WF 183 uint32_t maxDpVgprs; 184 // map virtual to physical vector register 185 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); 186 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 187 bool isGmInstruction(GPUDynInstPtr ii); 188 bool isLmInstruction(GPUDynInstPtr ii); 189 bool isOldestInstGMem(); 190 bool isOldestInstLMem(); 191 bool isOldestInstPrivMem(); 192 bool isOldestInstFlatMem(); 193 bool isOldestInstALU(); 194 bool isOldestInstBarrier(); 195 // used for passing spill address to DDInstGPU 196 std::vector<Addr> lastAddr; 197 std::vector<uint32_t> workItemId[3]; 198 std::vector<uint32_t> workItemFlatId; 199 uint32_t workGroupId[3]; 200 uint32_t workGroupSz[3]; 201 uint32_t gridSz[3]; 202 uint32_t wgId; 203 uint32_t wgSz; 204 uint32_t dynWaveId; 205 uint32_t maxDynWaveId; 206 uint32_t dispatchId; 207 // outstanding global+local memory requests 208 uint32_t outstandingReqs; 209 // memory requests between scoreboard 210 // and execute stage not yet executed 211 
uint32_t memReqsInPipe; 212 // outstanding global memory write requests 213 uint32_t outstandingReqsWrGm; 214 // outstanding local memory write requests 215 uint32_t outstandingReqsWrLm; 216 // outstanding global memory read requests 217 uint32_t outstandingReqsRdGm; 218 // outstanding local memory read requests 219 uint32_t outstandingReqsRdLm; 220 uint32_t rdLmReqsInPipe; 221 uint32_t rdGmReqsInPipe; 222 uint32_t wrLmReqsInPipe; 223 uint32_t wrGmReqsInPipe; 224 225 int memTraceBusy; 226 uint64_t lastTrace; 227 // number of vector registers reserved by WF 228 int reservedVectorRegs; 229 // Index into the Vector Register File's namespace where the WF's registers 230 // will live while the WF is executed 231 uint32_t startVgprIndex; 232 233 // Old value of destination gpr (for trace) 234 std::vector<uint32_t> oldVgpr; 235 // Id of destination gpr (for trace) 236 uint32_t oldVgprId; 237 // Tick count of last old_vgpr copy 238 uint64_t oldVgprTcnt; 239 240 // Old value of destination gpr (for trace) 241 std::vector<uint64_t> oldDgpr; 242 // Id of destination gpr (for trace) 243 uint32_t oldDgprId; 244 // Tick count of last old_vgpr copy 245 uint64_t oldDgprTcnt; 246 247 // Execution mask at wavefront start 248 VectorMask initMask; 249 250 // number of barriers this WF has joined 251 std::vector<int> barCnt; 252 int maxBarCnt; 253 // Flag to stall a wave on barrier 254 bool stalledAtBarrier; 255 256 // a pointer to the fraction of the LDS allocated 257 // to this workgroup (thus this wavefront) 258 LdsChunk *ldsChunk; 259 260 // A pointer to the spill area 261 Addr spillBase; 262 // The size of the spill area 263 uint32_t spillSizePerItem; 264 // The vector width of the spill area 265 uint32_t spillWidth; 266 267 // A pointer to the private memory area 268 Addr privBase; 269 // The size of the private memory area 270 uint32_t privSizePerItem; 271 272 // A pointer ot the read-only memory area 273 Addr roBase; 274 // size of the read-only memory area 275 uint32_t roSize; 
276 277 // pointer to buffer for storing kernel arguments 278 uint8_t *kernelArgs; 279 // unique WF id over all WFs executed across all CUs 280 uint64_t wfDynId; 281 282 // number of times instruction issue for this wavefront is blocked 283 // due to VRF port availability 284 Stats::Scalar numTimesBlockedDueVrfPortAvail; 285 // number of times an instruction of a WF is blocked from being issued 286 // due to WAR and WAW dependencies 287 Stats::Scalar numTimesBlockedDueWAXDependencies; 288 // number of times an instruction of a WF is blocked from being issued 289 // due to WAR and WAW dependencies 290 Stats::Scalar numTimesBlockedDueRAWDependencies; 291 // distribution of executed instructions based on their register 292 // operands; this is used to highlight the load on the VRF 293 Stats::Distribution srcRegOpDist; 294 Stats::Distribution dstRegOpDist; 295 296 // Functions to operate on call argument memory 297 // argument memory for hsail call instruction 298 CallArgMem *callArgMem; 299 void 300 initCallArgMem(int func_args_size_per_item, int wf_size) 301 { 302 callArgMem = new CallArgMem(func_args_size_per_item, wf_size); 303 } 304 305 template<typename CType> 306 CType 307 readCallArgMem(int lane, int addr) 308 { 309 return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); 310 } 311 312 template<typename CType> 313 void 314 writeCallArgMem(int lane, int addr, CType val) 315 { 316 callArgMem->setLaneAddr<CType>(lane, addr, val); 317 } 318 319 typedef WavefrontParams Params; 320 Wavefront(const Params *p); 321 ~Wavefront(); 322 virtual void init(); 323 324 void 325 setParent(ComputeUnit *cu) 326 { 327 computeUnit = cu; 328 } 329 330 void start(uint64_t _wfDynId, uint64_t _base_ptr); 331 void exec(); 332 void updateResources(); 333 int ready(itype_e type); 334 bool instructionBufferHasBranch(); 335 void regStats(); 336 VectorMask getPred() { return execMask() & initMask; } 337 338 bool waitingAtBarrier(int lane); 339 340 void 
pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 341 const VectorMask& exec_mask); 342 343 void popFromReconvergenceStack(); 344 345 uint32_t pc() const; 346 347 uint32_t rpc() const; 348 349 VectorMask execMask() const; 350 351 bool execMask(int lane) const; 352 353 void pc(uint32_t new_pc); 354 355 void discardFetch(); 356 357 /** 358 * Returns the size of the static hardware context of a particular wavefront 359 * This should be updated everytime the context is changed 360 */ 361 uint32_t getStaticContextSize() const; 362 363 private: 364 /** 365 * Stack containing Control Flow Graph nodes (i.e., kernel instructions) 366 * to be visited by the wavefront, and the associated execution masks. The 367 * reconvergence stack grows every time the wavefront reaches a divergence 368 * point (branch instruction), and shrinks every time the wavefront 369 * reaches a reconvergence point (immediate post-dominator instruction). 370 */ 371 std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; 372}; 373 374#endif // __WAVEFRONT_HH__ 375