/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#ifndef __WAVEFRONT_HH__
#define __WAVEFRONT_HH__

#include <cassert>
#include <deque>
#include <memory>
#include <stack>
#include <vector>

#include "arch/gpu_isa.hh"
#include "base/logging.hh"
#include "base/types.hh"
#include "config/the_gpu_isa.hh"
#include "gpu-compute/condition_register_state.hh"
#include "gpu-compute/lds_state.hh"
#include "gpu-compute/misc.hh"
#include "gpu-compute/ndrange.hh"
#include "params/Wavefront.hh"
#include "sim/sim_object.hh"

static const int MAX_NUM_INSTS_PER_WF = 12;

/**
 * A reconvergence stack entry conveys the necessary state to implement
 * control flow divergence.
 */
struct ReconvergenceStackEntry {
    /**
     * PC of current instruction.
     */
    uint32_t pc;
    /**
     * PC of the immediate post-dominator instruction, i.e., the value of
     * @a pc for the first instruction that will be executed by the wavefront
     * when a reconvergence point is reached.
     */
    uint32_t rpc;
    /**
     * Execution mask.
     */
    VectorMask execMask;
};
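
/*
 * Illustrative sketch only (not part of the simulator): one way a divergent
 * branch could drive the reconvergence stack, assuming a branch at PC 0x20
 * whose taken target is PC 0x24 and whose immediate post-dominator is PC
 * 0x60; taken_mask/not_taken_mask are placeholder masks for the lanes on
 * each side of the branch. The entry pushed last is executed first, and an
 * entry is popped once the wavefront's pc() reaches its rpc() (see
 * pushToReconvergenceStack()/popFromReconvergenceStack() in the Wavefront
 * class below).
 *
 *   // lanes that fall through wait at the post-dominator
 *   wf->pushToReconvergenceStack(0x60, 0x60, not_taken_mask);
 *   // lanes that take the branch execute the divergent path first
 *   wf->pushToReconvergenceStack(0x24, 0x60, taken_mask);
 *   ...
 *   if (wf->pc() == wf->rpc())
 *       wf->popFromReconvergenceStack();
 */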

/*
 * Arguments for the hsail opcode call are user defined and variable length.
 * The hardware/finalizer can support arguments in hardware or use memory to
 * pass arguments. For now, let's assume that an unlimited number of arguments
 * are supported in hardware (the compiler inlines functions whenever it can
 * anyway, so unless someone is interested in the implications of linking/
 * library functions, I think this is a reasonable assumption given the typical
 * size of an OpenCL kernel).
 *
 * Note that call args are different from kernel arguments:
 *   * All work-items in a kernel refer to the same set of kernel arguments.
 *   * Each work-item has its own set of call args. So a call argument at
 *     address 0x4 is different for work-item 0 and work-item 1.
 *
 * The table below shows an example of how we organize the call arguments in
 * the CallArgMem class.
 *
 * int foo(int arg1, double arg2)
 *  ___________________________________________________
 * | 0: return.0 | 4: return.1 | ... | 252: return.63  |
 * |---------------------------------------------------|
 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63    |
 * |---------------------------------------------------|
 * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63   |
 *  ___________________________________________________
 */
class CallArgMem
{
  public:
    // pointer to buffer for storing function arguments
    uint8_t *mem;
    int wfSize;
    // size of function args
    int funcArgsSizePerItem;

    template<typename CType>
    int
    getLaneOffset(int lane, int addr)
    {
        return addr * wfSize + sizeof(CType) * lane;
    }

    CallArgMem(int func_args_size_per_item, int wf_size)
        : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
    {
        mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
    }

    ~CallArgMem()
    {
        free(mem);
    }

    template<typename CType>
    uint8_t*
    getLaneAddr(int lane, int addr)
    {
        return mem + getLaneOffset<CType>(lane, addr);
    }

    template<typename CType>
    void
    setLaneAddr(int lane, int addr, CType val)
    {
        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
    }
};
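
/*
 * Worked example (inferred from the layout table above, assuming a 64-lane
 * wavefront): "addr" is the byte offset of an argument within a single
 * work-item's argument record, so for foo() above the return value lives
 * at addr 0, arg1 at addr 4, and arg2 at addr 8. Then
 * getLaneOffset<int>(1, 4) = 4 * 64 + sizeof(int) * 1 = 260, the
 * "260: arg1.1" slot, and getLaneOffset<double>(1, 8) =
 * 8 * 64 + sizeof(double) * 1 = 520, the "520: arg2.1" slot.
 */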

class Wavefront : public SimObject
{
  public:
    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};

    // Base pointer for array of instruction pointers
    uint64_t basePtr;

    uint32_t oldBarrierCnt;
    uint32_t barrierCnt;
    uint32_t barrierId;
    uint32_t barrierSlots;
    status_e status;
    // HW slot id where the WF is mapped to inside a SIMD unit
    int wfSlotId;
    int kernId;
    // SIMD unit where the WF has been scheduled
    int simdId;
    // pointer to parent CU
    ComputeUnit *computeUnit;

    std::deque<GPUDynInstPtr> instructionBuffer;

    bool pendingFetch;
    bool dropFetch;

    // Condition Register State (for HSAIL simulations only)
    class ConditionRegisterState *condRegState;
    // number of single precision VGPRs required by WF
    uint32_t maxSpVgprs;
    // number of double precision VGPRs required by WF
    uint32_t maxDpVgprs;
    // map virtual to physical vector register
    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
    bool isGmInstruction(GPUDynInstPtr ii);
    bool isLmInstruction(GPUDynInstPtr ii);
    bool isOldestInstGMem();
    bool isOldestInstLMem();
    bool isOldestInstPrivMem();
    bool isOldestInstFlatMem();
    bool isOldestInstALU();
    bool isOldestInstBarrier();
    // used for passing spill address to GPUDynInst
    std::vector<Addr> lastAddr;
    std::vector<uint32_t> workItemId[3];
    std::vector<uint32_t> workItemFlatId;
    /* kernel launch parameters */
    uint32_t workGroupId[3];
    uint32_t workGroupSz[3];
    uint32_t gridSz[3];
    uint32_t wgId;
    uint32_t wgSz;
    /* the actual WG size can differ from the maximum size */
    uint32_t actualWgSz[3];
    uint32_t actualWgSzTotal;
    void computeActualWgSz(NDRange *ndr);
    // wavefront id within a workgroup
    uint32_t wfId;
    uint32_t maxDynWaveId;
    uint32_t dispatchId;
    // outstanding global+local memory requests
    uint32_t outstandingReqs;
    // memory requests between the scoreboard
    // and execute stage that have not yet executed
    uint32_t memReqsInPipe;
    // outstanding global memory write requests
    uint32_t outstandingReqsWrGm;
    // outstanding local memory write requests
    uint32_t outstandingReqsWrLm;
    // outstanding global memory read requests
    uint32_t outstandingReqsRdGm;
    // outstanding local memory read requests
    uint32_t outstandingReqsRdLm;
    uint32_t rdLmReqsInPipe;
    uint32_t rdGmReqsInPipe;
    uint32_t wrLmReqsInPipe;
    uint32_t wrGmReqsInPipe;

    int memTraceBusy;
    uint64_t lastTrace;
    // number of vector registers reserved by WF
    int reservedVectorRegs;
    // Index into the Vector Register File's namespace where the WF's
    // registers will live while the WF is executed
    uint32_t startVgprIndex;

    // Old value of destination gpr (for trace)
    std::vector<uint32_t> oldVgpr;
    // Id of destination gpr (for trace)
    uint32_t oldVgprId;
    // Tick count of last old_vgpr copy
    uint64_t oldVgprTcnt;

    // Old value of destination gpr (for trace)
    std::vector<uint64_t> oldDgpr;
    // Id of destination gpr (for trace)
    uint32_t oldDgprId;
    // Tick count of last old_dgpr copy
    uint64_t oldDgprTcnt;

    // Execution mask at wavefront start
    VectorMask initMask;

    // number of barriers this WF has joined
    std::vector<int> barCnt;
    int maxBarCnt;
    // Flag to stall a wave on barrier
    bool stalledAtBarrier;

    // a pointer to the fraction of the LDS allocated
    // to this workgroup (thus this wavefront)
    LdsChunk *ldsChunk;

    // A pointer to the spill area
    Addr spillBase;
    // The size of the spill area
    uint32_t spillSizePerItem;
    // The vector width of the spill area
    uint32_t spillWidth;

    // A pointer to the private memory area
    Addr privBase;
    // The size of the private memory area
    uint32_t privSizePerItem;

    // A pointer to the read-only memory area
    Addr roBase;
    // size of the read-only memory area
    uint32_t roSize;

    // pointer to buffer for storing kernel arguments
    uint8_t *kernelArgs;
    // unique WF id over all WFs executed across all CUs
    uint64_t wfDynId;

    // number of times instruction issue for this wavefront is blocked
    // due to VRF port availability
    Stats::Scalar numTimesBlockedDueVrfPortAvail;
    // number of times an instruction of a WF is blocked from being issued
    // due to WAR and WAW dependencies
    Stats::Scalar numTimesBlockedDueWAXDependencies;
    // number of times an instruction of a WF is blocked from being issued
    // due to RAW dependencies
    Stats::Scalar numTimesBlockedDueRAWDependencies;
    // distribution of executed instructions based on their register
    // operands; this is used to highlight the load on the VRF
    Stats::Distribution srcRegOpDist;
    Stats::Distribution dstRegOpDist;

    // Functions to operate on call argument memory
    // argument memory for hsail call instruction
    CallArgMem *callArgMem;
    void
    initCallArgMem(int func_args_size_per_item, int wf_size)
    {
        callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
    }

    template<typename CType>
    CType
    readCallArgMem(int lane, int addr)
    {
        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
    }

    template<typename CType>
    void
    writeCallArgMem(int lane, int addr, CType val)
    {
        callArgMem->setLaneAddr<CType>(lane, addr, val);
    }
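
    /*
     * Usage sketch (illustrative only; the real callers are the HSAIL
     * call/return instruction implementations, and "w", "lane", and
     * "arg_val" are placeholders): a call could marshal one lane's first
     * integer argument and later read back that lane's return value as
     *
     *   w->initCallArgMem(func_args_size_per_item, wf_size);
     *   w->writeCallArgMem<int>(lane, 4, arg_val); // arg1 slot per the
     *                                              // CallArgMem table
     *   int ret = w->readCallArgMem<int>(lane, 0); // return-value slot
     */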

    typedef WavefrontParams Params;
    Wavefront(const Params *p);
    ~Wavefront();
    virtual void init();

    void
    setParent(ComputeUnit *cu)
    {
        computeUnit = cu;
    }

    void start(uint64_t _wfDynId, uint64_t _base_ptr);
    void exec();
    void updateResources();
    int ready(itype_e type);
    bool instructionBufferHasBranch();
    void regStats();
    VectorMask getPred() { return execMask() & initMask; }

    bool waitingAtBarrier(int lane);

    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                  const VectorMask& exec_mask);

    void popFromReconvergenceStack();

    uint32_t pc() const;

    uint32_t rpc() const;

    VectorMask execMask() const;

    bool execMask(int lane) const;

    void pc(uint32_t new_pc);

    void discardFetch();

    /**
     * Returns the size of the static hardware context of a particular
     * wavefront. This should be updated every time the context is changed.
     */
    uint32_t getStaticContextSize() const;

    /**
     * Returns the hardware context as a stream of bytes.
     * This method is designed for HSAIL execution.
     */
    void getContext(const void *out);

    /**
     * Sets the hardware context from a stream of bytes.
     * This method is designed for HSAIL execution.
     */
    void setContext(const void *in);

    TheGpuISA::GPUISA&
    gpuISA()
    {
        return _gpuISA;
    }

  private:
    TheGpuISA::GPUISA _gpuISA;
    /**
     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
     * to be visited by the wavefront, and the associated execution masks. The
     * reconvergence stack grows every time the wavefront reaches a divergence
     * point (branch instruction), and shrinks every time the wavefront
     * reaches a reconvergence point (immediate post-dominator instruction).
     */
    std::deque<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
};

#endif // __WAVEFRONT_HH__