// wavefront.hh revision 11308
1/* 2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 
32 * 33 * Author: Lisa Hsu 34 */ 35 36#ifndef __WAVEFRONT_HH__ 37#define __WAVEFRONT_HH__ 38 39#include <cassert> 40#include <deque> 41#include <memory> 42#include <stack> 43#include <vector> 44 45#include "base/misc.hh" 46#include "base/types.hh" 47#include "gpu-compute/condition_register_state.hh" 48#include "gpu-compute/lds_state.hh" 49#include "gpu-compute/misc.hh" 50#include "params/Wavefront.hh" 51#include "sim/sim_object.hh" 52 53static const int MAX_NUM_INSTS_PER_WF = 12; 54 55/* 56 * Arguments for the hsail opcode call, are user defined and variable length. 57 * The hardware/finalizer can support arguments in hardware or use memory to 58 * pass arguments. For now, let's assume that an unlimited number of arguments 59 * are supported in hardware (the compiler inlines functions whenver it can 60 * anyways, so unless someone is interested in the implications of linking/ 61 * library functions, I think this is a reasonable assumption given the typical 62 * size of an OpenCL kernel). 63 * 64 * Note that call args are different than kernel arguments: 65 * * All work-items in a kernel refer the same set of kernel arguments 66 * * Each work-item has it's on set of call args. So a call argument at 67 * address 0x4 is different for work-item 0 and work-item 1. 68 * 69 * Ok, the table below shows an example of how we organize the call arguments in 70 * the CallArgMem class. 71 * 72 * int foo(int arg1, double arg2) 73 * ___________________________________________________ 74 * | 0: return.0 | 4: return.1 | ... | 252: return.63 | 75 * |---------------------------------------------------| 76 * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 | 77 * |---------------------------------------------------| 78 * | 512: arg2.0 | 520: arg2.1 | ... 
| 1016: arg2.63 | 79 * ___________________________________________________ 80 */ 81class CallArgMem 82{ 83 public: 84 // pointer to buffer for storing function arguments 85 uint8_t *mem; 86 // size of function args 87 int funcArgsSizePerItem; 88 89 template<typename CType> 90 int 91 getLaneOffset(int lane, int addr) 92 { 93 return addr * VSZ + sizeof(CType) * lane; 94 } 95 96 CallArgMem(int func_args_size_per_item) 97 : funcArgsSizePerItem(func_args_size_per_item) 98 { 99 mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ); 100 } 101 102 ~CallArgMem() 103 { 104 free(mem); 105 } 106 107 template<typename CType> 108 uint8_t* 109 getLaneAddr(int lane, int addr) 110 { 111 return mem + getLaneOffset<CType>(lane, addr); 112 } 113 114 template<typename CType> 115 void 116 setLaneAddr(int lane, int addr, CType val) 117 { 118 *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val; 119 } 120}; 121 122/** 123 * A reconvergence stack entry conveys the necessary state to implement 124 * control flow divergence. 125 */ 126class ReconvergenceStackEntry { 127 128 public: 129 ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc, 130 VectorMask new_mask) : pc(new_pc), rpc(new_rpc), 131 execMask(new_mask) { 132 } 133 134 /** 135 * PC of current instruction. 136 */ 137 uint32_t pc; 138 /** 139 * PC of the immediate post-dominator instruction, i.e., the value of 140 * @a pc for the first instruction that will be executed by the wavefront 141 * when a reconvergence point is reached. 142 */ 143 uint32_t rpc; 144 /** 145 * Execution mask. 
146 */ 147 VectorMask execMask; 148}; 149 150class Wavefront : public SimObject 151{ 152 public: 153 enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE}; 154 enum status_e {S_STOPPED,S_RETURNING,S_RUNNING}; 155 156 // Base pointer for array of instruction pointers 157 uint64_t base_ptr; 158 159 uint32_t old_barrier_cnt; 160 uint32_t barrier_cnt; 161 uint32_t barrier_id; 162 uint32_t barrier_slots; 163 status_e status; 164 // HW slot id where the WF is mapped to inside a SIMD unit 165 int wfSlotId; 166 int kern_id; 167 // SIMD unit where the WV has been scheduled 168 int simdId; 169 // pointer to parent CU 170 ComputeUnit *computeUnit; 171 172 std::deque<GPUDynInstPtr> instructionBuffer; 173 174 bool pendingFetch; 175 bool dropFetch; 176 177 // Condition Register State (for HSAIL simulations only) 178 class ConditionRegisterState *condRegState; 179 // number of single precision VGPRs required by WF 180 uint32_t maxSpVgprs; 181 // number of double precision VGPRs required by WF 182 uint32_t maxDpVgprs; 183 // map virtual to physical vector register 184 uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0); 185 void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs); 186 bool isGmInstruction(GPUDynInstPtr ii); 187 bool isLmInstruction(GPUDynInstPtr ii); 188 bool isOldestInstGMem(); 189 bool isOldestInstLMem(); 190 bool isOldestInstPrivMem(); 191 bool isOldestInstFlatMem(); 192 bool isOldestInstALU(); 193 bool isOldestInstBarrier(); 194 // used for passing spill address to DDInstGPU 195 uint64_t last_addr[VSZ]; 196 uint32_t workitemid[3][VSZ]; 197 uint32_t workitemFlatId[VSZ]; 198 uint32_t workgroupid[3]; 199 uint32_t workgroupsz[3]; 200 uint32_t gridsz[3]; 201 uint32_t wg_id; 202 uint32_t wg_sz; 203 uint32_t dynwaveid; 204 uint32_t maxdynwaveid; 205 uint32_t dispatchid; 206 // outstanding global+local memory requests 207 uint32_t outstanding_reqs; 208 // memory requests between scoreboard 209 // and execute stage not yet executed 210 uint32_t 
mem_reqs_in_pipe; 211 // outstanding global memory write requests 212 uint32_t outstanding_reqs_wr_gm; 213 // outstanding local memory write requests 214 uint32_t outstanding_reqs_wr_lm; 215 // outstanding global memory read requests 216 uint32_t outstanding_reqs_rd_gm; 217 // outstanding local memory read requests 218 uint32_t outstanding_reqs_rd_lm; 219 uint32_t rd_lm_reqs_in_pipe; 220 uint32_t rd_gm_reqs_in_pipe; 221 uint32_t wr_lm_reqs_in_pipe; 222 uint32_t wr_gm_reqs_in_pipe; 223 224 int mem_trace_busy; 225 uint64_t last_trace; 226 // number of vector registers reserved by WF 227 int reservedVectorRegs; 228 // Index into the Vector Register File's namespace where the WF's registers 229 // will live while the WF is executed 230 uint32_t startVgprIndex; 231 232 // Old value of destination gpr (for trace) 233 uint32_t old_vgpr[VSZ]; 234 // Id of destination gpr (for trace) 235 uint32_t old_vgpr_id; 236 // Tick count of last old_vgpr copy 237 uint64_t old_vgpr_tcnt; 238 239 // Old value of destination gpr (for trace) 240 uint64_t old_dgpr[VSZ]; 241 // Id of destination gpr (for trace) 242 uint32_t old_dgpr_id; 243 // Tick count of last old_vgpr copy 244 uint64_t old_dgpr_tcnt; 245 246 // Execution mask at wavefront start 247 VectorMask init_mask; 248 249 // number of barriers this WF has joined 250 int bar_cnt[VSZ]; 251 int max_bar_cnt; 252 // Flag to stall a wave on barrier 253 bool stalledAtBarrier; 254 255 // a pointer to the fraction of the LDS allocated 256 // to this workgroup (thus this wavefront) 257 LdsChunk *ldsChunk; 258 259 // A pointer to the spill area 260 Addr spillBase; 261 // The size of the spill area 262 uint32_t spillSizePerItem; 263 // The vector width of the spill area 264 uint32_t spillWidth; 265 266 // A pointer to the private memory area 267 Addr privBase; 268 // The size of the private memory area 269 uint32_t privSizePerItem; 270 271 // A pointer ot the read-only memory area 272 Addr roBase; 273 // size of the read-only memory area 274 
uint32_t roSize; 275 276 // pointer to buffer for storing kernel arguments 277 uint8_t *kernelArgs; 278 // unique WF id over all WFs executed across all CUs 279 uint64_t wfDynId; 280 281 // number of times instruction issue for this wavefront is blocked 282 // due to VRF port availability 283 Stats::Scalar numTimesBlockedDueVrfPortAvail; 284 // number of times an instruction of a WF is blocked from being issued 285 // due to WAR and WAW dependencies 286 Stats::Scalar numTimesBlockedDueWAXDependencies; 287 // number of times an instruction of a WF is blocked from being issued 288 // due to WAR and WAW dependencies 289 Stats::Scalar numTimesBlockedDueRAWDependencies; 290 // distribution of executed instructions based on their register 291 // operands; this is used to highlight the load on the VRF 292 Stats::Distribution srcRegOpDist; 293 Stats::Distribution dstRegOpDist; 294 295 // Functions to operate on call argument memory 296 // argument memory for hsail call instruction 297 CallArgMem *callArgMem; 298 void 299 initCallArgMem(int func_args_size_per_item) 300 { 301 callArgMem = new CallArgMem(func_args_size_per_item); 302 } 303 304 template<typename CType> 305 CType 306 readCallArgMem(int lane, int addr) 307 { 308 return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr))); 309 } 310 311 template<typename CType> 312 void 313 writeCallArgMem(int lane, int addr, CType val) 314 { 315 callArgMem->setLaneAddr<CType>(lane, addr, val); 316 } 317 318 typedef WavefrontParams Params; 319 Wavefront(const Params *p); 320 ~Wavefront(); 321 virtual void init(); 322 323 void 324 setParent(ComputeUnit *cu) 325 { 326 computeUnit = cu; 327 } 328 329 void start(uint64_t _wfDynId, uint64_t _base_ptr); 330 331 void exec(); 332 void updateResources(); 333 int ready(itype_e type); 334 bool instructionBufferHasBranch(); 335 void regStats(); 336 VectorMask get_pred() { return execMask() & init_mask; } 337 338 bool waitingAtBarrier(int lane); 339 340 void 
pushToReconvergenceStack(uint32_t pc, uint32_t rpc, 341 const VectorMask& exec_mask); 342 343 void popFromReconvergenceStack(); 344 345 uint32_t pc() const; 346 347 uint32_t rpc() const; 348 349 VectorMask execMask() const; 350 351 bool execMask(int lane) const; 352 353 void pc(uint32_t new_pc); 354 355 void discardFetch(); 356 357 private: 358 /** 359 * Stack containing Control Flow Graph nodes (i.e., kernel instructions) 360 * to be visited by the wavefront, and the associated execution masks. The 361 * reconvergence stack grows every time the wavefront reaches a divergence 362 * point (branch instruction), and shrinks every time the wavefront 363 * reaches a reconvergence point (immediate post-dominator instruction). 364 */ 365 std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack; 366}; 367 368#endif // __WAVEFRONT_HH__ 369