gpu_dyn_inst.hh revision 12697
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <string>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
};
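/*
 * Illustrative sketch (not part of the simulator source): each functor
 * above captures its operand(s) at construction time and applies its
 * operation in place when execute() is called on the target location.
 * Assuming a 32-bit add, for example:
 *
 *     uint32_t mem_val = 5;
 *     AtomicOpAdd<uint32_t> add_op(3);
 *     add_op.execute(&mem_val);   // mem_val is now 8
 *
 * GPUDynInst::makeAtomicOpFunctor(), defined later in this file, selects
 * the functor that matches the current atomic instruction.
 */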
typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to get written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction is mapped
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;
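    /*
     * Illustrative sketch (assumption, not simulator code): the per-lane
     * state above is typically consumed together. addr holds one address
     * per work-item in the wavefront, and exec_mask gates which of those
     * lanes actually issue an access, roughly:
     *
     *     for (int lane = 0; lane < wavefront_size; ++lane) {
     *         if (exec_mask[lane])
     *             issue_lane_access(addr[lane]);  // hypothetical helper
     *     }
     */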
    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of
    // a store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object
     * associated with it). When, for example, the front-end generates a
     * store with release semantics, we will first issue a normal store and
     * set the continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when response arrives
    bool useContinuation;
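    /*
     * Illustrative sketch (assumption, not simulator code): ISA code that
     * needs a separate release request after a store might set up the
     * continuation roughly as follows; the data port then invokes it when
     * the store's response arrives and useContinuation is set:
     *
     *     gpuDynInst->useContinuation = true;
     *     gpuDynInst->execContinuation =
     *         [](GPUStaticInst *si, GPUDynInstPtr inst) {
     *             // issue the release request for inst (hypothetical body)
     *         };
     */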
    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
            assert(false);
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
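    /*
     * Illustrative sketch (assumption, not simulator code): when a memory
     * instruction builds its request, the request is typically tagged via
     * setRequestFlags(), and an atomic additionally carries a functor
     * produced by makeAtomicOpFunctor(), roughly:
     *
     *     Request *req = new Request(...);   // hypothetical construction
     *     gpuDynInst->setRequestFlags(req);
     *     if (gpuDynInst->isAtomic()) {
     *         AtomicOpFunctor *amo =
     *             gpuDynInst->makeAtomicOpFunctor<uint32_t>(&op0, &op1);
     *         // attach amo to the request/packet (mechanism elided)
     *     }
     */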
    // Map the addresses satisfied by returned packets to the lanes
    // that requested them
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__