gpu_dyn_inst.hh revision 11308
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/GenericMemoryOrder.hh"
#include "enums/GenericMemoryScope.hh"
#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "enums/OpType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};
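
/*
 * Illustrative sketch (not part of the original header): each of the
 * TypedAtomicOpFunctor subclasses above captures its operand(s) at
 * construction time and applies the operation to the memory word passed to
 * execute(). Assuming a 32-bit integer location, an atomic add could be
 * exercised like this:
 *
 *   uint32_t mem_word = 5;
 *   AtomicOpAdd<uint32_t> add_op(3);
 *   add_op.execute(&mem_word);   // mem_word is now 8
 *
 * In practice the functor is attached to a memory request, and the memory
 * system is expected to call execute() on it when the atomic is performed.
 */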

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
};

#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

typedef enum
{
    SEG_PRIVATE,
    SEG_SPILL,
    SEG_GLOBAL,
    SEG_SHARED,
    SEG_READONLY,
    SEG_FLAT
} seg_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
               uint64_t instSeqNum);

    void execute();
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    int getRegisterIndex(int operandIdx);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);
    bool isArgLoad();

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::OpType opType();
    Enums::StorageClassType executedAs();

    // The address of the memory operation
    Addr addr[VSZ];
    Addr pAddr;

    // The data to be written
    uint8_t d_data[VSZ * 16];
    // Additional data (for atomics)
    uint8_t a_data[VSZ * 8];
    // Additional data (for atomics)
    uint8_t x_data[VSZ * 8];
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;
    // The memory operation (MO_LD, MO_ST, ...)
    Enums::MemOpType m_op;
    Enums::GenericMemoryOrder memoryOrder;

    // Scope of the request
    Enums::GenericMemoryScope scope;
    // The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
    seg_type s_type;
    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction has been mapped
    int simdId;
    // unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return staticInst; }

    // Is the instruction a scalar or vector op?
    bool scalarOp() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
    {
        using namespace Enums;

        switch (op) {
          case MO_AAND:
          case MO_ANRAND:
            return new AtomicOpAnd<c0>(*reg0);
          case MO_AOR:
          case MO_ANROR:
            return new AtomicOpOr<c0>(*reg0);
          case MO_AXOR:
          case MO_ANRXOR:
            return new AtomicOpXor<c0>(*reg0);
          case MO_ACAS:
          case MO_ANRCAS:
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
          case MO_AEXCH:
          case MO_ANREXCH:
            return new AtomicOpExch<c0>(*reg0);
          case MO_AADD:
          case MO_ANRADD:
            return new AtomicOpAdd<c0>(*reg0);
          case MO_ASUB:
          case MO_ANRSUB:
            return new AtomicOpSub<c0>(*reg0);
          case MO_AINC:
          case MO_ANRINC:
            return new AtomicOpInc<c0>();
          case MO_ADEC:
          case MO_ANRDEC:
            return new AtomicOpDec<c0>();
          case MO_AMAX:
          case MO_ANRMAX:
            return new AtomicOpMax<c0>(*reg0);
          case MO_AMIN:
          case MO_ANRMIN:
            return new AtomicOpMin<c0>(*reg0);
          default:
            panic("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        switch (s_type) {
          case SEG_PRIVATE:
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
            break;
          case SEG_SPILL:
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
            break;
          case SEG_GLOBAL:
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
            break;
          case SEG_READONLY:
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
            break;
          case SEG_SHARED:
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
            break;
          case SEG_FLAT:
            // TODO: translate to correct scope
            assert(false);
          default:
            panic("Bad segment type");
            break;
        }

        switch (scope) {
          case Enums::MEMORY_SCOPE_NONE:
          case Enums::MEMORY_SCOPE_WORKITEM:
            break;
          case Enums::MEMORY_SCOPE_WAVEFRONT:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_WORKGROUP:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_DEVICE:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_SYSTEM:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
            break;
          default:
            panic("Bad scope type");
            break;
        }

        if (setMemOrder) {
            // set acquire and release flags
            switch (memoryOrder) {
              case Enums::MEMORY_ORDER_SC_ACQUIRE:
                req->setFlags(Request::ACQUIRE);
                break;
              case Enums::MEMORY_ORDER_SC_RELEASE:
                req->setFlags(Request::RELEASE);
                break;
              case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
                break;
              default:
                break;
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic-return
        // ops, but a magic instruction can produce atomic no-return ops
        if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
            m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
            m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
            m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
            m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
            m_op == Enums::MO_ACAS) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
                   m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
                   m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
                   m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
                   m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
                   m_op == Enums::MO_ANRCAS) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map returned packets and the addresses they satisfy to the lanes
    // they were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__
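
/*
 * Usage sketch (illustrative only; `req`, `reg0`, and `reg1` are placeholder
 * names supplied by the caller, not symbols defined in this header). A memory
 * pipeline that already holds a Request for one active lane of a GPUDynInstPtr
 * `gpu_dyn_inst` would typically tag it with segment/scope/ordering flags and,
 * for atomics, attach a functor built from the lane's operand values:
 *
 *   gpu_dyn_inst->setRequestFlags(req);
 *
 *   if (MO_A(gpu_dyn_inst->m_op) || MO_ANR(gpu_dyn_inst->m_op)) {
 *       AtomicOpFunctor *amo =
 *           gpu_dyn_inst->makeAtomicOpFunctor<uint32_t>(reg0, reg1,
 *                                                       gpu_dyn_inst->m_op);
 *       // the functor travels with the request and is executed by the
 *       // memory system when the atomic is performed
 *   }
 *
 * If the instruction also carries release semantics, the issuer can set
 * useContinuation and point execContinuation at a routine that issues the
 * release request once the response for this access returns, as described in
 * the comment above execContinuation.
 */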