1/* 2 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its 18 * contributors may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Authors: Anthony Gutierrez 34 */ 35 36#ifndef __GPU_DYN_INST_HH__ 37#define __GPU_DYN_INST_HH__ 38 39#include <cstdint> 40#include <string> 41
| 1/* 2 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc. 3 * All rights reserved. 4 * 5 * For use for simulation and test purposes only 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the copyright holder nor the names of its 18 * contributors may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 * 33 * Authors: Anthony Gutierrez 34 */ 35 36#ifndef __GPU_DYN_INST_HH__ 37#define __GPU_DYN_INST_HH__ 38 39#include <cstdint> 40#include <string> 41
|
| 42#include "base/logging.hh"
|
42#include "enums/MemType.hh" 43#include "enums/StorageClassType.hh" 44#include "gpu-compute/compute_unit.hh" 45#include "gpu-compute/gpu_exec_context.hh" 46 47class GPUStaticInst; 48 49template<typename T> 50class AtomicOpAnd : public TypedAtomicOpFunctor<T> 51{ 52 public: 53 T a; 54 55 AtomicOpAnd(T _a) : a(_a) { } 56 void execute(T *b) { *b &= a; } 57 AtomicOpFunctor* clone () { return new AtomicOpAnd(a); } 58}; 59 60template<typename T> 61class AtomicOpOr : public TypedAtomicOpFunctor<T> 62{ 63 public: 64 T a; 65 AtomicOpOr(T _a) : a(_a) { } 66 void execute(T *b) { *b |= a; } 67 AtomicOpFunctor* clone () { return new AtomicOpOr(a); } 68}; 69 70template<typename T> 71class AtomicOpXor : public TypedAtomicOpFunctor<T> 72{ 73 public: 74 T a; 75 AtomicOpXor(T _a) : a(_a) {} 76 void execute(T *b) { *b ^= a; } 77 AtomicOpFunctor* clone () { return new AtomicOpXor(a); } 78}; 79 80template<typename T> 81class AtomicOpCAS : public TypedAtomicOpFunctor<T> 82{ 83 public: 84 T c; 85 T s; 86 87 ComputeUnit *computeUnit; 88 89 AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) 90 : c(_c), s(_s), computeUnit(compute_unit) { } 91 92 void 93 execute(T *b) 94 { 95 computeUnit->numCASOps++; 96 97 if (*b == c) { 98 *b = s; 99 } else { 100 computeUnit->numFailedCASOps++; 101 } 102 103 if (computeUnit->xact_cas_mode) { 104 computeUnit->xactCasLoadMap.clear(); 105 } 106 } 107 AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); } 108}; 109 110template<typename T> 111class AtomicOpExch : public TypedAtomicOpFunctor<T> 112{ 113 public: 114 T a; 115 AtomicOpExch(T _a) : a(_a) { } 116 void execute(T *b) { *b = a; } 117 AtomicOpFunctor* clone () { return new AtomicOpExch(a); } 118}; 119 120template<typename T> 121class AtomicOpAdd : public TypedAtomicOpFunctor<T> 122{ 123 public: 124 T a; 125 AtomicOpAdd(T _a) : a(_a) { } 126 void execute(T *b) { *b += a; } 127 AtomicOpFunctor* clone () { return new AtomicOpAdd(a); } 128}; 129 130template<typename T> 131class AtomicOpSub : public TypedAtomicOpFunctor<T> 132{ 133 public: 134 T a; 135 AtomicOpSub(T _a) : a(_a) { } 136 void execute(T *b) { *b -= a; } 137 AtomicOpFunctor* clone () { return new AtomicOpSub(a); } 138}; 139 140template<typename T> 141class AtomicOpInc : public TypedAtomicOpFunctor<T> 142{ 143 public: 144 AtomicOpInc() { } 145 void execute(T *b) { *b += 1; } 146 AtomicOpFunctor* clone () { return new AtomicOpInc(); } 147}; 148 149template<typename T> 150class AtomicOpDec : public TypedAtomicOpFunctor<T> 151{ 152 public: 153 AtomicOpDec() {} 154 void execute(T *b) { *b -= 1; } 155 AtomicOpFunctor* clone () { return new AtomicOpDec(); } 156}; 157 158template<typename T> 159class AtomicOpMax : public TypedAtomicOpFunctor<T> 160{ 161 public: 162 T a; 163 AtomicOpMax(T _a) : a(_a) { } 164 165 void 166 execute(T *b) 167 { 168 if (a > *b) 169 *b = a; 170 } 171 AtomicOpFunctor* clone () { return new AtomicOpMax(a); } 172}; 173 174template<typename T> 175class AtomicOpMin : public TypedAtomicOpFunctor<T> 176{ 177 public: 178 T a; 179 AtomicOpMin(T _a) : a(_a) {} 180 181 void 182 execute(T *b) 183 { 184 if (a < *b) 185 *b = a; 186 } 187 AtomicOpFunctor* clone () { return new AtomicOpMin(a); } 188}; 189 190typedef enum 191{ 192 VT_32, 193 VT_64, 194} vgpr_type; 195 196class GPUDynInst : public GPUExecContext 197{ 198 public: 199 GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, 200 uint64_t instSeqNum); 201 ~GPUDynInst(); 202 void execute(GPUDynInstPtr gpuDynInst); 203 int numSrcRegOperands(); 204 int numDstRegOperands(); 205 int getNumOperands(); 206 bool isVectorRegister(int operandIdx); 207 bool isScalarRegister(int operandIdx); 208 bool isCondRegister(int operandIdx); 209 int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst); 210 int getOperandSize(int operandIdx); 211 bool isDstOperand(int operandIdx); 212 bool isSrcOperand(int operandIdx); 213 214 const std::string &disassemble() const; 215 216 uint64_t seqNum() const; 217 218 Enums::StorageClassType executedAs(); 219 220 // The address of the memory operation 221 std::vector<Addr> addr; 222 Addr pAddr; 223 224 // The data to get written 225 uint8_t *d_data; 226 // Additional data (for atomics) 227 uint8_t *a_data; 228 // Additional data (for atomics) 229 uint8_t *x_data; 230 // The execution mask 231 VectorMask exec_mask; 232 233 // The memory type (M_U32, M_S32, ...) 234 Enums::MemType m_type; 235 236 // The equivalency class 237 int equiv; 238 // The return VGPR type (VT_32 or VT_64) 239 vgpr_type v_type; 240 // Number of VGPR's accessed (1, 2, or 4) 241 int n_reg; 242 // The return VGPR index 243 int dst_reg; 244 // There can be max 4 dest regs> 245 int dst_reg_vec[4]; 246 // SIMD where the WF of the memory instruction has been mapped to 247 int simdId; 248 // unique id of the WF where the memory instruction belongs to 249 int wfDynId; 250 // The kernel id of the requesting wf 251 int kern_id; 252 // The CU id of the requesting wf 253 int cu_id; 254 // HW slot id where the WF is mapped to inside a SIMD unit 255 int wfSlotId; 256 // execution pipeline id where the memory instruction has been scheduled 257 int pipeId; 258 // The execution time of this operation 259 Tick time; 260 // The latency of this operation 261 WaitClass latency; 262 // A list of bank conflicts for the 4 cycles. 263 uint32_t bc[4]; 264 265 // A pointer to ROM 266 uint8_t *rom; 267 // The size of the READONLY segment 268 int sz_rom; 269 270 // Initiate the specified memory operation, by creating a 271 // memory request and sending it off to the memory system. 272 void initiateAcc(GPUDynInstPtr gpuDynInst); 273 // Complete the specified memory operation, by writing 274 // value back to the RF in the case of a load or atomic 275 // return or, in the case of a store, we do nothing 276 void completeAcc(GPUDynInstPtr gpuDynInst); 277 278 void updateStats(); 279 280 GPUStaticInst* staticInstruction() { return _staticInst; } 281 282 bool isALU() const; 283 bool isBranch() const; 284 bool isNop() const; 285 bool isReturn() const; 286 bool isUnconditionalJump() const; 287 bool isSpecialOp() const; 288 bool isWaitcnt() const; 289 290 bool isBarrier() const; 291 bool isMemFence() const; 292 bool isMemRef() const; 293 bool isFlat() const; 294 bool isLoad() const; 295 bool isStore() const; 296 297 bool isAtomic() const; 298 bool isAtomicNoRet() const; 299 bool isAtomicRet() const; 300 301 bool isScalar() const; 302 bool readsSCC() const; 303 bool writesSCC() const; 304 bool readsVCC() const; 305 bool writesVCC() const; 306 307 bool isAtomicAnd() const; 308 bool isAtomicOr() const; 309 bool isAtomicXor() const; 310 bool isAtomicCAS() const; 311 bool isAtomicExch() const; 312 bool isAtomicAdd() const; 313 bool isAtomicSub() const; 314 bool isAtomicInc() const; 315 bool isAtomicDec() const; 316 bool isAtomicMax() const; 317 bool isAtomicMin() const; 318 319 bool isArgLoad() const; 320 bool isGlobalMem() const; 321 bool isLocalMem() const; 322 323 bool isArgSeg() const; 324 bool isGlobalSeg() const; 325 bool isGroupSeg() const; 326 bool isKernArgSeg() const; 327 bool isPrivateSeg() const; 328 bool isReadOnlySeg() const; 329 bool isSpillSeg() const; 330 331 bool isWorkitemScope() const; 332 bool isWavefrontScope() const; 333 bool isWorkgroupScope() const; 334 bool isDeviceScope() const; 335 bool isSystemScope() const; 336 bool isNoScope() const; 337 338 bool isRelaxedOrder() const; 339 bool isAcquire() const; 340 bool isRelease() const; 341 bool isAcquireRelease() const; 342 bool isNoOrder() const; 343 344 bool isGloballyCoherent() const; 345 bool isSystemCoherent() const; 346 347 /* 348 * Loads/stores/atomics may have acquire/release semantics associated 349 * withthem. Some protocols want to see the acquire/release as separate 350 * requests from the load/store/atomic. We implement that separation 351 * using continuations (i.e., a function pointer with an object associated 352 * with it). When, for example, the front-end generates a store with 353 * release semantics, we will first issue a normal store and set the 354 * continuation in the GPUDynInst to a function that generate a 355 * release request. That continuation will be called when the normal 356 * store completes (in ComputeUnit::DataPort::recvTimingResponse). The 357 * continuation will be called in the context of the same GPUDynInst 358 * that generated the initial store. 359 */ 360 std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation; 361 362 // when true, call execContinuation when response arrives 363 bool useContinuation; 364 365 template<typename c0> AtomicOpFunctor* 366 makeAtomicOpFunctor(c0 *reg0, c0 *reg1) 367 { 368 if (isAtomicAnd()) { 369 return new AtomicOpAnd<c0>(*reg0); 370 } else if (isAtomicOr()) { 371 return new AtomicOpOr<c0>(*reg0); 372 } else if (isAtomicXor()) { 373 return new AtomicOpXor<c0>(*reg0); 374 } else if (isAtomicCAS()) { 375 return new AtomicOpCAS<c0>(*reg0, *reg1, cu); 376 } else if (isAtomicExch()) { 377 return new AtomicOpExch<c0>(*reg0); 378 } else if (isAtomicAdd()) { 379 return new AtomicOpAdd<c0>(*reg0); 380 } else if (isAtomicSub()) { 381 return new AtomicOpSub<c0>(*reg0); 382 } else if (isAtomicInc()) { 383 return new AtomicOpInc<c0>(); 384 } else if (isAtomicDec()) { 385 return new AtomicOpDec<c0>(); 386 } else if (isAtomicMax()) { 387 return new AtomicOpMax<c0>(*reg0); 388 } else if (isAtomicMin()) { 389 return new AtomicOpMin<c0>(*reg0); 390 } else { 391 fatal("Unrecognized atomic operation"); 392 } 393 } 394 395 void 396 setRequestFlags(RequestPtr req, bool setMemOrder=true) 397 { 398 // currently these are the easy scopes to deduce 399 if (isPrivateSeg()) { 400 req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); 401 } else if (isSpillSeg()) { 402 req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); 403 } else if (isGlobalSeg()) { 404 req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); 405 } else if (isReadOnlySeg()) { 406 req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); 407 } else if (isGroupSeg()) { 408 req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); 409 } else if (isFlat()) {
| 43#include "enums/MemType.hh" 44#include "enums/StorageClassType.hh" 45#include "gpu-compute/compute_unit.hh" 46#include "gpu-compute/gpu_exec_context.hh" 47 48class GPUStaticInst; 49 50template<typename T> 51class AtomicOpAnd : public TypedAtomicOpFunctor<T> 52{ 53 public: 54 T a; 55 56 AtomicOpAnd(T _a) : a(_a) { } 57 void execute(T *b) { *b &= a; } 58 AtomicOpFunctor* clone () { return new AtomicOpAnd(a); } 59}; 60 61template<typename T> 62class AtomicOpOr : public TypedAtomicOpFunctor<T> 63{ 64 public: 65 T a; 66 AtomicOpOr(T _a) : a(_a) { } 67 void execute(T *b) { *b |= a; } 68 AtomicOpFunctor* clone () { return new AtomicOpOr(a); } 69}; 70 71template<typename T> 72class AtomicOpXor : public TypedAtomicOpFunctor<T> 73{ 74 public: 75 T a; 76 AtomicOpXor(T _a) : a(_a) {} 77 void execute(T *b) { *b ^= a; } 78 AtomicOpFunctor* clone () { return new AtomicOpXor(a); } 79}; 80 81template<typename T> 82class AtomicOpCAS : public TypedAtomicOpFunctor<T> 83{ 84 public: 85 T c; 86 T s; 87 88 ComputeUnit *computeUnit; 89 90 AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit) 91 : c(_c), s(_s), computeUnit(compute_unit) { } 92 93 void 94 execute(T *b) 95 { 96 computeUnit->numCASOps++; 97 98 if (*b == c) { 99 *b = s; 100 } else { 101 computeUnit->numFailedCASOps++; 102 } 103 104 if (computeUnit->xact_cas_mode) { 105 computeUnit->xactCasLoadMap.clear(); 106 } 107 } 108 AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); } 109}; 110 111template<typename T> 112class AtomicOpExch : public TypedAtomicOpFunctor<T> 113{ 114 public: 115 T a; 116 AtomicOpExch(T _a) : a(_a) { } 117 void execute(T *b) { *b = a; } 118 AtomicOpFunctor* clone () { return new AtomicOpExch(a); } 119}; 120 121template<typename T> 122class AtomicOpAdd : public TypedAtomicOpFunctor<T> 123{ 124 public: 125 T a; 126 AtomicOpAdd(T _a) : a(_a) { } 127 void execute(T *b) { *b += a; } 128 AtomicOpFunctor* clone () { return new AtomicOpAdd(a); } 129}; 130 131template<typename T> 132class AtomicOpSub : public TypedAtomicOpFunctor<T> 133{ 134 public: 135 T a; 136 AtomicOpSub(T _a) : a(_a) { } 137 void execute(T *b) { *b -= a; } 138 AtomicOpFunctor* clone () { return new AtomicOpSub(a); } 139}; 140 141template<typename T> 142class AtomicOpInc : public TypedAtomicOpFunctor<T> 143{ 144 public: 145 AtomicOpInc() { } 146 void execute(T *b) { *b += 1; } 147 AtomicOpFunctor* clone () { return new AtomicOpInc(); } 148}; 149 150template<typename T> 151class AtomicOpDec : public TypedAtomicOpFunctor<T> 152{ 153 public: 154 AtomicOpDec() {} 155 void execute(T *b) { *b -= 1; } 156 AtomicOpFunctor* clone () { return new AtomicOpDec(); } 157}; 158 159template<typename T> 160class AtomicOpMax : public TypedAtomicOpFunctor<T> 161{ 162 public: 163 T a; 164 AtomicOpMax(T _a) : a(_a) { } 165 166 void 167 execute(T *b) 168 { 169 if (a > *b) 170 *b = a; 171 } 172 AtomicOpFunctor* clone () { return new AtomicOpMax(a); } 173}; 174 175template<typename T> 176class AtomicOpMin : public TypedAtomicOpFunctor<T> 177{ 178 public: 179 T a; 180 AtomicOpMin(T _a) : a(_a) {} 181 182 void 183 execute(T *b) 184 { 185 if (a < *b) 186 *b = a; 187 } 188 AtomicOpFunctor* clone () { return new AtomicOpMin(a); } 189}; 190 191typedef enum 192{ 193 VT_32, 194 VT_64, 195} vgpr_type; 196 197class GPUDynInst : public GPUExecContext 198{ 199 public: 200 GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, 201 uint64_t instSeqNum); 202 ~GPUDynInst(); 203 void execute(GPUDynInstPtr gpuDynInst); 204 int numSrcRegOperands(); 205 int numDstRegOperands(); 206 int getNumOperands(); 207 bool isVectorRegister(int operandIdx); 208 bool isScalarRegister(int operandIdx); 209 bool isCondRegister(int operandIdx); 210 int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst); 211 int getOperandSize(int operandIdx); 212 bool isDstOperand(int operandIdx); 213 bool isSrcOperand(int operandIdx); 214 215 const std::string &disassemble() const; 216 217 uint64_t seqNum() const; 218 219 Enums::StorageClassType executedAs(); 220 221 // The address of the memory operation 222 std::vector<Addr> addr; 223 Addr pAddr; 224 225 // The data to get written 226 uint8_t *d_data; 227 // Additional data (for atomics) 228 uint8_t *a_data; 229 // Additional data (for atomics) 230 uint8_t *x_data; 231 // The execution mask 232 VectorMask exec_mask; 233 234 // The memory type (M_U32, M_S32, ...) 235 Enums::MemType m_type; 236 237 // The equivalency class 238 int equiv; 239 // The return VGPR type (VT_32 or VT_64) 240 vgpr_type v_type; 241 // Number of VGPR's accessed (1, 2, or 4) 242 int n_reg; 243 // The return VGPR index 244 int dst_reg; 245 // There can be max 4 dest regs> 246 int dst_reg_vec[4]; 247 // SIMD where the WF of the memory instruction has been mapped to 248 int simdId; 249 // unique id of the WF where the memory instruction belongs to 250 int wfDynId; 251 // The kernel id of the requesting wf 252 int kern_id; 253 // The CU id of the requesting wf 254 int cu_id; 255 // HW slot id where the WF is mapped to inside a SIMD unit 256 int wfSlotId; 257 // execution pipeline id where the memory instruction has been scheduled 258 int pipeId; 259 // The execution time of this operation 260 Tick time; 261 // The latency of this operation 262 WaitClass latency; 263 // A list of bank conflicts for the 4 cycles. 264 uint32_t bc[4]; 265 266 // A pointer to ROM 267 uint8_t *rom; 268 // The size of the READONLY segment 269 int sz_rom; 270 271 // Initiate the specified memory operation, by creating a 272 // memory request and sending it off to the memory system. 273 void initiateAcc(GPUDynInstPtr gpuDynInst); 274 // Complete the specified memory operation, by writing 275 // value back to the RF in the case of a load or atomic 276 // return or, in the case of a store, we do nothing 277 void completeAcc(GPUDynInstPtr gpuDynInst); 278 279 void updateStats(); 280 281 GPUStaticInst* staticInstruction() { return _staticInst; } 282 283 bool isALU() const; 284 bool isBranch() const; 285 bool isNop() const; 286 bool isReturn() const; 287 bool isUnconditionalJump() const; 288 bool isSpecialOp() const; 289 bool isWaitcnt() const; 290 291 bool isBarrier() const; 292 bool isMemFence() const; 293 bool isMemRef() const; 294 bool isFlat() const; 295 bool isLoad() const; 296 bool isStore() const; 297 298 bool isAtomic() const; 299 bool isAtomicNoRet() const; 300 bool isAtomicRet() const; 301 302 bool isScalar() const; 303 bool readsSCC() const; 304 bool writesSCC() const; 305 bool readsVCC() const; 306 bool writesVCC() const; 307 308 bool isAtomicAnd() const; 309 bool isAtomicOr() const; 310 bool isAtomicXor() const; 311 bool isAtomicCAS() const; 312 bool isAtomicExch() const; 313 bool isAtomicAdd() const; 314 bool isAtomicSub() const; 315 bool isAtomicInc() const; 316 bool isAtomicDec() const; 317 bool isAtomicMax() const; 318 bool isAtomicMin() const; 319 320 bool isArgLoad() const; 321 bool isGlobalMem() const; 322 bool isLocalMem() const; 323 324 bool isArgSeg() const; 325 bool isGlobalSeg() const; 326 bool isGroupSeg() const; 327 bool isKernArgSeg() const; 328 bool isPrivateSeg() const; 329 bool isReadOnlySeg() const; 330 bool isSpillSeg() const; 331 332 bool isWorkitemScope() const; 333 bool isWavefrontScope() const; 334 bool isWorkgroupScope() const; 335 bool isDeviceScope() const; 336 bool isSystemScope() const; 337 bool isNoScope() const; 338 339 bool isRelaxedOrder() const; 340 bool isAcquire() const; 341 bool isRelease() const; 342 bool isAcquireRelease() const; 343 bool isNoOrder() const; 344 345 bool isGloballyCoherent() const; 346 bool isSystemCoherent() const; 347 348 /* 349 * Loads/stores/atomics may have acquire/release semantics associated 350 * withthem. Some protocols want to see the acquire/release as separate 351 * requests from the load/store/atomic. We implement that separation 352 * using continuations (i.e., a function pointer with an object associated 353 * with it). When, for example, the front-end generates a store with 354 * release semantics, we will first issue a normal store and set the 355 * continuation in the GPUDynInst to a function that generate a 356 * release request. That continuation will be called when the normal 357 * store completes (in ComputeUnit::DataPort::recvTimingResponse). The 358 * continuation will be called in the context of the same GPUDynInst 359 * that generated the initial store. 360 */ 361 std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation; 362 363 // when true, call execContinuation when response arrives 364 bool useContinuation; 365 366 template<typename c0> AtomicOpFunctor* 367 makeAtomicOpFunctor(c0 *reg0, c0 *reg1) 368 { 369 if (isAtomicAnd()) { 370 return new AtomicOpAnd<c0>(*reg0); 371 } else if (isAtomicOr()) { 372 return new AtomicOpOr<c0>(*reg0); 373 } else if (isAtomicXor()) { 374 return new AtomicOpXor<c0>(*reg0); 375 } else if (isAtomicCAS()) { 376 return new AtomicOpCAS<c0>(*reg0, *reg1, cu); 377 } else if (isAtomicExch()) { 378 return new AtomicOpExch<c0>(*reg0); 379 } else if (isAtomicAdd()) { 380 return new AtomicOpAdd<c0>(*reg0); 381 } else if (isAtomicSub()) { 382 return new AtomicOpSub<c0>(*reg0); 383 } else if (isAtomicInc()) { 384 return new AtomicOpInc<c0>(); 385 } else if (isAtomicDec()) { 386 return new AtomicOpDec<c0>(); 387 } else if (isAtomicMax()) { 388 return new AtomicOpMax<c0>(*reg0); 389 } else if (isAtomicMin()) { 390 return new AtomicOpMin<c0>(*reg0); 391 } else { 392 fatal("Unrecognized atomic operation"); 393 } 394 } 395 396 void 397 setRequestFlags(RequestPtr req, bool setMemOrder=true) 398 { 399 // currently these are the easy scopes to deduce 400 if (isPrivateSeg()) { 401 req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); 402 } else if (isSpillSeg()) { 403 req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); 404 } else if (isGlobalSeg()) { 405 req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); 406 } else if (isReadOnlySeg()) { 407 req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); 408 } else if (isGroupSeg()) { 409 req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); 410 } else if (isFlat()) {
|
410 // TODO: translate to correct scope 411 assert(false);
| 411 panic("TODO: translate to correct scope");
|
412 } else { 413 fatal("%s has bad segment type\n", disassemble()); 414 } 415 416 if (isWavefrontScope()) { 417 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 418 Request::WAVEFRONT_SCOPE); 419 } else if (isWorkgroupScope()) { 420 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 421 Request::WORKGROUP_SCOPE); 422 } else if (isDeviceScope()) { 423 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 424 Request::DEVICE_SCOPE); 425 } else if (isSystemScope()) { 426 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 427 Request::SYSTEM_SCOPE); 428 } else if (!isNoScope() && !isWorkitemScope()) { 429 fatal("%s has bad scope type\n", disassemble()); 430 } 431 432 if (setMemOrder) { 433 // set acquire and release flags 434 if (isAcquire()) { 435 req->setFlags(Request::ACQUIRE); 436 } else if (isRelease()) { 437 req->setFlags(Request::RELEASE); 438 } else if (isAcquireRelease()) { 439 req->setFlags(Request::ACQUIRE | Request::RELEASE); 440 } else if (!isNoOrder()) { 441 fatal("%s has bad memory order\n", disassemble()); 442 } 443 } 444 445 // set atomic type 446 // currently, the instruction genenerator only produces atomic return 447 // but a magic instruction can produce atomic no return 448 if (isAtomicRet()) { 449 req->setFlags(Request::ATOMIC_RETURN_OP); 450 } else if (isAtomicNoRet()) { 451 req->setFlags(Request::ATOMIC_NO_RETURN_OP); 452 } 453 } 454 455 // Map returned packets and the addresses they satisfy with which lane they 456 // were requested from 457 typedef std::unordered_map<Addr, std::vector<int>> StatusVector; 458 StatusVector memStatusVector; 459 460 // Track the status of memory requests per lane, a bit per lane 461 VectorMask statusBitVector; 462 // for ld_v# or st_v# 463 std::vector<int> statusVector; 464 std::vector<int> tlbHitLevel; 465 466 private: 467 GPUStaticInst *_staticInst; 468 uint64_t _seqNum; 469}; 470 471#endif // __GPU_DYN_INST_HH__
| 412 } else { 413 fatal("%s has bad segment type\n", disassemble()); 414 } 415 416 if (isWavefrontScope()) { 417 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 418 Request::WAVEFRONT_SCOPE); 419 } else if (isWorkgroupScope()) { 420 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 421 Request::WORKGROUP_SCOPE); 422 } else if (isDeviceScope()) { 423 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 424 Request::DEVICE_SCOPE); 425 } else if (isSystemScope()) { 426 req->setMemSpaceConfigFlags(Request::SCOPE_VALID | 427 Request::SYSTEM_SCOPE); 428 } else if (!isNoScope() && !isWorkitemScope()) { 429 fatal("%s has bad scope type\n", disassemble()); 430 } 431 432 if (setMemOrder) { 433 // set acquire and release flags 434 if (isAcquire()) { 435 req->setFlags(Request::ACQUIRE); 436 } else if (isRelease()) { 437 req->setFlags(Request::RELEASE); 438 } else if (isAcquireRelease()) { 439 req->setFlags(Request::ACQUIRE | Request::RELEASE); 440 } else if (!isNoOrder()) { 441 fatal("%s has bad memory order\n", disassemble()); 442 } 443 } 444 445 // set atomic type 446 // currently, the instruction genenerator only produces atomic return 447 // but a magic instruction can produce atomic no return 448 if (isAtomicRet()) { 449 req->setFlags(Request::ATOMIC_RETURN_OP); 450 } else if (isAtomicNoRet()) { 451 req->setFlags(Request::ATOMIC_NO_RETURN_OP); 452 } 453 } 454 455 // Map returned packets and the addresses they satisfy with which lane they 456 // were requested from 457 typedef std::unordered_map<Addr, std::vector<int>> StatusVector; 458 StatusVector memStatusVector; 459 460 // Track the status of memory requests per lane, a bit per lane 461 VectorMask statusBitVector; 462 // for ld_v# or st_v# 463 std::vector<int> statusVector; 464 std::vector<int> tlbHitLevel; 465 466 private: 467 GPUStaticInst *_staticInst; 468 uint64_t _seqNum; 469}; 470 471#endif // __GPU_DYN_INST_HH__
|