gpu_dyn_inst.hh (11692:e772fdcd3809 → 11693:bc1f702c25b9)
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
};
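
// Illustrative sketch only (not part of the simulator source): how one of
// the typed functors above updates a memory word once the memory system
// applies it. The values and the compute_unit pointer are hypothetical.
//
//     uint32_t word = 10;
//     AtomicOpAdd<uint32_t> add_op(5);
//     add_op.execute(&word);                 // word == 15
//
//     AtomicOpCAS<uint32_t> cas_op(15, 0, compute_unit);
//     cas_op.execute(&word);                 // *b == c, so word becomes 0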

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    int getRegisterIndex(int operandIdx);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to be written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction has been mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting WF
    int kern_id;
    // The CU id of the requesting WF
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a memory
    // request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of a
    // store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;
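
    // Illustrative sketch only (hypothetical call site, not taken from the
    // ISA front-end): a store with release semantics could be split by
    // arming the continuation before the normal store is issued, e.g.:
    //
    //     gpu_dyn_inst->useContinuation = true;
    //     gpu_dyn_inst->execContinuation =
    //         [](GPUStaticInst *inst, GPUDynInstPtr dyn_inst) {
    //             // issue the separate release request for dyn_inst here
    //         };
    //
    // The continuation then runs when the normal store's response arrives
    // in ComputeUnit::DataPort::recvTimingResponse.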

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
            assert(false);
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map returned packets and the addresses they satisfy to the lanes
    // from which they were requested
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;
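
    // Illustrative example (hypothetical values): if lanes 0 and 3 of a
    // vector memory operation both access address 0x1000, then
    // memStatusVector[0x1000] == {0, 3}, and the returning packet for
    // 0x1000 can be matched back to those two lanes.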

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__