Deleted Added
sdiff udiff text old ( 12889:6d4515549710 ) new ( 13449:2f7efa89c58b )
full compact
1/*
2 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Authors: Anthony Gutierrez
34 */
35
36#ifndef __GPU_DYN_INST_HH__
37#define __GPU_DYN_INST_HH__
38
39#include <cstdint>
40#include <string>
41
42#include "enums/MemType.hh"
43#include "enums/StorageClassType.hh"
44#include "gpu-compute/compute_unit.hh"
45#include "gpu-compute/gpu_exec_context.hh"
46
47class GPUStaticInst;
48
49template<typename T>
50class AtomicOpAnd : public TypedAtomicOpFunctor<T>
51{
52 public:
53 T a;
54
55 AtomicOpAnd(T _a) : a(_a) { }
56 void execute(T *b) { *b &= a; }
57 AtomicOpFunctor* clone () { return new AtomicOpAnd(a); }
58};
59
60template<typename T>
61class AtomicOpOr : public TypedAtomicOpFunctor<T>
62{
63 public:
64 T a;
65 AtomicOpOr(T _a) : a(_a) { }
66 void execute(T *b) { *b |= a; }
67 AtomicOpFunctor* clone () { return new AtomicOpOr(a); }
68};
69
70template<typename T>
71class AtomicOpXor : public TypedAtomicOpFunctor<T>
72{
73 public:
74 T a;
75 AtomicOpXor(T _a) : a(_a) {}
76 void execute(T *b) { *b ^= a; }
77 AtomicOpFunctor* clone () { return new AtomicOpXor(a); }
78};
79
80template<typename T>
81class AtomicOpCAS : public TypedAtomicOpFunctor<T>
82{
83 public:
84 T c;
85 T s;
86
87 ComputeUnit *computeUnit;
88
89 AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
90 : c(_c), s(_s), computeUnit(compute_unit) { }
91
92 void
93 execute(T *b)
94 {
95 computeUnit->numCASOps++;
96
97 if (*b == c) {
98 *b = s;
99 } else {
100 computeUnit->numFailedCASOps++;
101 }
102
103 if (computeUnit->xact_cas_mode) {
104 computeUnit->xactCasLoadMap.clear();
105 }
106 }
107 AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
108};
109
110template<typename T>
111class AtomicOpExch : public TypedAtomicOpFunctor<T>
112{
113 public:
114 T a;
115 AtomicOpExch(T _a) : a(_a) { }
116 void execute(T *b) { *b = a; }
117 AtomicOpFunctor* clone () { return new AtomicOpExch(a); }
118};
119
120template<typename T>
121class AtomicOpAdd : public TypedAtomicOpFunctor<T>
122{
123 public:
124 T a;
125 AtomicOpAdd(T _a) : a(_a) { }
126 void execute(T *b) { *b += a; }
127 AtomicOpFunctor* clone () { return new AtomicOpAdd(a); }
128};
129
130template<typename T>
131class AtomicOpSub : public TypedAtomicOpFunctor<T>
132{
133 public:
134 T a;
135 AtomicOpSub(T _a) : a(_a) { }
136 void execute(T *b) { *b -= a; }
137 AtomicOpFunctor* clone () { return new AtomicOpSub(a); }
138};
139
140template<typename T>
141class AtomicOpInc : public TypedAtomicOpFunctor<T>
142{
143 public:
144 AtomicOpInc() { }
145 void execute(T *b) { *b += 1; }
146 AtomicOpFunctor* clone () { return new AtomicOpInc(); }
147};
148
149template<typename T>
150class AtomicOpDec : public TypedAtomicOpFunctor<T>
151{
152 public:
153 AtomicOpDec() {}
154 void execute(T *b) { *b -= 1; }
155 AtomicOpFunctor* clone () { return new AtomicOpDec(); }
156};
157
158template<typename T>
159class AtomicOpMax : public TypedAtomicOpFunctor<T>
160{
161 public:
162 T a;
163 AtomicOpMax(T _a) : a(_a) { }
164
165 void
166 execute(T *b)
167 {
168 if (a > *b)
169 *b = a;
170 }
171 AtomicOpFunctor* clone () { return new AtomicOpMax(a); }
172};
173
174template<typename T>
175class AtomicOpMin : public TypedAtomicOpFunctor<T>
176{
177 public:
178 T a;
179 AtomicOpMin(T _a) : a(_a) {}
180
181 void
182 execute(T *b)
183 {
184 if (a < *b)
185 *b = a;
186 }
187 AtomicOpFunctor* clone () { return new AtomicOpMin(a); }
188};
189
// Selector for the return VGPR type of a memory instruction (stored in
// GPUDynInst::v_type). The names suggest 32- and 64-bit registers --
// NOTE(review): confirm the exact widths against the users of this enum.
enum vgpr_type
{
    VT_32 = 0,
    VT_64 = 1
};
195
196class GPUDynInst : public GPUExecContext
197{
198 public:
199 GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
200 uint64_t instSeqNum);
201 ~GPUDynInst();
202 void execute(GPUDynInstPtr gpuDynInst);
203 int numSrcRegOperands();
204 int numDstRegOperands();
205 int getNumOperands();
206 bool isVectorRegister(int operandIdx);
207 bool isScalarRegister(int operandIdx);
208 bool isCondRegister(int operandIdx);
209 int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
210 int getOperandSize(int operandIdx);
211 bool isDstOperand(int operandIdx);
212 bool isSrcOperand(int operandIdx);
213
214 const std::string &disassemble() const;
215
216 uint64_t seqNum() const;
217
218 Enums::StorageClassType executedAs();
219
220 // The address of the memory operation
221 std::vector<Addr> addr;
222 Addr pAddr;
223
224 // The data to get written
225 uint8_t *d_data;
226 // Additional data (for atomics)
227 uint8_t *a_data;
228 // Additional data (for atomics)
229 uint8_t *x_data;
230 // The execution mask
231 VectorMask exec_mask;
232
233 // The memory type (M_U32, M_S32, ...)
234 Enums::MemType m_type;
235
236 // The equivalency class
237 int equiv;
238 // The return VGPR type (VT_32 or VT_64)
239 vgpr_type v_type;
240 // Number of VGPR's accessed (1, 2, or 4)
241 int n_reg;
242 // The return VGPR index
243 int dst_reg;
244 // There can be max 4 dest regs>
245 int dst_reg_vec[4];
246 // SIMD where the WF of the memory instruction has been mapped to
247 int simdId;
248 // unique id of the WF where the memory instruction belongs to
249 int wfDynId;
250 // The kernel id of the requesting wf
251 int kern_id;
252 // The CU id of the requesting wf
253 int cu_id;
254 // HW slot id where the WF is mapped to inside a SIMD unit
255 int wfSlotId;
256 // execution pipeline id where the memory instruction has been scheduled
257 int pipeId;
258 // The execution time of this operation
259 Tick time;
260 // The latency of this operation
261 WaitClass latency;
262 // A list of bank conflicts for the 4 cycles.
263 uint32_t bc[4];
264
265 // A pointer to ROM
266 uint8_t *rom;
267 // The size of the READONLY segment
268 int sz_rom;
269
270 // Initiate the specified memory operation, by creating a
271 // memory request and sending it off to the memory system.
272 void initiateAcc(GPUDynInstPtr gpuDynInst);
273 // Complete the specified memory operation, by writing
274 // value back to the RF in the case of a load or atomic
275 // return or, in the case of a store, we do nothing
276 void completeAcc(GPUDynInstPtr gpuDynInst);
277
278 void updateStats();
279
280 GPUStaticInst* staticInstruction() { return _staticInst; }
281
282 bool isALU() const;
283 bool isBranch() const;
284 bool isNop() const;
285 bool isReturn() const;
286 bool isUnconditionalJump() const;
287 bool isSpecialOp() const;
288 bool isWaitcnt() const;
289
290 bool isBarrier() const;
291 bool isMemFence() const;
292 bool isMemRef() const;
293 bool isFlat() const;
294 bool isLoad() const;
295 bool isStore() const;
296
297 bool isAtomic() const;
298 bool isAtomicNoRet() const;
299 bool isAtomicRet() const;
300
301 bool isScalar() const;
302 bool readsSCC() const;
303 bool writesSCC() const;
304 bool readsVCC() const;
305 bool writesVCC() const;
306
307 bool isAtomicAnd() const;
308 bool isAtomicOr() const;
309 bool isAtomicXor() const;
310 bool isAtomicCAS() const;
311 bool isAtomicExch() const;
312 bool isAtomicAdd() const;
313 bool isAtomicSub() const;
314 bool isAtomicInc() const;
315 bool isAtomicDec() const;
316 bool isAtomicMax() const;
317 bool isAtomicMin() const;
318
319 bool isArgLoad() const;
320 bool isGlobalMem() const;
321 bool isLocalMem() const;
322
323 bool isArgSeg() const;
324 bool isGlobalSeg() const;
325 bool isGroupSeg() const;
326 bool isKernArgSeg() const;
327 bool isPrivateSeg() const;
328 bool isReadOnlySeg() const;
329 bool isSpillSeg() const;
330
331 bool isWorkitemScope() const;
332 bool isWavefrontScope() const;
333 bool isWorkgroupScope() const;
334 bool isDeviceScope() const;
335 bool isSystemScope() const;
336 bool isNoScope() const;
337
338 bool isRelaxedOrder() const;
339 bool isAcquire() const;
340 bool isRelease() const;
341 bool isAcquireRelease() const;
342 bool isNoOrder() const;
343
344 bool isGloballyCoherent() const;
345 bool isSystemCoherent() const;
346
347 /*
348 * Loads/stores/atomics may have acquire/release semantics associated
349 * withthem. Some protocols want to see the acquire/release as separate
350 * requests from the load/store/atomic. We implement that separation
351 * using continuations (i.e., a function pointer with an object associated
352 * with it). When, for example, the front-end generates a store with
353 * release semantics, we will first issue a normal store and set the
354 * continuation in the GPUDynInst to a function that generate a
355 * release request. That continuation will be called when the normal
356 * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
357 * continuation will be called in the context of the same GPUDynInst
358 * that generated the initial store.
359 */
360 std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
361
362 // when true, call execContinuation when response arrives
363 bool useContinuation;
364
365 template<typename c0> AtomicOpFunctor*
366 makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
367 {
368 if (isAtomicAnd()) {
369 return new AtomicOpAnd<c0>(*reg0);
370 } else if (isAtomicOr()) {
371 return new AtomicOpOr<c0>(*reg0);
372 } else if (isAtomicXor()) {
373 return new AtomicOpXor<c0>(*reg0);
374 } else if (isAtomicCAS()) {
375 return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
376 } else if (isAtomicExch()) {
377 return new AtomicOpExch<c0>(*reg0);
378 } else if (isAtomicAdd()) {
379 return new AtomicOpAdd<c0>(*reg0);
380 } else if (isAtomicSub()) {
381 return new AtomicOpSub<c0>(*reg0);
382 } else if (isAtomicInc()) {
383 return new AtomicOpInc<c0>();
384 } else if (isAtomicDec()) {
385 return new AtomicOpDec<c0>();
386 } else if (isAtomicMax()) {
387 return new AtomicOpMax<c0>(*reg0);
388 } else if (isAtomicMin()) {
389 return new AtomicOpMin<c0>(*reg0);
390 } else {
391 fatal("Unrecognized atomic operation");
392 }
393 }
394
395 void
396 setRequestFlags(RequestPtr req, bool setMemOrder=true)
397 {
398 // currently these are the easy scopes to deduce
399 if (isPrivateSeg()) {
400 req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
401 } else if (isSpillSeg()) {
402 req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
403 } else if (isGlobalSeg()) {
404 req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
405 } else if (isReadOnlySeg()) {
406 req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
407 } else if (isGroupSeg()) {
408 req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
409 } else if (isFlat()) {
410 // TODO: translate to correct scope
411 assert(false);
412 } else {
413 fatal("%s has bad segment type\n", disassemble());
414 }
415
416 if (isWavefrontScope()) {
417 req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
418 Request::WAVEFRONT_SCOPE);
419 } else if (isWorkgroupScope()) {
420 req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
421 Request::WORKGROUP_SCOPE);
422 } else if (isDeviceScope()) {
423 req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
424 Request::DEVICE_SCOPE);
425 } else if (isSystemScope()) {
426 req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
427 Request::SYSTEM_SCOPE);
428 } else if (!isNoScope() && !isWorkitemScope()) {
429 fatal("%s has bad scope type\n", disassemble());
430 }
431
432 if (setMemOrder) {
433 // set acquire and release flags
434 if (isAcquire()) {
435 req->setFlags(Request::ACQUIRE);
436 } else if (isRelease()) {
437 req->setFlags(Request::RELEASE);
438 } else if (isAcquireRelease()) {
439 req->setFlags(Request::ACQUIRE | Request::RELEASE);
440 } else if (!isNoOrder()) {
441 fatal("%s has bad memory order\n", disassemble());
442 }
443 }
444
445 // set atomic type
446 // currently, the instruction genenerator only produces atomic return
447 // but a magic instruction can produce atomic no return
448 if (isAtomicRet()) {
449 req->setFlags(Request::ATOMIC_RETURN_OP);
450 } else if (isAtomicNoRet()) {
451 req->setFlags(Request::ATOMIC_NO_RETURN_OP);
452 }
453 }
454
455 // Map returned packets and the addresses they satisfy with which lane they
456 // were requested from
457 typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
458 StatusVector memStatusVector;
459
460 // Track the status of memory requests per lane, a bit per lane
461 VectorMask statusBitVector;
462 // for ld_v# or st_v#
463 std::vector<int> statusVector;
464 std::vector<int> tlbHitLevel;
465
466 private:
467 GPUStaticInst *_staticInst;
468 uint64_t _seqNum;
469};
470
471#endif // __GPU_DYN_INST_HH__