gpu_dyn_inst.hh revision 12889
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
    AtomicOpFunctor* clone() { return new AtomicOpAnd(a); }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
    AtomicOpFunctor* clone() { return new AtomicOpOr(a); }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
    AtomicOpFunctor* clone() { return new AtomicOpXor(a); }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};
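
// A minimal sketch of the compare-and-swap semantics implemented above. The
// names mem_val and compute_unit are illustrative only; compute_unit stands
// for a ComputeUnit* that is needed solely for the statistics updates in
// execute().
//
//     uint32_t mem_val = 7;
//     AtomicOpCAS<uint32_t> cas(7, 9, compute_unit);
//     cas.execute(&mem_val);   // *b == c, so mem_val is swapped to 9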

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
    AtomicOpFunctor* clone() { return new AtomicOpExch(a); }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
    AtomicOpFunctor* clone() { return new AtomicOpAdd(a); }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
    AtomicOpFunctor* clone() { return new AtomicOpSub(a); }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
    AtomicOpFunctor* clone() { return new AtomicOpInc(); }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
    AtomicOpFunctor* clone() { return new AtomicOpDec(); }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
    AtomicOpFunctor* clone() { return new AtomicOpMax(a); }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
    AtomicOpFunctor* clone() { return new AtomicOpMin(a); }
};
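
// The functors above all implement the same TypedAtomicOpFunctor interface,
// so callers can treat them uniformly. A minimal sketch (values chosen purely
// for illustration):
//
//     uint32_t mem_val = 2;
//     AtomicOpMax<uint32_t> max_op(5);
//     max_op.execute(&mem_val);              // mem_val becomes max(2, 5) == 5
//
//     // clone() returns a heap-allocated copy owned by the caller, so the
//     // functor can outlive its creator (e.g., when handed to the memory
//     // system).
//     AtomicOpFunctor *amo = max_op.clone();
//     delete amo;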

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to be written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalence class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction is mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting WF
    int kern_id;
    // The CU id of the requesting WF
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of a
    // store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;
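
    // A minimal sketch of the continuation pattern described above. The
    // lambda body is illustrative only; the real continuation is installed
    // by the code that issues the store.
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *inst, GPUDynInstPtr dyn_inst)
    //         {
    //             // issue the release request for dyn_inst here
    //         };
    //
    // When the response for the original store arrives, the continuation is
    // invoked with the same GPUDynInst, which then issues the release.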

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
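
    // Example use (sketch): building the functor for a 32-bit atomic from the
    // operand values. The caller takes ownership of the returned functor, and
    // reg1 is only dereferenced for CAS operations. The names src0/src1 are
    // illustrative only.
    //
    //     uint32_t src0 = 1;
    //     uint32_t src1 = 0;
    //     AtomicOpFunctor *amo =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&src0, &src1);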

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy segments to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
            assert(false);
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return,
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
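
    // Typical use (sketch): the memory pipeline is assumed to have already
    // created the request; this call then tags it with segment, scope, order,
    // and atomic-type flags.
    //
    //     gpuDynInst->setRequestFlags(req);         // everything, including
    //                                               // acquire/release flags
    //     gpuDynInst->setRequestFlags(req, false);  // skip memory-order flags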

    // Maps each address touched by this instruction to the lanes that
    // requested it, so that returned packets can be matched back to the
    // lanes they satisfy.
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;
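
    // For example (illustrative values): if lanes 0 and 3 both issue requests
    // to address A, then memStatusVector[A] holds {0, 3}.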

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__