/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
    AtomicOpFunctor* clone() { return new AtomicOpAnd(a); }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
    AtomicOpFunctor* clone() { return new AtomicOpOr(a); }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) { }
    void execute(T *b) { *b ^= a; }
    AtomicOpFunctor* clone() { return new AtomicOpXor(a); }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone() { return new AtomicOpCAS(c, s, computeUnit); }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
    AtomicOpFunctor* clone() { return new AtomicOpExch(a); }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
    AtomicOpFunctor* clone() { return new AtomicOpAdd(a); }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
    AtomicOpFunctor* clone() { return new AtomicOpSub(a); }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
    AtomicOpFunctor* clone() { return new AtomicOpInc(); }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() { }
    void execute(T *b) { *b -= 1; }
    AtomicOpFunctor* clone() { return new AtomicOpDec(); }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
    AtomicOpFunctor* clone() { return new AtomicOpMax(a); }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
    AtomicOpFunctor* clone() { return new AtomicOpMin(a); }
};

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The addresses of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to be written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // SIMD unit to which the WF of the memory instruction has been mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting WF
    int kern_id;
    // The CU id of the requesting WF
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a memory
    // request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of
    // a store, nothing needs to be done.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;
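
    /*
     * Illustrative sketch only (the lambda below is hypothetical; real
     * continuations are installed by the instruction implementations):
     * a store-release could be split by issuing the plain store first and
     * deferring the release request to the continuation.
     *
     *     gpuDynInst->useContinuation = true;
     *     gpuDynInst->execContinuation =
     *         [](GPUStaticInst *staticInst, GPUDynInstPtr dynInst) {
     *             // issue the release request for dynInst here
     *         };
     */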

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
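
    /*
     * Usage sketch (hypothetical caller, not part of this header): reg0
     * supplies the primary operand, and reg1 is only meaningful for CAS,
     * where it provides the swap value. The resulting functor can then be
     * handed to the outgoing memory request for the memory system to apply.
     *
     *     uint32_t cmp = 0, swap = 1;
     *     AtomicOpFunctor *amo =
     *         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&cmp, &swap);
     */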

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy segments to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return,
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Map the addresses satisfied by returned packets to the lanes they
    // were requested from
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;
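
    // For example (values purely illustrative), a single request that
    // services lanes 0, 1, and 5 of the wavefront might be recorded as:
    //
    //     memStatusVector[0x1000] = {0, 1, 5};
    //
    // so the lanes can be cleared as the corresponding responses return.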

    // Track the status of memory requests per lane, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__