gpu_dyn_inst.hh revision 11693
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

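// Atomic op functors. An atomic memory instruction constructs one of these
// (see GPUDynInst::makeAtomicOpFunctor below) and hands it to the memory
// system, which applies execute() to the in-memory value pointed to by b.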
template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    // compare value and swap value
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        // swap only if the compare value matches the current memory value
        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        // in transactional CAS mode, clear the CAS loads the CU is tracking
        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
};

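// Width of the VGPR value returned by a memory instruction (see v_type below)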
typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    int getRegisterIndex(int operandIdx);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to be written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalence class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 dest regs
    int dst_reg_vec[4];
    // The SIMD unit to which the WF issuing the memory instruction is mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting WF
    int kern_id;
    // The CU id of the requesting WF
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id to which the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a memory
    // request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the value back
    // to the RF in the case of a load or atomic return; in the case of
    // a store, there is nothing to do.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation is called when the normal store
     * completes (in ComputeUnit::DataPort::recvTimingResponse), in the
     * context of the same GPUDynInst that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when the response arrives
    bool useContinuation;
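
    // A rough sketch of the pattern described above (illustrative only,
    // not an actual call site):
    //
    //   gpuDynInst->useContinuation = true;
    //   gpuDynInst->execContinuation =
    //       [](GPUStaticInst *si, GPUDynInstPtr di) {
    //           // issue the follow-up release request here
    //       };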
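    // Build the atomic op functor the memory system should apply for this
    // instruction. reg0 supplies the operand value (the compare value for
    // CAS); reg1 is only used by CAS, where it supplies the swap value.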
    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }

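    // Set the segment, synchronization scope, memory order (unless
    // setMemOrder is false), and atomic type flags on the request,
    // based on this instruction's attributes.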
    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy segments to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            // TODO: translate to correct scope
            assert(false);
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return,
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }

    // Maps the addresses satisfied by returned packets to the lanes
    // that requested them.
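    // For example, memStatusVector[vaddr] == {0, 5, 9} would mean that the
    // access to vaddr was issued on behalf of lanes 0, 5, and 9.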
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__