gpu_dyn_inst.hh revision 11308
/*
 * Copyright (c) 2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

#include "enums/GenericMemoryOrder.hh"
#include "enums/GenericMemoryScope.hh"
#include "enums/MemOpType.hh"
#include "enums/MemType.hh"
#include "enums/OpType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
};

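// Compare-and-swap functor: writes s into *b only when *b equals c. Every
// execution increments the owning ComputeUnit's numCASOps counter, failed
// swaps also bump numFailedCASOps, and in xact_cas_mode the unit's
// xactCasLoadMap is cleared after each CAS.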
template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
};

#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

typedef enum
{
    SEG_PRIVATE,
    SEG_SPILL,
    SEG_GLOBAL,
    SEG_SHARED,
    SEG_READONLY,
    SEG_FLAT
} seg_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
               uint64_t instSeqNum);

    void execute();
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    int getRegisterIndex(int operandIdx);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);
    bool isArgLoad();

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::OpType opType();
    Enums::StorageClassType executedAs();

    // The address of the memory operation
    Addr addr[VSZ];
    Addr pAddr;

    // The data to be written
    uint8_t d_data[VSZ * 16];
    // Additional data (for atomics)
    uint8_t a_data[VSZ * 8];
    // Additional data (for atomics)
    uint8_t x_data[VSZ * 8];
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;
    // The memory operation (MO_LD, MO_ST, ...)
    Enums::MemOpType m_op;
    Enums::GenericMemoryOrder memoryOrder;

    // Scope of the request
    Enums::GenericMemoryScope scope;
    // The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
    seg_type s_type;
    // The equivalency class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 destination registers
    int dst_reg_vec[4];
    // The SIMD unit to which the WF of this memory instruction is mapped
    int simdId;
    // Unique id of the WF to which this memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id to which the WF is mapped inside a SIMD unit
    int wfSlotId;
    // Execution pipeline id where the memory instruction has been scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return staticInst; }

    // Is the instruction a scalar or vector op?
    bool scalarOp() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object associated
     * with it). When, for example, the front-end generates a store with
     * release semantics, we will first issue a normal store and set the
     * continuation in the GPUDynInst to a function that generates a
     * release request. That continuation will be called when the normal
     * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
     * continuation will be called in the context of the same GPUDynInst
     * that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // When true, call execContinuation when the response arrives
    bool useContinuation;

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
    {
        using namespace Enums;

        switch (op) {
          case MO_AAND:
          case MO_ANRAND:
            return new AtomicOpAnd<c0>(*reg0);
          case MO_AOR:
          case MO_ANROR:
            return new AtomicOpOr<c0>(*reg0);
          case MO_AXOR:
          case MO_ANRXOR:
            return new AtomicOpXor<c0>(*reg0);
          case MO_ACAS:
          case MO_ANRCAS:
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
          case MO_AEXCH:
          case MO_ANREXCH:
            return new AtomicOpExch<c0>(*reg0);
          case MO_AADD:
          case MO_ANRADD:
            return new AtomicOpAdd<c0>(*reg0);
          case MO_ASUB:
          case MO_ANRSUB:
            return new AtomicOpSub<c0>(*reg0);
          case MO_AINC:
          case MO_ANRINC:
            return new AtomicOpInc<c0>();
          case MO_ADEC:
          case MO_ANRDEC:
            return new AtomicOpDec<c0>();
          case MO_AMAX:
          case MO_ANRMAX:
            return new AtomicOpMax<c0>(*reg0);
          case MO_AMIN:
          case MO_ANRMIN:
            return new AtomicOpMin<c0>(*reg0);
          default:
            panic("Unrecognized atomic operation");
        }
    }
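
    // Illustrative use of makeAtomicOpFunctor (a sketch; the variable names
    // are assumptions, not code from this file): for a 32-bit atomic add,
    //
    //     uint32_t src = ...;
    //     AtomicOpFunctor *amo_op =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&src, nullptr,
    //                                                   Enums::MO_AADD);
    //
    // returns an AtomicOpAdd<uint32_t> whose execute() adds src into the
    // value in memory. reg1 is only dereferenced for CAS, which needs both
    // a compare value (reg0) and a swap value (reg1).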

    void
    setRequestFlags(Request *req, bool setMemOrder=true)
    {
        // currently these are the easy scopes to deduce
        switch (s_type) {
          case SEG_PRIVATE:
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
            break;
          case SEG_SPILL:
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
            break;
          case SEG_GLOBAL:
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
            break;
          case SEG_READONLY:
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
            break;
          case SEG_SHARED:
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
            break;
          case SEG_FLAT:
            // TODO: translate to correct scope
            assert(false);
          default:
            panic("Bad segment type");
            break;
        }

        switch (scope) {
          case Enums::MEMORY_SCOPE_NONE:
          case Enums::MEMORY_SCOPE_WORKITEM:
            break;
          case Enums::MEMORY_SCOPE_WAVEFRONT:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_WORKGROUP:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_DEVICE:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
            break;
          case Enums::MEMORY_SCOPE_SYSTEM:
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
            break;
          default:
            panic("Bad scope type");
            break;
        }

        if (setMemOrder) {
            // set acquire and release flags
            switch (memoryOrder) {
              case Enums::MEMORY_ORDER_SC_ACQUIRE:
                req->setFlags(Request::ACQUIRE);
                break;
              case Enums::MEMORY_ORDER_SC_RELEASE:
                req->setFlags(Request::RELEASE);
                break;
              case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
                break;
              default:
                break;
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
            m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
            m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
            m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
            m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
            m_op == Enums::MO_ACAS) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
                   m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
                   m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
                   m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
                   m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
                   m_op == Enums::MO_ANRCAS) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
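
    // Worked example of the flag settings above: a global-segment access
    // with device scope and SC-acquire ordering gets the GLOBAL_SEGMENT,
    // SCOPE_VALID, and DEVICE_SCOPE memory space config flags plus the
    // ACQUIRE request flag; an MO_AADD atomic additionally gets
    // ATOMIC_RETURN_OP, while its MO_ANRADD counterpart gets
    // ATOMIC_NO_RETURN_OP.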

    // Maps the address satisfied by each returned packet to the lanes of
    // this instruction that requested it
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;
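    // e.g., an entry mapping some address to {0, 3} records that lanes 0
    // and 3 are waiting on the response for that address (illustrative).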

    // Track the status of memory requests per lane, a bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__