mem.hh revision 11699:c7453f485a5f
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
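    // Mixin that records the size (in bytes) of a memory instruction's
    // data type and a pointer to its address operand. The size is
    // derived from the Enums::MemType passed to the constructor; the
    // address operand is supplied later via init_addr().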
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };

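    // Base class for lda (load address) instructions. Operand 0 is the
    // destination register and operand 1 is the address operand; the
    // instruction is flagged as an ALU op because it only computes an
    // address and never accesses memory.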
    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(ALU);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly();

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                        const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };

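    // Decode an lda instruction. The kind of operand 1 (the address)
    // selects the address-operand template parameter: a plain address
    // uses NoRegAddrOperand, while a register base uses an s- or
    // d-register address operand. Vector (V2/V4) register bases are
    // not allowed here.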
    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n",
                      regDataType.regKind);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }

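    // Base class for ld-style instructions. The constructor extracts the
    // segment, memory order/scope, equivalence class, and width from
    // either a BrigInstMem (ld) or a BrigInstAtomic encoding, then
    // translates them into the corresponding instruction flags.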
    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Load);

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                width = ldst->width;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                equivClass = 0;

                width = BRIG_WIDTH_1;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("LdInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("LdInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_KERNARG:
                setFlag(KernArgSegment);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("Ld: segment %d not supported\n", segment);
            }
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };

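    // Concrete load instruction. A load may write up to four destination
    // registers when the destination is an operand list (vector load);
    // dest_vect[] holds the per-element destination operands and
    // num_dest_operands records how many are in use. initiateAcc()
    // issues the memory requests (or reads the LDS chunk for local
    // accesses) and completeAcc() writes the returned data back into
    // the physical VGPRs.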
    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;

                num_dest_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {

                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // load from shared memory
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
                                          gpuDynInst->computeUnit()->masterId(),
                                          0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->isAcquire()) {
                                // if this load has acquire semantics,
                                // set the response continuation function
                                // to perform an Acquire request
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;

                                gpuDynInst->useContinuation = true;
                            } else {
                                // the request will be finished when
                                // the load completes
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * this code essentially replaces the long if-else chain
             * that was used in GlobalMemPipeline::exec() to infer the
             * size (single/double) and type (floating point/integer) of
             * the destination register. this is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;

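            // for example, if c1 is uint8_t and is_vt_32 is true, c0
            // resolves to uint32_t; if c1 is int16_t, it resolves to
            // int32_t; if c1 is double and is_vt_32 is false, it
            // resolves to double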

            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                    *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request.
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(
                        gpuDynInst, false, req);
                }
            }
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            }
            else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            else
                return(num_dest_operands);
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

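    // Two-stage decode for ld: decodeLd() inspects the destination
    // register kind and the BRIG type to pick the destination data
    // type, then decodeLd2() inspects the address operand (operand 1)
    // to pick the address-operand template parameter.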
    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                   tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }

    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
        switch(dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
              case Brig::BRIG_TYPE_B16:
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U8:
              case Brig::BRIG_TYPE_U16:
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S8:
              case Brig::BRIG_TYPE_S16:
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F16:
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }

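    // Base class for st-style instructions. As with loads, the
    // constructor pulls the segment, memory order/scope and equivalence
    // class from the BrigInstMem or BrigInstAtomic encoding and sets
    // the matching flags; the source operand may be either a register
    // or an immediate (constant bytes).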
    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;
        unsigned int equivClass;

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Store);

            if (ib->opcode == BRIG_OPCODE_ST) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const BrigOperand *baseOp = obj->getOperand(op_offs);

                if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                    (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                    src.init(op_offs, obj);
                }

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                equivClass = 0;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                src.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("StInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("StInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("St: segment %d not supported\n", segment);
            }
        }

        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };


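    // Concrete store instruction. Stores may take up to four source
    // registers when the source is an operand list (vector store).
    // initiateAcc() first issues a release fence if the store has
    // release semantics and separate acquire/release is enabled, then
    // execSt() performs the actual writes (to the LDS chunk for local
    // accesses, or via memory requests otherwise).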
    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                        const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            BrigRegOperandInfo rinfo;
            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                const Brig::BrigOperandConstantBytes *op =
                    (Brig::BrigOperandConstantBytes*)baseOp;

                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                           Brig::BRIG_TYPE_NONE);
            } else {
                rinfo = findRegDataType(op_offs, obj);
            }

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;

                num_src_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before performing a store, check if this store has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isRelease()) {

                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(
                        gpuDynInst, false, req);

                    return;
                }
            }

            // if there is no release semantic, perform stores immediately
            execSt(gpuDynInst);
        }

        // stores don't write anything back, so there is nothing
        // to do here. we only override this method to avoid the
        // fatal in the base class implementation
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }

      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // store to shared memory
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            Request *req =
                              new Request(0, vaddr, sizeof(c0), 0,
                                          gpuDynInst->computeUnit()->masterId(),
                                          0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // translation is performed in sendRequest()
                            // the request will be finished when the store
                            // completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);

                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            else
                return num_src_operands;
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

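    // Decode a store (or the store half of an atomic). For plain st the
    // source is operand 0 and the address operand 1; for atomics the
    // order is reversed. The address operand kind then selects the
    // address-operand template parameter, as for loads.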
    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }

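    // Base class for atomic instructions. NumSrcOperands is the number
    // of data source operands (for example, compare-and-swap takes two)
    // and HasDst selects between the atomic encoding, which returns the
    // old value in dest, and atomicNoRet, which changes where the
    // address and sources sit in the operand list.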
1210    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
1211             bool HasDst>
1212    class AtomicInstBase : public HsailGPUStaticInst
1213    {
1214      public:
1215        typename OperandType::DestOperand dest;
1216        typename OperandType::SrcOperand src[NumSrcOperands];
1217        AddrOperandType addr;
1218
1219        Brig::BrigSegment segment;
1220        Brig::BrigMemoryOrder memoryOrder;
1221        Brig::BrigAtomicOperation atomicOperation;
1222        Brig::BrigMemoryScope memoryScope;
1223        Brig::BrigOpcode opcode;
1224
1225        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
1226                       const char *_opcode)
1227           : HsailGPUStaticInst(obj, _opcode)
1228        {
1229            using namespace Brig;
1230
1231            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
1232
1233            segment = (BrigSegment)at->segment;
1234            memoryScope = (BrigMemoryScope)at->memoryScope;
1235            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
1236            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
1237            opcode = (BrigOpcode)ib->opcode;
1238
1239            assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
1240                   opcode == Brig::BRIG_OPCODE_ATOMIC);
1241
1242            setFlag(MemoryRef);
1243
1244            if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
1245                setFlag(AtomicReturn);
1246            } else {
1247                setFlag(AtomicNoReturn);
1248            }
1249
1250            switch (memoryOrder) {
1251              case BRIG_MEMORY_ORDER_NONE:
1252                setFlag(NoOrder);
1253                break;
1254              case BRIG_MEMORY_ORDER_RELAXED:
1255                setFlag(RelaxedOrder);
1256                break;
1257              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
1258                setFlag(Acquire);
1259                break;
1260              case BRIG_MEMORY_ORDER_SC_RELEASE:
1261                setFlag(Release);
1262                break;
1263              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
1264                setFlag(AcquireRelease);
1265                break;
1266              default:
1267                fatal("AtomicInst has bad memory order type\n");
1268            }
1269
1270            switch (memoryScope) {
1271              case BRIG_MEMORY_SCOPE_NONE:
1272                setFlag(NoScope);
1273                break;
1274              case BRIG_MEMORY_SCOPE_WORKITEM:
1275                setFlag(WorkitemScope);
1276                break;
1277              case BRIG_MEMORY_SCOPE_WORKGROUP:
1278                setFlag(WorkgroupScope);
1279                break;
1280              case BRIG_MEMORY_SCOPE_AGENT:
1281                setFlag(DeviceScope);
1282                break;
1283              case BRIG_MEMORY_SCOPE_SYSTEM:
1284                setFlag(SystemScope);
1285                break;
1286              default:
1287                fatal("AtomicInst has bad memory scope type\n");
1288            }
1289
1290            switch (atomicOperation) {
1291              case Brig::BRIG_ATOMIC_AND:
1292                setFlag(AtomicAnd);
1293                break;
1294              case Brig::BRIG_ATOMIC_OR:
1295                setFlag(AtomicOr);
1296                break;
1297              case Brig::BRIG_ATOMIC_XOR:
1298                setFlag(AtomicXor);
1299                break;
1300              case Brig::BRIG_ATOMIC_CAS:
1301                setFlag(AtomicCAS);
1302                break;
1303              case Brig::BRIG_ATOMIC_EXCH:
1304                setFlag(AtomicExch);
1305                break;
1306              case Brig::BRIG_ATOMIC_ADD:
1307                setFlag(AtomicAdd);
1308                break;
1309              case Brig::BRIG_ATOMIC_WRAPINC:
1310                setFlag(AtomicInc);
1311                break;
1312              case Brig::BRIG_ATOMIC_WRAPDEC:
1313                setFlag(AtomicDec);
1314                break;
1315              case Brig::BRIG_ATOMIC_MIN:
1316                setFlag(AtomicMin);
1317                break;
1318              case Brig::BRIG_ATOMIC_MAX:
1319                setFlag(AtomicMax);
1320                break;
1321              case Brig::BRIG_ATOMIC_SUB:
1322                setFlag(AtomicSub);
1323                break;
1324              default:
1325                fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
1326            }
1327
1328            switch (segment) {
1329              case BRIG_SEGMENT_GLOBAL:
1330                setFlag(GlobalSegment);
1331                break;
1332              case BRIG_SEGMENT_GROUP:
1333                setFlag(GroupSegment);
1334                break;
1335              case BRIG_SEGMENT_FLAT:
1336                setFlag(Flat);
1337                break;
1338              default:
1339                panic("Atomic: segment %d not supported\n", segment);
1340            }
1341
1342            if (HasDst) {
1343                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
1344                dest.init(op_offs, obj);
1345
1346                op_offs = obj->getOperandPtr(ib->operands, 1);
1347                addr.init(op_offs, obj);
1348
1349                for (int i = 0; i < NumSrcOperands; ++i) {
1350                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
1351                    src[i].init(op_offs, obj);
1352                }
1353            } else {
1354
1355                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
1356                addr.init(op_offs, obj);
1357
1358                for (int i = 0; i < NumSrcOperands; ++i) {
1359                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
1360                    src[i].init(op_offs, obj);
1361                }
1362            }
1363        }
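        // BRIG operand order, as decoded above: returning atomics are encoded
        // as (dest, addr, src0[, src1]); non-returning (atomicnoret) forms
        // drop the destination and are encoded as (addr, src0[, src1]).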
1364
1365        int numSrcRegOperands()
1366        {
1367            int operands = 0;
1368            for (int i = 0; i < NumSrcOperands; i++) {
1369                if (src[i].isVectorRegister()) {
1370                    operands++;
1371                }
1372            }
1373            if (addr.isVectorRegister())
1374                operands++;
1375            return operands;
1376        }
1377        int numDstRegOperands() { return dest.isVectorRegister(); }
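        // getNumOperands() and the index-based accessors below address the
        // operands in the flat order src[0..NumSrcOperands-1], addr, dest.
        // The address operand is counted only when it is held in a register;
        // with a non-register address the last valid index refers to the
        // address rather than the destination.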
1378        int getNumOperands()
1379        {
1380            if (addr.isVectorRegister())
1381                return(NumSrcOperands + 2);
1382            return(NumSrcOperands + 1);
1383        }
1384        bool isVectorRegister(int operandIndex)
1385        {
1386            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
1387            if (operandIndex < NumSrcOperands)
1388                return src[operandIndex].isVectorRegister();
1389            else if (operandIndex == NumSrcOperands)
1390                return(addr.isVectorRegister());
1391            else
1392                return dest.isVectorRegister();
1393        }
1394        bool isCondRegister(int operandIndex)
1395        {
1396            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
1397            if (operandIndex < NumSrcOperands)
1398                return src[operandIndex].isCondRegister();
1399            else if (operandIndex == NumSrcOperands)
1400                return(addr.isCondRegister());
1401            else
1402                return dest.isCondRegister();
1403        }
1404        bool isScalarRegister(int operandIndex)
1405        {
1406            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
1407            if (operandIndex < NumSrcOperands)
1408                return src[operandIndex].isScalarRegister();
1409            else if (operandIndex == NumSrcOperands)
1410                return(addr.isScalarRegister());
1411            else
1412                return dest.isScalarRegister();
1413        }
1414        bool isSrcOperand(int operandIndex)
1415        {
1416            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
1417            if (operandIndex < NumSrcOperands)
1418                return true;
1419            else if (operandIndex == NumSrcOperands)
1420                return(addr.isVectorRegister());
1421            else
1422                return false;
1423        }
1424        bool isDstOperand(int operandIndex)
1425        {
1426            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
1427            // only the destination register, which follows the sources
1428            // and the address operand, is a destination
1429            return operandIndex > NumSrcOperands;
1430        }
1431        int getOperandSize(int operandIndex)
1432        {
1433            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
1434            if (operandIndex < NumSrcOperands)
1435                return(src[operandIndex].opSize());
1436            else if (operandIndex == NumSrcOperands)
1437                return(addr.opSize());
1438            else
1439                return(dest.opSize());
1440        }
1441        int
1442        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
1443        {
1444            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
1445            if (operandIndex < NumSrcOperands)
1446                return(src[operandIndex].regIndex());
1447            else if (operandIndex == NumSrcOperands)
1448                return(addr.regIndex());
1449            else
1450                return(dest.regIndex());
1452        }
1453    };
1454
1455    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
1456             bool HasDst>
1457    class AtomicInst :
1458        public AtomicInstBase<typename MemDataType::OperandType,
1459                              AddrOperandType, NumSrcOperands, HasDst>,
1460        public MemInst
1461    {
1462      public:
1463        void generateDisassembly() override;
1464
1465        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
1466                   const char *_opcode)
1467            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
1468                             NumSrcOperands, HasDst>
1469                (ib, obj, _opcode),
1470              MemInst(MemDataType::memType)
1471        {
1472            init_addr(&this->addr);
1473        }
1474
1475        void
1476        initiateAcc(GPUDynInstPtr gpuDynInst) override
1477        {
1478            // before doing the RMW, check if this atomic has
1479            // release semantics, and if so issue a release first
1480            if (!this->isLocalMem()) {
1481                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
1482                    && (gpuDynInst->isRelease()
1483                    || gpuDynInst->isAcquireRelease())) {
1484
1485                    gpuDynInst->statusBitVector = VectorMask(1);
1486
1487                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
1488                    gpuDynInst->useContinuation = true;
1489
1490                    // create request
1491                    Request *req = new Request(0, 0, 0, 0,
1492                                  gpuDynInst->computeUnit()->masterId(),
1493                                  0, gpuDynInst->wfDynId);
1494                    req->setFlags(Request::RELEASE);
1495                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
1496
1497                    return;
1498                }
1499            }
1500
1501            // if there are no release semantics, execute the RMW immediately
1502            execAtomic(gpuDynInst);
1503
1504        }
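        // Summary of the atomic flow when the shader is configured with
        // separate_acquire_release: release (and acq_rel) atomics first
        // inject a RELEASE fence from initiateAcc() above, with execAtomic()
        // registered as the continuation; execAtomic() then issues the RMW
        // itself and, for acquire semantics, registers execAtomicAcq() as the
        // continuation, which finally injects the trailing ACQUIRE fence.
        // Without separate_acquire_release the RMW is issued directly.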
1505
1506        void
1507        completeAcc(GPUDynInstPtr gpuDynInst) override
1508        {
1509            // only atomic return ops carry the old memory value back to a
1510            // destination register; otherwise there is nothing more to do
1511            if (this->isAtomicRet()) {
1512                // the size of the src operands and the
1513                // memory being operated on must match
1514                // for HSAIL atomics - this assumption may
1515                // not apply to all ISAs
1516                typedef typename MemDataType::CType CType;
1517
1518                Wavefront *w = gpuDynInst->wavefront();
1519                int dst = this->dest.regIndex();
1520                std::vector<uint32_t> regVec;
1521                // virtual->physical VGPR mapping
1522                int physVgpr = w->remap(dst, sizeof(CType), 1);
1523                regVec.push_back(physVgpr);
1524                CType *p1 = &((CType*)gpuDynInst->d_data)[0];
1525
1526                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
1527                    if (gpuDynInst->exec_mask[i]) {
1528                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
1529                                "$%s%d <- %d atomic return value written\n",
1530                                w->computeUnit->cu_id, w->simdId,
1531                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
1532                                dst, *p1);
1533                        // write the value into the physical VGPR. This is a
1534                        // purely functional operation. No timing is modeled.
1535                        w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i);
1536                    }
1537                    ++p1;
1538                }
1539
1540                // Schedule the VRF write of the returned data. This models
1541                // only the timing of the VRF write (including any bank
1542                // conflict penalty); it does not modify the physical VGPR.
1543                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
1544                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
1545                                         sizeof(CType), gpuDynInst->time);
1546
1547                if (this->isGlobalMem()) {
1548                    gpuDynInst->computeUnit()->globalMemoryPipe
1549                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
1550                } else {
1551                    assert(this->isLocalMem());
1552                    gpuDynInst->computeUnit()->localMemoryPipe
1553                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
1554                }
1555            }
1556        }
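        // d_data holds one CType element per lane containing the old memory
        // value returned by the RMW; completeAcc() above copies each active
        // lane's value into the remapped destination VGPR functionally and
        // then charges the VRF write timing separately via vrf->exec().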
1557
1558        void execute(GPUDynInstPtr gpuDynInst) override;
1559
1560      private:
1561        // execAtomic may be called through a continuation
1562        // if the RMW had release semantics. see comment for
1563        // execContinuation in gpu_dyn_inst.hh
1564        void
1565        execAtomic(GPUDynInstPtr gpuDynInst) override
1566        {
1567            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
1568
1569            typedef typename MemDataType::CType c0;
1570
1571            c0 *d = &((c0*) gpuDynInst->d_data)[0];
1572            c0 *e = &((c0*) gpuDynInst->a_data)[0];
1573            c0 *f = &((c0*) gpuDynInst->x_data)[0];
1574
1575            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
1576                if (gpuDynInst->exec_mask[i]) {
1577                    Addr vaddr = gpuDynInst->addr[i];
1578
1579                    if (this->isLocalMem()) {
1580                        Wavefront *wavefront = gpuDynInst->wavefront();
1581                        *d = wavefront->ldsChunk->read<c0>(vaddr);
1582
1583                        if (this->isAtomicAdd()) {
1584                            wavefront->ldsChunk->write<c0>(vaddr,
1585                            wavefront->ldsChunk->read<c0>(vaddr) + (*e));
1586                        } else if (this->isAtomicSub()) {
1587                            wavefront->ldsChunk->write<c0>(vaddr,
1588                            wavefront->ldsChunk->read<c0>(vaddr) - (*e));
1589                        } else if (this->isAtomicMax()) {
1590                            wavefront->ldsChunk->write<c0>(vaddr,
1591                            std::max(wavefront->ldsChunk->read<c0>(vaddr),
1592                            (*e)));
1593                        } else if (this->isAtomicMin()) {
1594                            wavefront->ldsChunk->write<c0>(vaddr,
1595                            std::min(wavefront->ldsChunk->read<c0>(vaddr),
1596                            (*e)));
1597                        } else if (this->isAtomicAnd()) {
1598                            wavefront->ldsChunk->write<c0>(vaddr,
1599                            wavefront->ldsChunk->read<c0>(vaddr) & (*e));
1600                        } else if (this->isAtomicOr()) {
1601                            wavefront->ldsChunk->write<c0>(vaddr,
1602                            wavefront->ldsChunk->read<c0>(vaddr) | (*e));
1603                        } else if (this->isAtomicXor()) {
1604                            wavefront->ldsChunk->write<c0>(vaddr,
1605                            wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
1606                        } else if (this->isAtomicInc()) {
1607                            wavefront->ldsChunk->write<c0>(vaddr,
1608                            wavefront->ldsChunk->read<c0>(vaddr) + 1);
1609                        } else if (this->isAtomicDec()) {
1610                            wavefront->ldsChunk->write<c0>(vaddr,
1611                            wavefront->ldsChunk->read<c0>(vaddr) - 1);
1612                        } else if (this->isAtomicExch()) {
1613                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
1614                        } else if (this->isAtomicCAS()) {
1615                            wavefront->ldsChunk->write<c0>(vaddr,
1616                            (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
1617                            (*f) : wavefront->ldsChunk->read<c0>(vaddr));
1618                        } else {
1619                            fatal("Unrecognized or invalid HSAIL atomic op "
1620                                  "type.\n");
1621                        }
1622                    } else {
1623                        Request *req =
1624                            new Request(0, vaddr, sizeof(c0), 0,
1625                                        gpuDynInst->computeUnit()->masterId(),
1626                                        0, gpuDynInst->wfDynId,
1627                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
1628                                        f));
1629
1630                        gpuDynInst->setRequestFlags(req);
1631                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
1632                        pkt->dataStatic(d);
1633
1634                        if (gpuDynInst->computeUnit()->shader->
1635                            separate_acquire_release &&
1636                            (gpuDynInst->isAcquire())) {
1637                            // if this atomic has acquire semantics,
1638                            // schedule the continuation to perform an
1639                            // acquire after the RMW completes
1640                            gpuDynInst->execContinuation =
1641                                &GPUStaticInst::execAtomicAcq;
1642
1643                            gpuDynInst->useContinuation = true;
1644                        } else {
1645                            // the request will be finished when the RMW completes
1646                            gpuDynInst->useContinuation = false;
1647                        }
1648                        // translation is performed in sendRequest()
1649                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
1650                                                               pkt);
1651                    }
1652                }
1653
1654                ++d;
1655                ++e;
1656                ++f;
1657            }
1658
1659            gpuDynInst->updateStats();
1660        }
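        // Per-lane semantics of the LDS cases above, using CAS as an example
        // (illustrative pseudocode only):
        //     old = ldsChunk->read<c0>(vaddr);             // returned via *d
        //     ldsChunk->write<c0>(vaddr,
        //                         (old == *e) ? *f : old); // *e cmp, *f swap
        // For the global path the same per-lane operation is packaged into
        // the functor built by makeAtomicOpFunctor<c0>(e, f) and is applied
        // on the memory side when the SwapReq packet reaches memory.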
1661
1662        // execAtomicAcq will always be called through a continuation.
1663        // see comment for execContinuation in gpu_dyn_inst.hh
1664        void
1665        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
1666        {
1667            // after performing the RMW, check to see if this instruction
1668            // has acquire semantics, and if so, issue an acquire
1669            if (!this->isLocalMem()) {
1670                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
1671                     && gpuDynInst->isAcquire()) {
1672                    gpuDynInst->statusBitVector = VectorMask(1);
1673
1674                    // the request will be finished when
1675                    // the acquire completes
1676                    gpuDynInst->useContinuation = false;
1677                    // create request
1678                    Request *req = new Request(0, 0, 0, 0,
1679                                  gpuDynInst->computeUnit()->masterId(),
1680                                  0, gpuDynInst->wfDynId);
1681                    req->setFlags(Request::ACQUIRE);
1682                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
1683                }
1684            }
1685        }
1686    };
1687
1688    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
1689    GPUStaticInst*
1690    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
1691    {
1692        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
1693
1694        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
1695            return decodeLd<DataType>(ib, obj);
1696        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
1697            switch (ib->type) {
1698              case Brig::BRIG_TYPE_B8:
1699                return decodeSt<S8,S8>(ib, obj);
1700              case Brig::BRIG_TYPE_B16:
1701                return decodeSt<S16,S16>(ib, obj);
1702              case Brig::BRIG_TYPE_B32:
1703                return decodeSt<S32,S32>(ib, obj);
1704              case Brig::BRIG_TYPE_B64:
1705                return decodeSt<S64,S64>(ib, obj);
1706              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
1707            }
1708        } else {
1709            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
1710                return new AtomicInst<DataType, AddrOperandType,
1711                    NumSrcOperands, false>(ib, obj, "atomicnoret");
1712            else
1713                return new AtomicInst<DataType, AddrOperandType,
1714                    NumSrcOperands, true>(ib, obj, "atomic");
1715        }
1716    }
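    // Illustrative only: for BRIG_OPCODE_ATOMIC with a returning CAS and a
    // 32-bit register address operand, the decode path below resolves to
    // roughly
    //     constructAtomic<DataType, SRegAddrOperand, 2>(ib, obj)
    // which builds
    //     new AtomicInst<DataType, SRegAddrOperand, 2, true>(ib, obj,
    //                                                        "atomic");
    // BRIG_OPCODE_ATOMICNORET yields the same instantiation with
    // HasDst = false and the "atomicnoret" mnemonic.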
1717
1718    template<typename DataType, int NumSrcOperands>
1719    GPUStaticInst*
1720    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
1721    {
1722        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
1723            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;
1724
1725        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);
1726
1727        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
1728
1729        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
1730            return constructAtomic<DataType, NoRegAddrOperand,
1731                                   NumSrcOperands>(ib, obj);
1732        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
1733            // V2/V4 not allowed
1734            switch (tmp.regKind) {
1735              case Brig::BRIG_REGISTER_KIND_SINGLE:
1736                return constructAtomic<DataType, SRegAddrOperand,
1737                                       NumSrcOperands>(ib, obj);
1738              case Brig::BRIG_REGISTER_KIND_DOUBLE:
1739                return constructAtomic<DataType, DRegAddrOperand,
1740                                       NumSrcOperands>(ib, obj);
1741              default:
1742                fatal("Bad atomic register operand kind %d\n", tmp.regKind);
1743            }
1744        } else {
1745            fatal("Bad atomic register operand kind %d\n", tmp.kind);
1746        }
1747    }
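    // The addressing mode selects the AddrOperandType template argument
    // above: a direct (symbol/offset) address uses NoRegAddrOperand, while a
    // register-held address uses SRegAddrOperand for a 32-bit $s register or
    // DRegAddrOperand for a 64-bit $d register; V2/V4 vector register
    // operands are rejected.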
1748
1749
1750    template<typename DataType>
1751    GPUStaticInst*
1752    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
1753    {
1754        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
1755
1756        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
1757            return decodeAtomicHelper<DataType, 2>(ib, obj);
1758        } else {
1759            return decodeAtomicHelper<DataType, 1>(ib, obj);
1760        }
1761    }
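    // CAS is the only atomic operation that carries two source operands (the
    // compare value and the swap value); every other operation decoded here
    // carries a single source, hence NumSrcOperands of 2 vs. 1 both here and
    // in decodeAtomicNoRet() below.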
1762
1763    template<typename DataType>
1764    GPUStaticInst*
1765    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
1766    {
1767        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
1768        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
1769            return decodeAtomicHelper<DataType, 2>(ib, obj);
1770        } else {
1771            return decodeAtomicHelper<DataType, 1>(ib, obj);
1772        }
1773    }
1774} // namespace HsailISA
1775
1776#endif // __ARCH_HSAIL_INSTS_MEM_HH__
1777