/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
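    // helper class that records the size (in bytes) of a memory
    // operand, derived from its BRIG MemType, along with a pointer
    // to the instruction's address operand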
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };

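    // base class for lda (load address) instructions. lda computes
    // the address of its operand and writes it to the destination
    // register; it performs no memory access, hence the ALU flag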
    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(ALU);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly();

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst);
    };

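    // decode an lda instruction based on the kind of its address
    // operand: either a direct address or an address formed from a
    // single/double register base. V2/V4 registers are not allowed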
    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n",
                      regDataType.regKind);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }

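    // base class for ld instructions. decodes the segment, memory
    // order, memory scope, and width from the BRIG instruction (a
    // non-ld opcode here is treated as an atomic load) and sets the
    // corresponding instruction flags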
    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Load);

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                width = ldst->width;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                equivClass = 0;

                width = BRIG_WIDTH_1;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("LdInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("LdInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_KERNARG:
                setFlag(KernArgSegment);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("Ld: segment %d not supported\n", segment);
            }
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };

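    // ld instruction with up to four destination registers, as used
    // by vector loads (e.g., ld_v2/ld_v4). num_dest_operands is
    // decoded from the BRIG operand list; a single-register load
    // uses the dest operand of the base class instead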
    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;

                num_dest_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

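            // set one outstanding-access bit per active lane; these
            // bits are cleared as the individual lane accesses complete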
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {

                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // load from shared memory
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            RequestPtr req = std::make_shared<Request>(0,
                                vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->isAcquire()) {
                                // if this load has acquire semantics,
                                // set the response continuation function
                                // to perform an Acquire request
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;

                                gpuDynInst->useContinuation = true;
                            } else {
                                // the request will be finished when
                                // the load completes
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * this code essentially replaces the long if-else chain
             * that was used in GlobalMemPipeline::exec() to infer the
             * size (single/double) and type (floating point/integer) of
             * the destination register. this is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value
             */
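            // for example, a signed 16-bit load (c1 = int16_t) into a
            // 32-bit vgpr yields c0 = int32_t, while an f64 load into
            // a 64-bit vgpr yields c0 = double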
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;

            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->
                        injectGlobalMemFence(gpuDynInst, false, req);
                }
            }
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            } else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            else
                return(num_dest_operands);
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

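    // second stage of ld decoding: once the memory and destination
    // data types are known, select the address operand type (no
    // register, single register, or double register)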
    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                   tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }

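    // first stage of ld decoding: map the destination register kind
    // and the BRIG instruction type to a destination data type.
    // sub-word loads (e.g., u8/u16) are widened to a 32-bit
    // destination; f16/f32 and f64 reuse U32 and U64 respectively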
    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
        switch (dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
              case Brig::BRIG_TYPE_B16:
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U8:
              case Brig::BRIG_TYPE_U16:
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S8:
              case Brig::BRIG_TYPE_S16:
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F16:
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            };
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }

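    // base class for st instructions. mirrors LdInstBase: decodes the
    // segment, memory order, memory scope, and equivalence class from
    // the BRIG instruction and sets the matching flags. note that for
    // atomic encodings the address is operand 0 and the source is
    // operand 1, the reverse of a plain st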
    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;
        unsigned int equivClass;

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Store);

            if (ib->opcode == BRIG_OPCODE_ST) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const BrigOperand *baseOp = obj->getOperand(op_offs);

                if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                    (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                    src.init(op_offs, obj);
                }

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                equivClass = 0;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                src.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("StInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("StInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("St: segment %d not supported\n", segment);
            }
        }

        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };

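    // st instruction with up to four source registers, as used by
    // vector stores. srcIdx selects the BRIG operand that holds the
    // store data, since atomic encodings place it after the address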
    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            BrigRegOperandInfo rinfo;
            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                const Brig::BrigOperandConstantBytes *op =
                    (Brig::BrigOperandConstantBytes*)baseOp;

                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                           Brig::BRIG_TYPE_NONE);
            } else {
                rinfo = findRegDataType(op_offs, obj);
            }

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;

                num_src_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before performing a store, check if this store has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isRelease()) {

                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->
                        injectGlobalMemFence(gpuDynInst, false, req);

                    return;
                }
            }

            // if there is no release semantic, perform stores immediately
            execSt(gpuDynInst);
        }

        // stores don't write anything back, so there is nothing
        // to do here. we only override this method to avoid the
        // fatal in the base class implementation
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }

      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // store to shared memory
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            RequestPtr req = std::make_shared<Request>(
                                0, vaddr, sizeof(c0), 0,
                                gpuDynInst->computeUnit()->masterId(),
                                0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // translation is performed in sendRequest();
                            // the request will be finished when the store
                            // completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            else
                return num_src_operands;
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

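    // decode an st instruction. for atomic encodings the source and
    // destination operand positions are swapped before the address
    // operand kind is inspected; V2/V4 address registers are not
    // allowed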
    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }

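    // base class for atomic instructions, parameterized on the number
    // of source operands and on whether the operation returns the old
    // value (atomic) or not (atomicNoRet). decodes the segment, memory
    // order, memory scope, and atomic operation and sets the matching
    // flags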
1211    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
1212             bool HasDst>
1213    class AtomicInstBase : public HsailGPUStaticInst
1214    {
1215      public:
1216        typename OperandType::DestOperand dest;
1217        typename OperandType::SrcOperand src[NumSrcOperands];
1218        AddrOperandType addr;
1219
1220        Brig::BrigSegment segment;
1221        Brig::BrigMemoryOrder memoryOrder;
1222        Brig::BrigAtomicOperation atomicOperation;
1223        Brig::BrigMemoryScope memoryScope;
1224        Brig::BrigOpcode opcode;
1225
1226        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
1227                       const char *_opcode)
1228           : HsailGPUStaticInst(obj, _opcode)
1229        {
1230            using namespace Brig;
1231
1232            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
1233
1234            segment = (BrigSegment)at->segment;
1235            memoryScope = (BrigMemoryScope)at->memoryScope;
1236            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
1237            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
1238            opcode = (BrigOpcode)ib->opcode;
1239
1240            assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
1241                   opcode == Brig::BRIG_OPCODE_ATOMIC);
1242
1243            setFlag(MemoryRef);
1244
1245            if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
1246                setFlag(AtomicReturn);
1247            } else {
1248                setFlag(AtomicNoReturn);
1249            }
1250
1251            switch (memoryOrder) {
1252              case BRIG_MEMORY_ORDER_NONE:
1253                setFlag(NoOrder);
1254                break;
1255              case BRIG_MEMORY_ORDER_RELAXED:
1256                setFlag(RelaxedOrder);
1257                break;
1258              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
1259                setFlag(Acquire);
1260                break;
1261              case BRIG_MEMORY_ORDER_SC_RELEASE:
1262                setFlag(Release);
1263                break;
1264              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
1265                setFlag(AcquireRelease);
1266                break;
1267              default:
1268                fatal("AtomicInst has bad memory order type\n");
1269            }
1270
1271            switch (memoryScope) {
1272              case BRIG_MEMORY_SCOPE_NONE:
1273                setFlag(NoScope);
1274                break;
1275              case BRIG_MEMORY_SCOPE_WORKITEM:
1276                setFlag(WorkitemScope);
1277                break;
1278              case BRIG_MEMORY_SCOPE_WORKGROUP:
1279                setFlag(WorkgroupScope);
1280                break;
1281              case BRIG_MEMORY_SCOPE_AGENT:
1282                setFlag(DeviceScope);
1283                break;
1284              case BRIG_MEMORY_SCOPE_SYSTEM:
1285                setFlag(SystemScope);
1286                break;
1287              default:
1288                fatal("AtomicInst has bad memory scope type\n");
1289            }
1290
1291            switch (atomicOperation) {
1292              case Brig::BRIG_ATOMIC_AND:
1293                setFlag(AtomicAnd);
1294                break;
1295              case Brig::BRIG_ATOMIC_OR:
1296                setFlag(AtomicOr);
1297                break;
1298              case Brig::BRIG_ATOMIC_XOR:
1299                setFlag(AtomicXor);
1300                break;
1301              case Brig::BRIG_ATOMIC_CAS:
1302                setFlag(AtomicCAS);
1303                break;
1304              case Brig::BRIG_ATOMIC_EXCH:
1305                setFlag(AtomicExch);
1306                break;
1307              case Brig::BRIG_ATOMIC_ADD:
1308                setFlag(AtomicAdd);
1309                break;
1310              case Brig::BRIG_ATOMIC_WRAPINC:
1311                setFlag(AtomicInc);
1312                break;
1313              case Brig::BRIG_ATOMIC_WRAPDEC:
1314                setFlag(AtomicDec);
1315                break;
1316              case Brig::BRIG_ATOMIC_MIN:
1317                setFlag(AtomicMin);
1318                break;
1319              case Brig::BRIG_ATOMIC_MAX:
1320                setFlag(AtomicMax);
1321                break;
1322              case Brig::BRIG_ATOMIC_SUB:
1323                setFlag(AtomicSub);
1324                break;
1325              default:
1326                fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
1327            }
1328
1329            switch (segment) {
1330              case BRIG_SEGMENT_GLOBAL:
1331                setFlag(GlobalSegment);
1332                break;
1333              case BRIG_SEGMENT_GROUP:
1334                setFlag(GroupSegment);
1335                break;
1336              case BRIG_SEGMENT_FLAT:
1337                setFlag(Flat);
1338                break;
1339              default:
1340                panic("Atomic: segment %d not supported\n", segment);
1341            }
1342
            if (HasDst) {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
                    src[i].init(op_offs, obj);
                }
            } else {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
                    src[i].init(op_offs, obj);
                }
            }
        }

        int numSrcRegOperands()
        {
            int operands = 0;
            for (int i = 0; i < NumSrcOperands; i++) {
                if (src[i].isVectorRegister()) {
                    operands++;
                }
            }
            if (addr.isVectorRegister())
                operands++;
            return operands;
        }
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands()
        {
            if (addr.isVectorRegister())
                return(NumSrcOperands + 2);
            return(NumSrcOperands + 1);
        }
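        // operand indices used below: src[0..NumSrcOperands-1], then the
        // address operand, then (for returning atomics) the destination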
        bool isVectorRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isVectorRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isCondRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isCondRegister());
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isScalarRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isScalarRegister());
            else
                return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return true;
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return false;
        }
        bool isDstOperand(int operandIndex)
        {
            return operandIndex > NumSrcOperands;
        }
        int getOperandSize(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].opSize());
            else if (operandIndex == NumSrcOperands)
                return(addr.opSize());
            else
                return(dest.opSize());
        }
        int
        getRegisterIndex(int operandIndex, GPUDynInstPtr gpuDynInst)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].regIndex());
            else if (operandIndex == NumSrcOperands)
                return(addr.regIndex());
            else
                return(dest.regIndex());
        }
    };
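    // AtomicInst combines the decoded operand state of AtomicInstBase with
    // MemInst's operand-size and address bookkeeping, and implements the
    // functional and timing behavior of HSAIL atomics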
    template<typename MemDataType, typename AddrOperandType,
             int NumSrcOperands, bool HasDst>
    class AtomicInst :
        public AtomicInstBase<typename MemDataType::OperandType,
                              AddrOperandType, NumSrcOperands, HasDst>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : AtomicInstBase<typename MemDataType::OperandType,
                             AddrOperandType, NumSrcOperands, HasDst>
                (ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before doing the RMW, check if this atomic has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->
                    separate_acquire_release
                    && (gpuDynInst->isRelease()
                        || gpuDynInst->isAcquireRelease())) {

                    gpuDynInst->statusBitVector = VectorMask(1);

                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                    gpuDynInst->useContinuation = true;

                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->
                        injectGlobalMemFence(gpuDynInst, false, req);

                    return;
                }
            }

            // if there are no release semantics, execute the RMW immediately
            execAtomic(gpuDynInst);
        }
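        // completeAcc writes the value returned by the atomic back to the
        // destination VGPR, one lane at a time, and then charges the VRF
        // with the timing of that write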
        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this is not an atomic return op, then we
            // have nothing more to do.
            if (this->isAtomicRet()) {
                // the size of the src operands and the
                // memory being operated on must match
                // for HSAIL atomics - this assumption may
                // not apply to all ISAs
                typedef typename MemDataType::CType CType;

                Wavefront *w = gpuDynInst->wavefront();
                int dst = this->dest.regIndex();
                std::vector<uint32_t> regVec;
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(CType), 1);
                regVec.push_back(physVgpr);
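                // d_data holds the per-lane values returned by the atomic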
                CType *p1 = &((CType*)gpuDynInst->d_data)[0];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d atomic return value written\n",
                                w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->
                            write<CType>(physVgpr, *p1, i);
                    }
                    ++p1;
                }

                // Schedule the write of the return value on the VRF. This
                // models only the timing of the VRF write; it does not
                // modify the physical VGPR.
                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                         sizeof(CType), gpuDynInst->time);

                if (this->isGlobalMem()) {
                    gpuDynInst->computeUnit()->globalMemoryPipe
                        .incLoadVRFBankConflictCycles(
                            loadVrfBankConflictCycles);
                } else {
                    assert(this->isLocalMem());
                    gpuDynInst->computeUnit()->localMemoryPipe
                        .incLoadVRFBankConflictCycles(
                            loadVrfBankConflictCycles);
                }
            }
        }

        void execute(GPUDynInstPtr gpuDynInst) override;

      private:
        // execAtomic may be called through a continuation
        // if the RMW had release semantics. see comment for
        // execContinuation in gpu_dyn_inst.hh
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            typedef typename MemDataType::CType c0;

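            // d: per-lane return (old) value; e: first source operand,
            // which is the compare value for CAS; f: second source
            // operand, which is the swap value for CAS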
            c0 *d = &((c0*) gpuDynInst->d_data)[0];
            c0 *e = &((c0*) gpuDynInst->a_data)[0];
            c0 *f = &((c0*) gpuDynInst->x_data)[0];

            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

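                    // LDS (group segment) atomics complete functionally
                    // and immediately: record the old value for the
                    // return register, then compute and write back the
                    // new value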
                    if (this->isLocalMem()) {
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        *d = wavefront->ldsChunk->read<c0>(vaddr);

                        if (this->isAtomicAdd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                        } else if (this->isAtomicSub()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                        } else if (this->isAtomicMax()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::max(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicMin()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::min(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                        } else if (this->isAtomicAnd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                        } else if (this->isAtomicOr()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                        } else if (this->isAtomicXor()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                        } else if (this->isAtomicInc()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + 1);
                        } else if (this->isAtomicDec()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - 1);
                        } else if (this->isAtomicExch()) {
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                        } else if (this->isAtomicCAS()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                                (wavefront->ldsChunk->read<c0>(vaddr) == (*e))
                                ? (*f)
                                : wavefront->ldsChunk->read<c0>(vaddr));
                        } else {
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                        }
                    } else {
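                        // global atomics are sent to the memory system as
                        // SwapReq packets carrying an AtomicOpFunctor
                        // that performs the read-modify-write at the
                        // target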
                        RequestPtr req =
                            std::make_shared<Request>(0, vaddr, sizeof(c0), 0,
                                        gpuDynInst->computeUnit()->masterId(),
                                        0, gpuDynInst->wfDynId,
                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
                                        f));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->isAcquire())) {
                            // if this atomic has acquire semantics,
                            // schedule the continuation to perform an
                            // acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;

                            gpuDynInst->useContinuation = true;
                        } else {
                            // the request will be finished when the
                            // RMW completes
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }

        // execAtomicAcq will always be called through a continuation.
        // see comment for execContinuation in gpu_dyn_inst.hh
        void
        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after performing the RMW, check to see if this instruction
            // has acquire semantics, and if so, issue an acquire
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->
                    separate_acquire_release && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);

                    // the request will be finished when
                    // the acquire completes
                    gpuDynInst->useContinuation = false;
                    // create request
                    RequestPtr req = std::make_shared<Request>(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->
                        injectGlobalMemFence(gpuDynInst, false, req);
                }
            }
        }
    };

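    // atomic_ld/atomic_st are decoded as ordinary loads/stores; every
    // other atomic operation becomes an AtomicInst, with the no-return
    // opcode selecting the HasDst = false specialization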
    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
    GPUStaticInst*
    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
            return decodeLd<DataType>(ib, obj);
        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
                return decodeSt<S8,S8>(ib, obj);
              case Brig::BRIG_TYPE_B16:
                return decodeSt<S16,S16>(ib, obj);
              case Brig::BRIG_TYPE_B32:
                return decodeSt<S32,S32>(ib, obj);
              case Brig::BRIG_TYPE_B64:
                return decodeSt<S64,S64>(ib, obj);
              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
            }
        } else {
            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, false>(ib, obj, "atomicnoret");
            else
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, true>(ib, obj, "atomic");
        }
    }

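    // choose the address-operand template argument from the register kind
    // of the address operand; vector (v2/v4) register operands are not
    // valid addresses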
    template<typename DataType, int NumSrcOperands>
    GPUStaticInst*
    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;

        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return constructAtomic<DataType, NoRegAddrOperand,
                                   NumSrcOperands>(ib, obj);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return constructAtomic<DataType, SRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return constructAtomic<DataType, DRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              default:
                fatal("Bad atomic register kind %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad atomic register operand kind %d\n", tmp.kind);
        }
    }

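    // CAS takes two source operands (the compare and swap values); all
    // other atomic operations take one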
    template<typename DataType>
    GPUStaticInst*
    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_MEM_HH__