/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include <type_traits>

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
#include "gpu-compute/compute_unit.hh"

namespace HsailISA
{
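    /**
     * MemInst is a mixin for the memory instruction classes below. It
     * records the size in bytes of one memory operand, derived from the
     * BRIG memory type, and holds a pointer to the address operand so
     * that generic code can query both without knowing the concrete
     * instruction type.
     */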
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };

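    /**
     * LdaInstBase models the HSAIL lda instruction, which writes the
     * address of a memory location into a destination register. No
     * memory is accessed, so it is flagged as an ALU operation.
     */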
    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(ALU);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst) override;
    };

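    /**
     * decodeLda selects the LdaInst specialization that matches the
     * address operand: a plain address, or an address formed from a
     * single 32-bit (s) or 64-bit (d) register. Vector (v2/v4) register
     * operands are not legal here.
     */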
    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register operand type %d\n",
                      regDataType.regKind);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }

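    /**
     * LdInstBase decodes the fields common to all loads. A plain ld
     * (BRIG_OPCODE_LD) carries its segment, width, and equivalence
     * class in a BrigInstMem and has no memory order or scope, while an
     * atomic load is encoded as a BrigInstAtomic, which supplies the
     * order and scope. The decoded fields are then translated into the
     * generic instruction flags consumed by the timing model.
     */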
    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Load);

            if (ib->opcode == BRIG_OPCODE_LD) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                width = ldst->width;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                equivClass = 0;

                width = BRIG_WIDTH_1;
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

                if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                    dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("LdInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("LdInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_KERNARG:
                setFlag(KernArgSegment);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("Ld: segment %d not supported\n", segment);
            }
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };

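    /**
     * LdInst is the executable load. In addition to the scalar
     * destination inherited from LdInstBase, it supports operand-list
     * (vector) destinations of up to four registers via dest_vect:
     * initiateAcc() issues one memory request per active lane and per
     * destination register, and completeAcc() writes the returned data
     * back to the vector register file.
     */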
    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;

                num_dest_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

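        // Issue the load. LDS accesses complete functionally right
        // here; global accesses become one ReadReq packet per active
        // lane, handed to the compute unit, which performs address
        // translation and the timing-model access.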
        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // load from shared memory
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
                                          gpuDynInst->computeUnit()->masterId(),
                                          0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->isAcquire()) {
                                // if this load has acquire semantics,
                                // set the response continuation function
                                // to perform an Acquire request
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;

                                gpuDynInst->useContinuation = true;
                            } else {
                                // the request will be finished when
                                // the load completes
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c1;

            constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;

            /**
             * this code essentially replaces the long if-else chain
             * that was used in GlobalMemPipeline::exec() to infer the
             * size (single/double) and type (floating point/integer) of
             * the destination register. this is needed for load
             * instructions because the loaded value and the
             * destination type can be of different sizes, and we also
             * need to know if the value we're writing back is floating
             * point and signed/unsigned, so we can properly cast the
             * writeback value
             */
            typedef typename std::conditional<is_vt_32,
                typename std::conditional<std::is_floating_point<c1>::value,
                    float, typename std::conditional<std::is_signed<c1>::value,
                    int32_t, uint32_t>::type>::type,
                typename std::conditional<std::is_floating_point<c1>::value,
                    double, typename std::conditional<std::is_signed<c1>::value,
                    int64_t, uint64_t>::type>::type>::type c0;
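            // For example, a u8 load into a 32-bit VGPR selects
            // c0 = uint32_t, while an f64 load into a 64-bit VGPR
            // selects c0 = double, so the write<c0>() below casts the
            // loaded value to the destination register's type.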

            Wavefront *w = gpuDynInst->wavefront();

            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load
            for (int k = 0; k < num_dest_operands; ++k) {
                assert((sizeof(c1) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                int dst = this->dest.regIndex() + k;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = dest_vect[k].regIndex();
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);

                c1 *p1 =
                    &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];

                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                    *p1, i);
                    }
                    ++p1;
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
            int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                     sizeof(c0), gpuDynInst->time);

            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
            }
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request.
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            } else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            else
                return(num_dest_operands);
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

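    /**
     * Loads are decoded in two steps: decodeLd() picks the destination
     * register type from the instruction's data type, and decodeLd2()
     * then picks the address operand type, yielding the fully
     * specialized LdInst.
     */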
    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                   tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }

    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
        switch (dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
              case Brig::BRIG_TYPE_B16:
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U8:
              case Brig::BRIG_TYPE_U16:
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S8:
              case Brig::BRIG_TYPE_S16:
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F16:
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            }
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            }
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }

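    /**
     * StInstBase decodes the fields common to all stores. As with
     * loads, a plain st (BRIG_OPCODE_ST) takes its segment and
     * equivalence class from a BrigInstMem, while an atomic store is
     * encoded as a BrigInstAtomic with an explicit memory order and
     * scope; note that the two encodings also order the source and
     * address operands differently.
     */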
    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;
        unsigned int equivClass;

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            setFlag(MemoryRef);
            setFlag(Store);

            if (ib->opcode == BRIG_OPCODE_ST) {
                const BrigInstMem *ldst = (const BrigInstMem*)ib;

                segment = (BrigSegment)ldst->segment;
                memoryOrder = BRIG_MEMORY_ORDER_NONE;
                memoryScope = BRIG_MEMORY_SCOPE_NONE;
                equivClass = ldst->equivClass;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                const BrigOperand *baseOp = obj->getOperand(op_offs);

                if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                    (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                    src.init(op_offs, obj);
                }

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);
            } else {
                const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

                segment = (BrigSegment)at->segment;
                memoryScope = (BrigMemoryScope)at->memoryScope;
                memoryOrder = (BrigMemoryOrder)at->memoryOrder;
                equivClass = 0;

                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                src.init(op_offs, obj);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("StInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("StInst has bad memory scope type\n");
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_PRIVATE:
                setFlag(PrivateSegment);
                break;
              case BRIG_SEGMENT_READONLY:
                setFlag(ReadOnlySegment);
                break;
              case BRIG_SEGMENT_SPILL:
                setFlag(SpillSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              case BRIG_SEGMENT_ARG:
                setFlag(ArgSegment);
                break;
              default:
                panic("St: segment %d not supported\n", segment);
            }
        }

        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };

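    /**
     * StInst is the executable store. It supports operand-list (vector)
     * sources of up to four registers via src_vect. If the store has
     * release semantics (and the shader models separate acquire/release
     * operations), initiateAcc() first injects a release fence and
     * defers the store itself to the execSt() continuation.
     */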
    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            BrigRegOperandInfo rinfo;
            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                const Brig::BrigOperandConstantBytes *op =
                    (Brig::BrigOperandConstantBytes*)baseOp;

                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                           Brig::BRIG_TYPE_NONE);
            } else {
                rinfo = findRegDataType(op_offs, obj);
            }

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;

                num_src_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before performing a store, check if this store has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isRelease()) {

                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);

                    return;
                }
            }

            // if there is no release semantic, perform stores immediately
            execSt(gpuDynInst);
        }

        // stores don't write anything back, so there is nothing
        // to do here. we only override this method to avoid the
        // fatal in the base class implementation
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }

      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (this->isLocalMem()) {
                            // store to shared memory
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            Request *req =
                              new Request(0, vaddr, sizeof(c0), 0,
                                          gpuDynInst->computeUnit()->masterId(),
                                          0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // translation is performed in sendRequest();
                            // the request will be finished when the
                            // store completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            else
                return num_src_operands;
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

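    /**
     * decodeSt selects the StInst specialization for the address
     * operand. For plain stores the source is operand 0 and the address
     * operand 1; for atomics the order is reversed, which is why the
     * source index is passed through to the StInst constructor.
     */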
    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }

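    /**
     * AtomicInstBase decodes an atomic (BRIG_OPCODE_ATOMIC) or
     * atomic-no-return (BRIG_OPCODE_ATOMICNORET) instruction: the
     * segment, order, scope, and atomic operation are translated into
     * instruction flags, and the destination (if any), address, and
     * NumSrcOperands source operands are initialized from the operand
     * list.
     */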
    template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
             bool HasDst>
    class AtomicInstBase : public HsailGPUStaticInst
    {
      public:
        typename OperandType::DestOperand dest;
        typename OperandType::SrcOperand src[NumSrcOperands];
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigAtomicOperation atomicOperation;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigOpcode opcode;

        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                       const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

            segment = (BrigSegment)at->segment;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
            opcode = (BrigOpcode)ib->opcode;

            assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET ||
                   opcode == Brig::BRIG_OPCODE_ATOMIC);

            setFlag(MemoryRef);

            if (opcode == Brig::BRIG_OPCODE_ATOMIC) {
                setFlag(AtomicReturn);
            } else {
                setFlag(AtomicNoReturn);
            }

            switch (memoryOrder) {
              case BRIG_MEMORY_ORDER_NONE:
                setFlag(NoOrder);
                break;
              case BRIG_MEMORY_ORDER_RELAXED:
                setFlag(RelaxedOrder);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE:
                setFlag(Acquire);
                break;
              case BRIG_MEMORY_ORDER_SC_RELEASE:
                setFlag(Release);
                break;
              case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
                setFlag(AcquireRelease);
                break;
              default:
                fatal("AtomicInst has bad memory order type\n");
            }

            switch (memoryScope) {
              case BRIG_MEMORY_SCOPE_NONE:
                setFlag(NoScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKITEM:
                setFlag(WorkitemScope);
                break;
              case BRIG_MEMORY_SCOPE_WORKGROUP:
                setFlag(WorkgroupScope);
                break;
              case BRIG_MEMORY_SCOPE_AGENT:
                setFlag(DeviceScope);
                break;
              case BRIG_MEMORY_SCOPE_SYSTEM:
                setFlag(SystemScope);
                break;
              default:
                fatal("AtomicInst has bad memory scope type\n");
            }

            switch (atomicOperation) {
              case Brig::BRIG_ATOMIC_AND:
                setFlag(AtomicAnd);
                break;
              case Brig::BRIG_ATOMIC_OR:
                setFlag(AtomicOr);
                break;
              case Brig::BRIG_ATOMIC_XOR:
                setFlag(AtomicXor);
                break;
              case Brig::BRIG_ATOMIC_CAS:
                setFlag(AtomicCAS);
                break;
              case Brig::BRIG_ATOMIC_EXCH:
                setFlag(AtomicExch);
                break;
              case Brig::BRIG_ATOMIC_ADD:
                setFlag(AtomicAdd);
                break;
              case Brig::BRIG_ATOMIC_WRAPINC:
                setFlag(AtomicInc);
                break;
              case Brig::BRIG_ATOMIC_WRAPDEC:
                setFlag(AtomicDec);
                break;
              case Brig::BRIG_ATOMIC_MIN:
                setFlag(AtomicMin);
                break;
              case Brig::BRIG_ATOMIC_MAX:
                setFlag(AtomicMax);
                break;
              case Brig::BRIG_ATOMIC_SUB:
                setFlag(AtomicSub);
                break;
              default:
                fatal("Bad BrigAtomicOperation code %d\n", atomicOperation);
            }

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                setFlag(GlobalSegment);
                break;
              case BRIG_SEGMENT_GROUP:
                setFlag(GroupSegment);
                break;
              case BRIG_SEGMENT_FLAT:
                setFlag(Flat);
                break;
              default:
                panic("Atomic: segment %d not supported\n", segment);
            }

            if (HasDst) {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
                    src[i].init(op_offs, obj);
                }
            } else {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
                    src[i].init(op_offs, obj);
                }
            }
        }

1360        int numSrcRegOperands()
1361        {
1362            int operands = 0;
1363            for (int i = 0; i < NumSrcOperands; i++) {
1364                if (src[i].isVectorRegister()) {
1365                    operands++;
1366                }
1367            }
1368            if (addr.isVectorRegister())
1369                operands++;
1370            return operands;
1371        }
1372        int numDstRegOperands() { return dest.isVectorRegister(); }
1373        int getNumOperands()
1374        {
1375            if (addr.isVectorRegister())
1376                return(NumSrcOperands + 2);
1377            return(NumSrcOperands + 1);
1378        }
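        // The flat operand index space used by the accessors below:
        //   [0, NumSrcOperands)  source operands
        //   NumSrcOperands       the address operand
        //   > NumSrcOperands     the destination, when present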
        bool isVectorRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isVectorRegister();
            else if (operandIndex == NumSrcOperands)
                return addr.isVectorRegister();
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isCondRegister();
            else if (operandIndex == NumSrcOperands)
                return addr.isCondRegister();
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isScalarRegister();
            else if (operandIndex == NumSrcOperands)
                return addr.isScalarRegister();
            else
                return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return true;
            else if (operandIndex == NumSrcOperands)
                return addr.isVectorRegister();
            else
                return false;
        }
        bool isDstOperand(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return operandIndex > NumSrcOperands;
        }
        int getOperandSize(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].opSize();
            else if (operandIndex == NumSrcOperands)
                return addr.opSize();
            else
                return dest.opSize();
        }
        int getRegisterIndex(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].regIndex();
            else if (operandIndex == NumSrcOperands)
                return addr.regIndex();
            else
                return dest.regIndex();
        }
    };

    template<typename MemDataType, typename AddrOperandType,
             int NumSrcOperands, bool HasDst>
    class AtomicInst :
        public AtomicInstBase<typename MemDataType::OperandType,
                              AddrOperandType, NumSrcOperands, HasDst>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
            : AtomicInstBase<typename MemDataType::OperandType,
                             AddrOperandType, NumSrcOperands, HasDst>
                (ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);
        }

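        // Ordering flow for a global atomic when the shader's
        // separate_acquire_release option is enabled:
        //   1. initiateAcc(): a release or acq_rel op first injects a
        //      release fence and defers the RMW via execContinuation.
        //   2. execAtomic(): issues the RMW; an acquire or acq_rel op
        //      chains execAtomicAcq as the next continuation.
        //   3. execAtomicAcq(): injects the trailing acquire fence.
        // Local (LDS) atomics skip the fences entirely.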
        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before doing the RMW, check if this atomic has
            // release semantics, and if so issue a release first
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && (gpuDynInst->isRelease()
                    || gpuDynInst->isAcquireRelease())) {

                    gpuDynInst->statusBitVector = VectorMask(1);

                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
                    gpuDynInst->useContinuation = true;

                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(
                        gpuDynInst, false, req);

                    return;
                }
            }

            // if there are no release semantics, execute the RMW immediately
            execAtomic(gpuDynInst);
        }

        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this is not an atomic return op, then we
            // have nothing more to do.
            if (this->isAtomicRet()) {
                // the size of the src operands and the
                // memory being operated on must match
                // for HSAIL atomics - this assumption may
                // not apply to all ISAs
                typedef typename MemDataType::CType CType;

                Wavefront *w = gpuDynInst->wavefront();
                int dst = this->dest.regIndex();
                std::vector<uint32_t> regVec;
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(CType), 1);
                regVec.push_back(physVgpr);
                CType *p1 = &((CType*)gpuDynInst->d_data)[0];

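                // d_data holds one returned (old) value per lane; p1 is
                // advanced in step with the execution mask below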
                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->
                            write<CType>(physVgpr, *p1, i);
                    }
                    ++p1;
                }

                // Schedule the write operation of the load data on the VRF.
                // This simply models the timing aspect of the VRF write
                // operation. It does not modify the physical VGPR.
                int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
                    vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
                                         sizeof(CType), gpuDynInst->time);

                if (this->isGlobalMem()) {
                    gpuDynInst->computeUnit()->globalMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                } else {
                    assert(this->isLocalMem());
                    gpuDynInst->computeUnit()->localMemoryPipe
                        .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
                }
            }
        }

        void execute(GPUDynInstPtr gpuDynInst) override;

      private:
        // execAtomic may be called through a continuation
        // if the RMW had release semantics. see comment for
        // execContinuation in gpu_dyn_inst.hh
        void
        execAtomic(GPUDynInstPtr gpuDynInst) override
        {
            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            typedef typename MemDataType::CType c0;

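            // per-lane pointers into the dynamic instruction's buffers:
            // d receives the value read back from memory, e is the first
            // source operand (the addend, or the CAS compare value), and
            // f is the second source operand (the CAS swap value)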
            c0 *d = &((c0*) gpuDynInst->d_data)[0];
            c0 *e = &((c0*) gpuDynInst->a_data)[0];
            c0 *f = &((c0*) gpuDynInst->x_data)[0];

            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

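                    // LDS atomics are performed functionally, in place;
                    // global atomics go out to memory as SwapReq packets
                    // carrying an AtomicOpFunctor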
                    if (this->isLocalMem()) {
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        *d = wavefront->ldsChunk->read<c0>(vaddr);

                        if (this->isAtomicAdd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                        } else if (this->isAtomicSub()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                        } else if (this->isAtomicMax()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            std::max(wavefront->ldsChunk->read<c0>(vaddr),
                            (*e)));
                        } else if (this->isAtomicMin()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            std::min(wavefront->ldsChunk->read<c0>(vaddr),
                            (*e)));
                        } else if (this->isAtomicAnd()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                        } else if (this->isAtomicOr()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                        } else if (this->isAtomicXor()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                        } else if (this->isAtomicInc()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) + 1);
                        } else if (this->isAtomicDec()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            wavefront->ldsChunk->read<c0>(vaddr) - 1);
                        } else if (this->isAtomicExch()) {
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                        } else if (this->isAtomicCAS()) {
                            wavefront->ldsChunk->write<c0>(vaddr,
                            (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                            (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                        } else {
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                        }
                    } else {
                        Request *req =
                            new Request(0, vaddr, sizeof(c0), 0,
                                        gpuDynInst->computeUnit()->masterId(),
                                        0, gpuDynInst->wfDynId,
                                        gpuDynInst->makeAtomicOpFunctor<c0>(e,
                                        f));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->isAcquire())) {
                            // if this atomic has acquire semantics,
                            // schedule the continuation to perform an
                            // acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;

                            gpuDynInst->useContinuation = true;
                        } else {
                            // the request will be finished when the
                            // RMW completes
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }

        // execAtomicAcq will always be called through a continuation.
        // see comment for execContinuation in gpu_dyn_inst.hh
        void
        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after performing the RMW, check to see if this instruction
            // has acquire semantics, and if so, issue an acquire
            if (!this->isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->isAcquire()) {
                    gpuDynInst->statusBitVector = VectorMask(1);

                    // the request will be finished when
                    // the acquire completes
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(
                        gpuDynInst, false, req);
                }
            }
        }
    };

    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
    GPUStaticInst*
    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
            return decodeLd<DataType>(ib, obj);
        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
                return decodeSt<S8, S8>(ib, obj);
              case Brig::BRIG_TYPE_B16:
                return decodeSt<S16, S16>(ib, obj);
              case Brig::BRIG_TYPE_B32:
                return decodeSt<S32, S32>(ib, obj);
              case Brig::BRIG_TYPE_B64:
                return decodeSt<S64, S64>(ib, obj);
              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
            }
        } else {
            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, false>(ib, obj, "atomicnoret");
            else
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, true>(ib, obj, "atomic");
        }
    }

    template<typename DataType, int NumSrcOperands>
    GPUStaticInst*
    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;

        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return constructAtomic<DataType, NoRegAddrOperand,
                                   NumSrcOperands>(ib, obj);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return constructAtomic<DataType, SRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return constructAtomic<DataType, DRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              default:
                fatal("Bad atomic register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad atomic register operand kind %d\n", tmp.kind);
        }
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }
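    // Illustrative instantiation (a sketch, assuming the decoder reaches
    // this path with DataType = S32 and a single-register address): a
    // returning CAS flows decodeAtomic<S32> -> decodeAtomicHelper<S32, 2>
    // -> constructAtomic<S32, SRegAddrOperand, 2> and yields
    // AtomicInst<S32, SRegAddrOperand, 2, true>(ib, obj, "atomic").
    // CAS carries two source operands (compare and swap values); every
    // other atomic op carries one.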

    template<typename DataType>
    GPUStaticInst*
    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_MEM_HH__