mem.hh revision 11536:fdfc2455b091
/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__

#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"

namespace HsailISA
{
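    // MemInst is mixed into each memory instruction class below. It
    // records the access size in bytes, derived from the instruction's
    // memory type, and keeps a pointer to the instruction's address
    // operand so common code can query both without knowing the
    // concrete instruction type.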
    class MemInst
    {
      public:
        MemInst() : size(0), addr_operand(nullptr) { }

        MemInst(Enums::MemType m_type)
        {
            if (m_type == Enums::M_U64 ||
                m_type == Enums::M_S64 ||
                m_type == Enums::M_F64) {
                size = 8;
            } else if (m_type == Enums::M_U32 ||
                       m_type == Enums::M_S32 ||
                       m_type == Enums::M_F32) {
                size = 4;
            } else if (m_type == Enums::M_U16 ||
                       m_type == Enums::M_S16 ||
                       m_type == Enums::M_F16) {
                size = 2;
            } else {
                size = 1;
            }

            addr_operand = nullptr;
        }

        void
        init_addr(AddrOperandBase *_addr_operand)
        {
            addr_operand = _addr_operand;
        }

      private:
        int size;
        AddrOperandBase *addr_operand;

      public:
        int getMemOperandSize() { return size; }
        AddrOperandBase *getAddressOperand() { return addr_operand; }
    };

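    // Base class for lda (load address) instructions. Operand 0 is the
    // destination register and operand 1 is the address whose effective
    // value is computed; no memory is accessed.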
    template<typename DestOperandType, typename AddrOperandType>
    class LdaInstBase : public HsailGPUStaticInst
    {
      public:
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                    const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            dest.init(op_offs, obj);
            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override
        { return dest.isVectorRegister(); }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            return 1;
        }
    };

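    // Concrete lda instruction, parameterized on the destination data
    // type and the kind of address operand selected at decode time.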
    template<typename DestDataType, typename AddrOperandType>
    class LdaInst :
        public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
      public:
        void generateDisassembly() override;

        LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
                const char *_opcode)
            : LdaInstBase<typename DestDataType::OperandType,
                          AddrOperandType>(ib, obj, _opcode)
        {
            init_addr(&this->addr);
        }

        void execute(GPUDynInstPtr gpuDynInst) override;
    };

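    // Decode helper for lda: inspects the address operand's kind and
    // instantiates LdaInst with the matching address operand type.
    // V2/V4 register vectors are not legal address operands here.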
    template<typename DataType>
    GPUStaticInst*
    decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);

        if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
        } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (regDataType.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
              default:
                fatal("Bad ldas register kind %d\n", regDataType.regKind);
            }
        } else {
            fatal("Bad ldas register operand kind %d\n", regDataType.kind);
        }
    }

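    // Base class for ld. Records the segment, memory order/scope and
    // equivalence class, and classifies the operation (o_type) by
    // segment. Plain loads are initialized via initLd; atomic loads
    // (ld with acquire semantics) arrive as BrigInstAtomic and use
    // initAtomicLd.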
    template<typename MemOperandType, typename DestOperandType,
             typename AddrOperandType>
    class LdInstBase : public HsailGPUStaticInst
    {
      public:
        Brig::BrigWidth8_t width;
        typename DestOperandType::DestOperand dest;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigMemoryScope memoryScope;
        unsigned int equivClass;

        bool isArgLoad()
        {
            return segment == Brig::BRIG_SEGMENT_KERNARG ||
                   segment == Brig::BRIG_SEGMENT_ARG;
        }

        void
        initLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
        {
            using namespace Brig;

            const BrigInstMem *ldst = (const BrigInstMem*)ib;

            segment = (BrigSegment)ldst->segment;
            memoryOrder = BRIG_MEMORY_ORDER_NONE;
            memoryScope = BRIG_MEMORY_SCOPE_NONE;
            equivClass = ldst->equivClass;

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                o_type = Enums::OT_GLOBAL_READ;
                break;

              case BRIG_SEGMENT_GROUP:
                o_type = Enums::OT_SHARED_READ;
                break;

              case BRIG_SEGMENT_PRIVATE:
                o_type = Enums::OT_PRIVATE_READ;
                break;

              case BRIG_SEGMENT_READONLY:
                o_type = Enums::OT_READONLY_READ;
                break;

              case BRIG_SEGMENT_SPILL:
                o_type = Enums::OT_SPILL_READ;
                break;

              case BRIG_SEGMENT_FLAT:
                o_type = Enums::OT_FLAT_READ;
                break;

              case BRIG_SEGMENT_KERNARG:
                o_type = Enums::OT_KERN_READ;
                break;

              case BRIG_SEGMENT_ARG:
                o_type = Enums::OT_ARG;
                break;

              default:
                panic("Ld: segment %d not supported\n", segment);
            }

            width = ldst->width;
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                dest.init(op_offs, obj);

            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        void
        initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
                     const char *_opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

            segment = (BrigSegment)at->segment;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            equivClass = 0;

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                o_type = Enums::OT_GLOBAL_READ;
                break;

              case BRIG_SEGMENT_GROUP:
                o_type = Enums::OT_SHARED_READ;
                break;

              case BRIG_SEGMENT_PRIVATE:
                o_type = Enums::OT_PRIVATE_READ;
                break;

              case BRIG_SEGMENT_READONLY:
                o_type = Enums::OT_READONLY_READ;
                break;

              case BRIG_SEGMENT_SPILL:
                o_type = Enums::OT_SPILL_READ;
                break;

              case BRIG_SEGMENT_FLAT:
                o_type = Enums::OT_FLAT_READ;
                break;

              case BRIG_SEGMENT_KERNARG:
                o_type = Enums::OT_KERN_READ;
                break;

              case BRIG_SEGMENT_ARG:
                o_type = Enums::OT_ARG;
                break;

              default:
                panic("Ld: segment %d not supported\n", segment);
            }

            width = BRIG_WIDTH_1;
            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
                dest.init(op_offs, obj);

            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            if (ib->opcode == BRIG_OPCODE_LD) {
                initLd(ib, obj, _opcode);
            } else {
                initAtomicLd(ib, obj, _opcode);
            }
        }

        int numSrcRegOperands() override
        { return(this->addr.isVectorRegister()); }
        int numDstRegOperands() override { return dest.isVectorRegister(); }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isVectorRegister() :
                   this->addr.isVectorRegister());
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isCondRegister() :
                   this->addr.isCondRegister());
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.isScalarRegister() :
                   this->addr.isScalarRegister());
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex > 0)
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return(operandIndex == 0);
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.opSize() :
                   this->addr.opSize());
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return((operandIndex == 0) ? dest.regIndex() :
                   this->addr.regIndex());
        }
    };

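    // Concrete ld instruction. Destinations may be a single register or
    // a v2/v4 register vector, so up to four destination operands are
    // tracked in dest_vect alongside the scalar dest in the base class.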
    template<typename MemDataType, typename DestDataType,
             typename AddrOperandType>
    class LdInst :
        public LdInstBase<typename MemDataType::CType,
                          typename DestDataType::OperandType, AddrOperandType>,
        public MemInst
    {
        typename DestDataType::OperandType::DestOperand dest_vect[4];
        uint16_t num_dest_operands;
        void generateDisassembly() override;

      public:
        LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
            : LdInstBase<typename MemDataType::CType,
                         typename DestDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(MemDataType::memType)
        {
            init_addr(&this->addr);

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);

            if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)brigOp;

                num_dest_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_dest_operands <= 4);
            } else {
                num_dest_operands = 1;
            }

            if (num_dest_operands > 1) {
                assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_dest_operands; ++i) {
                    dest_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

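        // Issue the load: one access per active lane per destination
        // operand. Group-segment loads read the wavefront's LDS chunk
        // directly; all other segments build a Request/Packet and hand
        // it to the compute unit, which performs address translation
        // in sendRequest().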
        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_dest_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_dest_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_dest_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (isLocalMem()) {
                            // load from shared memory
                            *d = gpuDynInst->wavefront()->ldsChunk->
                                read<c0>(vaddr);
                        } else {
                            Request *req = new Request(0, vaddr, sizeof(c0), 0,
                                          gpuDynInst->computeUnit()->masterId(),
                                          0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
                            pkt->dataStatic(d);

                            if (gpuDynInst->computeUnit()->shader->
                                separate_acquire_release &&
                                gpuDynInst->memoryOrder ==
                                Enums::MEMORY_ORDER_SC_ACQUIRE) {
                                // if this load has acquire semantics,
                                // set the response continuation function
                                // to perform an Acquire request
                                gpuDynInst->execContinuation =
                                    &GPUStaticInst::execLdAcq;

                                gpuDynInst->useContinuation = true;
                            } else {
                                // the request will be finished when
                                // the load completes
                                gpuDynInst->useContinuation = false;
                            }
                            // translation is performed in sendRequest()
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after the load has completed, and if the load has acquire
            // semantics, issue an acquire request.
            if (!isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->memoryOrder ==
                    Enums::MEMORY_ORDER_SC_ACQUIRE) {
                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->
                        injectGlobalMemFence(gpuDynInst, false, req);
                }
            }
        }

      public:
        bool
        isLocalMem() const override
        {
            return this->segment == Brig::BRIG_SEGMENT_GROUP;
        }

        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            if (num_dest_operands > 1) {
                return dest_vect[operandIndex].isVectorRegister();
            } else if (num_dest_operands == 1) {
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isVectorRegister();
            }
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isCondRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isCondRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isScalarRegister());
            if (num_dest_operands > 1)
                return dest_vect[operandIndex].isScalarRegister();
            else if (num_dest_operands == 1)
                return LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.isVectorRegister());
            return false;
        }
        bool isDstOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return false;
            return true;
        }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.opSize());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].opSize());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.opSize());
            return 0;
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if ((num_dest_operands != getNumOperands()) &&
                (operandIndex == (getNumOperands()-1)))
                return(this->addr.regIndex());
            if (num_dest_operands > 1)
                return(dest_vect[operandIndex].regIndex());
            else if (num_dest_operands == 1)
                return(LdInstBase<typename MemDataType::CType,
                       typename DestDataType::OperandType,
                       AddrOperandType>::dest.regIndex());
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return(num_dest_operands+1);
            else
                return(num_dest_operands);
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

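    // Second stage of ld decoding: with the memory and destination data
    // types fixed, pick the address operand type from operand 1 and
    // instantiate the LdInst template.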
    template<typename MemDT, typename DestDT>
    GPUStaticInst*
    decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
                   tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new LdInst<MemDT, DestDT,
                                  SRegAddrOperand>(ib, obj, "ld");
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new LdInst<MemDT, DestDT,
                                  DRegAddrOperand>(ib, obj, "ld");
              default:
                fatal("Bad ld register kind %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad ld register operand kind %d\n", tmp.kind);
        }
    }

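    // First stage of ld decoding: map the BRIG destination register
    // kind and instruction type to a destination data type, then defer
    // to decodeLd2. Sub-word and floating-point types load into the
    // matching 32- or 64-bit register class.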
    template<typename MemDT>
    GPUStaticInst*
    decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
        BrigRegOperandInfo dest = findRegDataType(op_offs, obj);

        assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
               dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
        switch (dest.regKind) {
          case Brig::BRIG_REGISTER_KIND_SINGLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
              case Brig::BRIG_TYPE_B16:
              case Brig::BRIG_TYPE_B32:
                return decodeLd2<MemDT, B32>(ib, obj);
              case Brig::BRIG_TYPE_U8:
              case Brig::BRIG_TYPE_U16:
              case Brig::BRIG_TYPE_U32:
                return decodeLd2<MemDT, U32>(ib, obj);
              case Brig::BRIG_TYPE_S8:
              case Brig::BRIG_TYPE_S16:
              case Brig::BRIG_TYPE_S32:
                return decodeLd2<MemDT, S32>(ib, obj);
              case Brig::BRIG_TYPE_F16:
              case Brig::BRIG_TYPE_F32:
                return decodeLd2<MemDT, U32>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            }
          case Brig::BRIG_REGISTER_KIND_DOUBLE:
            switch (ib->type) {
              case Brig::BRIG_TYPE_B64:
                return decodeLd2<MemDT, B64>(ib, obj);
              case Brig::BRIG_TYPE_U64:
                return decodeLd2<MemDT, U64>(ib, obj);
              case Brig::BRIG_TYPE_S64:
                return decodeLd2<MemDT, S64>(ib, obj);
              case Brig::BRIG_TYPE_F64:
                return decodeLd2<MemDT, U64>(ib, obj);
              default:
                fatal("Bad ld register operand type %d, %d\n",
                      dest.regKind, ib->type);
            }
          default:
            fatal("Bad ld register operand type %d, %d\n", dest.regKind,
                  ib->type);
        }
    }

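    // Base class for st, mirroring LdInstBase: plain stores are
    // initialized via initSt, atomic stores via initAtomicSt. Note the
    // operand order differs: atomics put the address first and the
    // source value second.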
    template<typename MemDataType, typename SrcOperandType,
             typename AddrOperandType>
    class StInstBase : public HsailGPUStaticInst
    {
      public:
        typename SrcOperandType::SrcOperand src;
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigMemoryOrder memoryOrder;
        unsigned int equivClass;

        void
        initSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode)
        {
            using namespace Brig;

            const BrigInstMem *ldst = (const BrigInstMem*)ib;

            segment = (BrigSegment)ldst->segment;
            memoryOrder = BRIG_MEMORY_ORDER_NONE;
            memoryScope = BRIG_MEMORY_SCOPE_NONE;
            equivClass = ldst->equivClass;

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                o_type = Enums::OT_GLOBAL_WRITE;
                break;

              case BRIG_SEGMENT_GROUP:
                o_type = Enums::OT_SHARED_WRITE;
                break;

              case BRIG_SEGMENT_PRIVATE:
                o_type = Enums::OT_PRIVATE_WRITE;
                break;

              case BRIG_SEGMENT_READONLY:
                o_type = Enums::OT_READONLY_WRITE;
                break;

              case BRIG_SEGMENT_SPILL:
                o_type = Enums::OT_SPILL_WRITE;
                break;

              case BRIG_SEGMENT_FLAT:
                o_type = Enums::OT_FLAT_WRITE;
                break;

              case BRIG_SEGMENT_ARG:
                o_type = Enums::OT_ARG;
                break;

              default:
                panic("St: segment %d not supported\n", segment);
            }

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            const BrigOperand *baseOp = obj->getOperand(op_offs);

            if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
                (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
                src.init(op_offs, obj);
            }

            op_offs = obj->getOperandPtr(ib->operands, 1);
            addr.init(op_offs, obj);
        }

        void
        initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
                     const char *_opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

            segment = (BrigSegment)at->segment;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            equivClass = 0;

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                o_type = Enums::OT_GLOBAL_WRITE;
                break;

              case BRIG_SEGMENT_GROUP:
                o_type = Enums::OT_SHARED_WRITE;
                break;

              case BRIG_SEGMENT_PRIVATE:
                o_type = Enums::OT_PRIVATE_WRITE;
                break;

              case BRIG_SEGMENT_READONLY:
                o_type = Enums::OT_READONLY_WRITE;
                break;

              case BRIG_SEGMENT_SPILL:
                o_type = Enums::OT_SPILL_WRITE;
                break;

              case BRIG_SEGMENT_FLAT:
                o_type = Enums::OT_FLAT_WRITE;
                break;

              case BRIG_SEGMENT_ARG:
                o_type = Enums::OT_ARG;
                break;

              default:
                panic("St: segment %d not supported\n", segment);
            }

            unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
            addr.init(op_offs, obj);

            op_offs = obj->getOperandPtr(ib->operands, 1);
            src.init(op_offs, obj);
        }

        StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                   const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            if (ib->opcode == BRIG_OPCODE_ST) {
                initSt(ib, obj, _opcode);
            } else {
                initAtomicSt(ib, obj, _opcode);
            }
        }

        int numDstRegOperands() override { return 0; }
        int numSrcRegOperands() override
        {
            return src.isVectorRegister() + this->addr.isVectorRegister();
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return 2;
            else
                return 1;
        }
        bool isVectorRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isVectorRegister() :
                   this->addr.isVectorRegister();
        }
        bool isCondRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isCondRegister() :
                   this->addr.isCondRegister();
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.isScalarRegister() :
                   this->addr.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.opSize() : this->addr.opSize();
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert(operandIndex >= 0 && operandIndex < getNumOperands());
            return !operandIndex ? src.regIndex() : this->addr.regIndex();
        }
    };

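    // Concrete st instruction. Sources may be an immediate, a single
    // register, or a v2/v4 register vector; srcIdx tells the constructor
    // where the source operand lives, since atomics order their
    // operands differently.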
    template<typename MemDataType, typename SrcDataType,
             typename AddrOperandType>
    class StInst :
        public StInstBase<MemDataType, typename SrcDataType::OperandType,
                          AddrOperandType>,
        public MemInst
    {
      public:
        typename SrcDataType::OperandType::SrcOperand src_vect[4];
        uint16_t num_src_operands;
        void generateDisassembly() override;

        StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
               const char *_opcode, int srcIdx)
            : StInstBase<MemDataType, typename SrcDataType::OperandType,
                         AddrOperandType>(ib, obj, _opcode),
              MemInst(SrcDataType::memType)
        {
            init_addr(&this->addr);

            BrigRegOperandInfo rinfo;
            unsigned op_offs = obj->getOperandPtr(ib->operands, srcIdx);
            const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
                const Brig::BrigOperandConstantBytes *op =
                    (Brig::BrigOperandConstantBytes*)baseOp;

                rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
                                           Brig::BRIG_TYPE_NONE);
            } else {
                rinfo = findRegDataType(op_offs, obj);
            }

            if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
                const Brig::BrigOperandOperandList *brigRegVecOp =
                    (const Brig::BrigOperandOperandList*)baseOp;

                num_src_operands =
                    *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;

                assert(num_src_operands <= 4);
            } else {
                num_src_operands = 1;
            }

            if (num_src_operands > 1) {
                assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);

                for (int i = 0; i < num_src_operands; ++i) {
                    src_vect[i].init_from_vect(op_offs, obj, i);
                }
            }
        }

        void
        initiateAcc(GPUDynInstPtr gpuDynInst) override
        {
            // before performing a store, check if this store has
            // release semantics, and if so issue a release first
            if (!isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                    && gpuDynInst->memoryOrder ==
                    Enums::MEMORY_ORDER_SC_RELEASE) {

                    gpuDynInst->statusBitVector = VectorMask(1);
                    gpuDynInst->execContinuation = &GPUStaticInst::execSt;
                    gpuDynInst->useContinuation = true;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::RELEASE);
                    gpuDynInst->computeUnit()->
                        injectGlobalMemFence(gpuDynInst, false, req);

                    return;
                }
            }

            // if there is no release semantic, perform stores immediately
            execSt(gpuDynInst);
        }

        bool
        isLocalMem() const override
        {
            return this->segment == Brig::BRIG_SEGMENT_GROUP;
        }

      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
        // execSt in gpu_static_inst.hh
        void
        execSt(GPUDynInstPtr gpuDynInst) override
        {
            typedef typename MemDataType::CType c0;

            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;

            if (num_src_operands > 1) {
                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
                    if (gpuDynInst->exec_mask[i])
                        gpuDynInst->statusVector.push_back(num_src_operands);
                    else
                        gpuDynInst->statusVector.push_back(0);
            }

            for (int k = 0; k < num_src_operands; ++k) {
                c0 *d = &((c0*)gpuDynInst->d_data)
                    [k * gpuDynInst->computeUnit()->wfSize()];

                for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                    if (gpuDynInst->exec_mask[i]) {
                        Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);

                        if (isLocalMem()) {
                            // store to shared memory
                            gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
                                                                         *d);
                        } else {
                            Request *req =
                              new Request(0, vaddr, sizeof(c0), 0,
                                          gpuDynInst->computeUnit()->masterId(),
                                          0, gpuDynInst->wfDynId);

                            gpuDynInst->setRequestFlags(req);
                            PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
                            pkt->dataStatic<c0>(d);

                            // translation is performed in sendRequest();
                            // the request will be finished when
                            // the store completes
                            gpuDynInst->useContinuation = false;
                            gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
                                                                   i, pkt);
                        }
                    }
                    ++d;
                }
            }

            gpuDynInst->updateStats();
        }

      public:
        bool isVectorRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isVectorRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isVectorRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isVectorRegister();
            return false;
        }
        bool isCondRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isCondRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isCondRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isCondRegister();
            return false;
        }
        bool isScalarRegister(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.isScalarRegister();
            if (num_src_operands > 1)
                return src_vect[operandIndex].isScalarRegister();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.isScalarRegister();
            return false;
        }
        bool isSrcOperand(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            return true;
        }
        bool isDstOperand(int operandIndex) override { return false; }
        int getOperandSize(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.opSize();
            if (num_src_operands > 1)
                return src_vect[operandIndex].opSize();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.opSize();
            return 0;
        }
        int getRegisterIndex(int operandIndex) override
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex == num_src_operands)
                return this->addr.regIndex();
            if (num_src_operands > 1)
                return src_vect[operandIndex].regIndex();
            else if (num_src_operands == 1)
                return StInstBase<MemDataType,
                       typename SrcDataType::OperandType,
                       AddrOperandType>::src.regIndex();
            return -1;
        }
        int getNumOperands() override
        {
            if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
                return num_src_operands + 1;
            else
                return num_src_operands;
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
    };

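    // Decode helper for st and atomic stores. For atomics the address
    // is operand 0 and the stored value operand 1, so srcIdx/destIdx
    // are swapped before the address operand kind is inspected.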
    template<typename DataType, typename SrcDataType>
    GPUStaticInst*
    decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        int srcIdx = 0;
        int destIdx = 1;
        if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
            ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
            srcIdx = 1;
            destIdx = 0;
        }
        unsigned op_offs = obj->getOperandPtr(ib->operands, destIdx);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return new StInst<DataType, SrcDataType,
                              NoRegAddrOperand>(ib, obj, "st", srcIdx);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return new StInst<DataType, SrcDataType,
                                  SRegAddrOperand>(ib, obj, "st", srcIdx);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return new StInst<DataType, SrcDataType,
                                  DRegAddrOperand>(ib, obj, "st", srcIdx);
              default:
                fatal("Bad st register kind %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad st register operand kind %d\n", tmp.kind);
        }
    }

    Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode,
                                           Brig::BrigAtomicOperation brigOp);

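    // Base class for atomic RMW instructions. NumSrcOperands is the
    // number of source value operands (e.g. two for compare-and-swap)
    // and HasDst distinguishes returning atomics from the no-return
    // forms, which changes the operand layout.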
    template<typename OperandType, typename AddrOperandType,
             int NumSrcOperands, bool HasDst>
    class AtomicInstBase : public HsailGPUStaticInst
    {
      public:
        typename OperandType::DestOperand dest;
        typename OperandType::SrcOperand src[NumSrcOperands];
        AddrOperandType addr;

        Brig::BrigSegment segment;
        Brig::BrigMemoryOrder memoryOrder;
        Brig::BrigAtomicOperation atomicOperation;
        Brig::BrigMemoryScope memoryScope;
        Brig::BrigOpcode opcode;
        Enums::MemOpType opType;

        AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
                       const char *_opcode)
           : HsailGPUStaticInst(obj, _opcode)
        {
            using namespace Brig;

            const BrigInstAtomic *at = (const BrigInstAtomic*)ib;

            segment = (BrigSegment)at->segment;
            memoryScope = (BrigMemoryScope)at->memoryScope;
            memoryOrder = (BrigMemoryOrder)at->memoryOrder;
            atomicOperation = (BrigAtomicOperation)at->atomicOperation;
            opcode = (BrigOpcode)ib->opcode;
            opType = brigAtomicToMemOpType(opcode, atomicOperation);

            switch (segment) {
              case BRIG_SEGMENT_GLOBAL:
                o_type = Enums::OT_GLOBAL_ATOMIC;
                break;

              case BRIG_SEGMENT_GROUP:
                o_type = Enums::OT_SHARED_ATOMIC;
                break;

              case BRIG_SEGMENT_FLAT:
                o_type = Enums::OT_FLAT_ATOMIC;
                break;

              default:
                panic("Atomic: segment %d not supported\n", segment);
            }

            if (HasDst) {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                dest.init(op_offs, obj);

                op_offs = obj->getOperandPtr(ib->operands, 1);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 2);
                    src[i].init(op_offs, obj);
                }
            } else {
                unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
                addr.init(op_offs, obj);

                for (int i = 0; i < NumSrcOperands; ++i) {
                    op_offs = obj->getOperandPtr(ib->operands, i + 1);
                    src[i].init(op_offs, obj);
                }
            }
        }

        int numSrcRegOperands()
        {
            int operands = 0;
            for (int i = 0; i < NumSrcOperands; i++) {
                if (src[i].isVectorRegister()) {
                    operands++;
                }
            }
            if (addr.isVectorRegister())
                operands++;
            return operands;
        }
        int numDstRegOperands() { return dest.isVectorRegister(); }
        int getNumOperands()
        {
            if (addr.isVectorRegister())
                return(NumSrcOperands + 2);
            return(NumSrcOperands + 1);
        }
        bool isVectorRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isVectorRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return dest.isVectorRegister();
        }
        bool isCondRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isCondRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isCondRegister());
            else
                return dest.isCondRegister();
        }
        bool isScalarRegister(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return src[operandIndex].isScalarRegister();
            else if (operandIndex == NumSrcOperands)
                return(addr.isScalarRegister());
            else
                return dest.isScalarRegister();
        }
        bool isSrcOperand(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return true;
            else if (operandIndex == NumSrcOperands)
                return(addr.isVectorRegister());
            else
                return false;
        }
        bool isDstOperand(int operandIndex)
        {
            if (operandIndex <= NumSrcOperands)
                return false;
            else
                return true;
        }
        int getOperandSize(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].opSize());
            else if (operandIndex == NumSrcOperands)
                return(addr.opSize());
            else
                return(dest.opSize());
        }
        int getRegisterIndex(int operandIndex)
        {
            assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
            if (operandIndex < NumSrcOperands)
                return(src[operandIndex].regIndex());
            else if (operandIndex == NumSrcOperands)
                return(addr.regIndex());
            else
                return(dest.regIndex());
        }
    };

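    // Concrete atomic instruction. initiateAcc issues a release fence
    // first when the platform models separate acquire/release and the
    // access has release semantics; the RMW itself then runs as a
    // continuation in execAtomic.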
1332    template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
1333             bool HasDst>
1334    class AtomicInst :
1335        public AtomicInstBase<typename MemDataType::OperandType,
1336                              AddrOperandType, NumSrcOperands, HasDst>,
1337        public MemInst
1338    {
1339      public:
1340        void generateDisassembly() override;
1341
1342        AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
1343                   const char *_opcode)
1344            : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
1345                             NumSrcOperands, HasDst>
1346                (ib, obj, _opcode),
1347              MemInst(MemDataType::memType)
1348        {
1349            init_addr(&this->addr);
1350        }
1351
1352        void
1353        initiateAcc(GPUDynInstPtr gpuDynInst) override
1354        {
1355            // before doing the RMW, check if this atomic has
1356            // release semantics, and if so issue a release first
1357            if (!isLocalMem()) {
1358                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
1359                    && (gpuDynInst->memoryOrder ==
1360                    Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder ==
1361                    Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) {
1362
1363                    gpuDynInst->statusBitVector = VectorMask(1);
1364
1365                    gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
1366                    gpuDynInst->useContinuation = true;
1367
1368                    // create request
1369                    Request *req = new Request(0, 0, 0, 0,
1370                                  gpuDynInst->computeUnit()->masterId(),
1371                                  0, gpuDynInst->wfDynId);
1372                    req->setFlags(Request::RELEASE);
1373                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
1374
1375                    return;
1376                }
1377            }
1378
1379            // if there is no release semantic, execute the RMW immediately
1380            execAtomic(gpuDynInst);
1381
1382        }
1383
1384        void execute(GPUDynInstPtr gpuDynInst) override;
1385
1386        bool
1387        isLocalMem() const override
1388        {
1389            return this->segment == Brig::BRIG_SEGMENT_GROUP;
1390        }
1391
1392      private:
1393        // execAtomic may be called through a continuation
1394        // if the RMW had release semantics. see comment for
1395        // execContinuation in gpu_dyn_inst.hh
1396        void
1397        execAtomic(GPUDynInstPtr gpuDynInst) override
1398        {
1399            gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
1400
1401            typedef typename MemDataType::CType c0;
1402
1403            c0 *d = &((c0*) gpuDynInst->d_data)[0];
1404            c0 *e = &((c0*) gpuDynInst->a_data)[0];
1405            c0 *f = &((c0*) gpuDynInst->x_data)[0];
1406
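            // d points at the per-lane destination slots (d_data, which
            // receives the pre-op value), e at the first source operand
            // (a_data), and f at the second source operand (x_data, the
            // swap value used only by CAS).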

            for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
                if (gpuDynInst->exec_mask[i]) {
                    Addr vaddr = gpuDynInst->addr[i];

                    if (isLocalMem()) {
                        Wavefront *wavefront = gpuDynInst->wavefront();
                        *d = wavefront->ldsChunk->read<c0>(vaddr);
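                        // *d now holds the pre-RMW value. Each case below
                        // writes the updated value back; the MO_ANR* cases
                        // correspond to the non-returning (atomicnoret)
                        // forms and update the LDS identically, the value
                        // captured in *d simply going unused.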

                        switch (this->opType) {
                          case Enums::MO_AADD:
                          case Enums::MO_ANRADD:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + (*e));
                            break;
                          case Enums::MO_ASUB:
                          case Enums::MO_ANRSUB:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - (*e));
                            break;
                          case Enums::MO_AMAX:
                          case Enums::MO_ANRMAX:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::max(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                            break;
                          case Enums::MO_AMIN:
                          case Enums::MO_ANRMIN:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                std::min(wavefront->ldsChunk->read<c0>(vaddr),
                                         (*e)));
                            break;
                          case Enums::MO_AAND:
                          case Enums::MO_ANRAND:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) & (*e));
                            break;
                          case Enums::MO_AOR:
                          case Enums::MO_ANROR:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) | (*e));
                            break;
                          case Enums::MO_AXOR:
                          case Enums::MO_ANRXOR:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
                            break;
                          case Enums::MO_AINC:
                          case Enums::MO_ANRINC:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) + 1);
                            break;
                          case Enums::MO_ADEC:
                          case Enums::MO_ANRDEC:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                wavefront->ldsChunk->read<c0>(vaddr) - 1);
                            break;
                          case Enums::MO_AEXCH:
                          case Enums::MO_ANREXCH:
                            wavefront->ldsChunk->write<c0>(vaddr, (*e));
                            break;
                          case Enums::MO_ACAS:
                          case Enums::MO_ANRCAS:
                            wavefront->ldsChunk->write<c0>(vaddr,
                                (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
                                (*f) : wavefront->ldsChunk->read<c0>(vaddr));
                            break;
                          default:
                            fatal("Unrecognized or invalid HSAIL atomic op "
                                  "type.\n");
                            break;
                        }
                    } else {
                        Request *req =
                            new Request(0, vaddr, sizeof(c0), 0,
                                        gpuDynInst->computeUnit()->masterId(),
                                        0, gpuDynInst->wfDynId,
                                        gpuDynInst->makeAtomicOpFunctor<c0>(
                                            e, f, this->opType));

                        gpuDynInst->setRequestFlags(req);
                        PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
                        pkt->dataStatic(d);

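                        // The memory system applies the atomic-op functor
                        // (built from e, f, and opType) at the target
                        // location and deposits the pre-op value into *d
                        // via the packet's static data pointer.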
                        if (gpuDynInst->computeUnit()->shader->
                            separate_acquire_release &&
                            (gpuDynInst->memoryOrder ==
                             Enums::MEMORY_ORDER_SC_ACQUIRE)) {
                            // if this atomic has acquire semantics,
                            // schedule the continuation to perform an
                            // acquire after the RMW completes
                            gpuDynInst->execContinuation =
                                &GPUStaticInst::execAtomicAcq;

                            gpuDynInst->useContinuation = true;
                        } else {
                            // the request will be finished when the RMW
                            // completes
                            gpuDynInst->useContinuation = false;
                        }
                        // translation is performed in sendRequest()
                        gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
                                                               pkt);
                    }
                }

                ++d;
                ++e;
                ++f;
            }

            gpuDynInst->updateStats();
        }

        // execAtomicAcq will always be called through a continuation.
        // see comment for execContinuation in gpu_dyn_inst.hh
        void
        execAtomicAcq(GPUDynInstPtr gpuDynInst) override
        {
            // after performing the RMW, check to see if this instruction
            // has acquire semantics, and if so, issue an acquire
            if (!isLocalMem()) {
                if (gpuDynInst->computeUnit()->shader->separate_acquire_release
                     && gpuDynInst->memoryOrder ==
                     Enums::MEMORY_ORDER_SC_ACQUIRE) {
                    gpuDynInst->statusBitVector = VectorMask(1);

                    // the request will be finished when
                    // the acquire completes
                    gpuDynInst->useContinuation = false;
                    // create request
                    Request *req = new Request(0, 0, 0, 0,
                                  gpuDynInst->computeUnit()->masterId(),
                                  0, gpuDynInst->wfDynId);
                    req->setFlags(Request::ACQUIRE);
                    gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst,
                                                                    false, req);
                }
            }
        }
    };

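    // constructAtomic() builds the decoded instruction. Atomic loads and
    // stores (BRIG_ATOMIC_LD/BRIG_ATOMIC_ST) are decoded as ordinary ld/st
    // instructions; every other atomic operation becomes an AtomicInst,
    // with HasDst = false for the ATOMICNORET opcode.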
    template<typename DataType, typename AddrOperandType, int NumSrcOperands>
    GPUStaticInst*
    constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
            return decodeLd<DataType>(ib, obj);
        } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
            switch (ib->type) {
              case Brig::BRIG_TYPE_B8:
                return decodeSt<S8,S8>(ib, obj);
              case Brig::BRIG_TYPE_B16:
                return decodeSt<S16,S16>(ib, obj);
              case Brig::BRIG_TYPE_B32:
                return decodeSt<S32,S32>(ib, obj);
              case Brig::BRIG_TYPE_B64:
                return decodeSt<S64,S64>(ib, obj);
              default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
            }
        } else {
            if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, false>(ib, obj, "atomicnoret");
            else
                return new AtomicInst<DataType, AddrOperandType,
                    NumSrcOperands, true>(ib, obj, "atomic");
        }
    }

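    // decodeAtomicHelper() locates the address operand so the correct
    // AddrOperandType can be chosen: atomicnoret has no destination
    // operand, so its address is operand 0; the returning form carries a
    // destination at operand 0 and the address at operand 1.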
    template<typename DataType, int NumSrcOperands>
    GPUStaticInst*
    decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
            Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;

        unsigned op_offs = obj->getOperandPtr(ib->operands, addrIndex);

        BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);

        if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
            return constructAtomic<DataType, NoRegAddrOperand,
                                   NumSrcOperands>(ib, obj);
        } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
            // V2/V4 not allowed
            switch (tmp.regKind) {
              case Brig::BRIG_REGISTER_KIND_SINGLE:
                return constructAtomic<DataType, SRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              case Brig::BRIG_REGISTER_KIND_DOUBLE:
                return constructAtomic<DataType, DRegAddrOperand,
                                       NumSrcOperands>(ib, obj);
              default:
                fatal("Bad atomic register operand type %d\n", tmp.regKind);
            }
        } else {
            fatal("Bad atomic register operand kind %d\n", tmp.kind);
        }
    }

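    // CAS is the only atomic that takes two source operands (the compare
    // value and the swap value); every other operation takes one. Both the
    // returning and non-returning decoders share decodeAtomicHelper().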
    template<typename DataType>
    GPUStaticInst*
    decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }

    template<typename DataType>
    GPUStaticInst*
    decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
    {
        const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;

        if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
            return decodeAtomicHelper<DataType, 2>(ib, obj);
        } else {
            return decodeAtomicHelper<DataType, 1>(ib, obj);
        }
    }
} // namespace HsailISA

#endif // __ARCH_HSAIL_INSTS_MEM_HH__