/*
 * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */
352934Sktlim@umich.edu
362934Sktlim@umich.edu#include "arch/hsail/generic_types.hh"
372934Sktlim@umich.edu#include "gpu-compute/hsail_code.hh"
382934Sktlim@umich.edu
392934Sktlim@umich.edu// defined in code.cc, but not worth sucking in all of code.h for this
402934Sktlim@umich.edu// at this point
413898Ssaidi@eecs.umich.eduextern const char *segmentNames[];
423898Ssaidi@eecs.umich.edu
433898Ssaidi@eecs.umich.edunamespace HsailISA
443898Ssaidi@eecs.umich.edu{
453898Ssaidi@eecs.umich.edu    template<typename DestDataType, typename AddrRegOperandType>
463898Ssaidi@eecs.umich.edu    void
473898Ssaidi@eecs.umich.edu    LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
483898Ssaidi@eecs.umich.edu    {
492934Sktlim@umich.edu        this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
502934Sktlim@umich.edu                                     DestDataType::label,
512934Sktlim@umich.edu                                     this->dest.disassemble(),
522934Sktlim@umich.edu                                     this->addr.disassemble());
532934Sktlim@umich.edu    }
542934Sktlim@umich.edu
552934Sktlim@umich.edu    template<typename DestDataType, typename AddrRegOperandType>
563005Sstever@eecs.umich.edu    void
572934Sktlim@umich.edu    LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
583005Sstever@eecs.umich.edu    {
593005Sstever@eecs.umich.edu        Wavefront *w = gpuDynInst->wavefront();
603304Sstever@eecs.umich.edu
612995Ssaidi@eecs.umich.edu        typedef typename DestDataType::CType CType M5_VAR_USED;
622934Sktlim@umich.edu        const VectorMask &mask = w->get_pred();
632934Sktlim@umich.edu        uint64_t addr_vec[VSZ];
642934Sktlim@umich.edu        this->addr.calcVector(w, addr_vec);
652995Ssaidi@eecs.umich.edu
662934Sktlim@umich.edu        for (int lane = 0; lane < VSZ; ++lane) {
672934Sktlim@umich.edu            if (mask[lane]) {
682934Sktlim@umich.edu                this->dest.set(w, lane, addr_vec[lane]);
692934Sktlim@umich.edu            }
702934Sktlim@umich.edu        }
712995Ssaidi@eecs.umich.edu    }
722934Sktlim@umich.edu
732934Sktlim@umich.edu    template<typename MemDataType, typename DestDataType,
742934Sktlim@umich.edu             typename AddrRegOperandType>
752934Sktlim@umich.edu    void
762934Sktlim@umich.edu    LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
772995Ssaidi@eecs.umich.edu    {
782934Sktlim@umich.edu        switch (num_dest_operands) {
792934Sktlim@umich.edu          case 1:
802953Sktlim@umich.edu            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
812934Sktlim@umich.edu                                         segmentNames[this->segment],
822934Sktlim@umich.edu                                         MemDataType::label,
833449Shsul@eecs.umich.edu                                         this->dest.disassemble(),
842934Sktlim@umich.edu                                         this->addr.disassemble());
852934Sktlim@umich.edu            break;
862934Sktlim@umich.edu          case 2:
872934Sktlim@umich.edu            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
882934Sktlim@umich.edu                                         segmentNames[this->segment],
893584Ssaidi@eecs.umich.edu                                         MemDataType::label,
903584Ssaidi@eecs.umich.edu                                         this->dest_vect[0].disassemble(),
913584Ssaidi@eecs.umich.edu                                         this->dest_vect[1].disassemble(),
923584Ssaidi@eecs.umich.edu                                         this->addr.disassemble());
933584Ssaidi@eecs.umich.edu            break;
943584Ssaidi@eecs.umich.edu          case 4:
953743Sgblack@eecs.umich.edu            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
963584Ssaidi@eecs.umich.edu                                         this->opcode,
973743Sgblack@eecs.umich.edu                                         segmentNames[this->segment],
983743Sgblack@eecs.umich.edu                                         MemDataType::label,
993743Sgblack@eecs.umich.edu                                         this->dest_vect[0].disassemble(),
1003823Ssaidi@eecs.umich.edu                                         this->dest_vect[1].disassemble(),
1013814Ssaidi@eecs.umich.edu                                         this->dest_vect[2].disassemble(),
1023743Sgblack@eecs.umich.edu                                         this->dest_vect[3].disassemble(),
1033743Sgblack@eecs.umich.edu                                         this->addr.disassemble());
1043584Ssaidi@eecs.umich.edu            break;
1053814Ssaidi@eecs.umich.edu          default:
1063584Ssaidi@eecs.umich.edu            fatal("Bad ld register dest operand, num vector operands: %d \n",
1073745Sgblack@eecs.umich.edu                  num_dest_operands);
1083745Sgblack@eecs.umich.edu            break;
1093745Sgblack@eecs.umich.edu        }
1103584Ssaidi@eecs.umich.edu    }
1113898Ssaidi@eecs.umich.edu
1123898Ssaidi@eecs.umich.edu    static Addr
1133898Ssaidi@eecs.umich.edu    calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
1143584Ssaidi@eecs.umich.edu    {
1153584Ssaidi@eecs.umich.edu        // what is the size of the object we are accessing??
1163584Ssaidi@eecs.umich.edu        // NOTE: the compiler doesn't generate enough information
1173745Sgblack@eecs.umich.edu        // to do this yet..have to just line up all the private
1183745Sgblack@eecs.umich.edu        // work-item spaces back to back for now
1193745Sgblack@eecs.umich.edu        /*
1203584Ssaidi@eecs.umich.edu        StorageElement* se =
1213584Ssaidi@eecs.umich.edu            i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
1223584Ssaidi@eecs.umich.edu        assert(se);
1233584Ssaidi@eecs.umich.edu
1243025Ssaidi@eecs.umich.edu        return w->wfSlotId * w->privSizePerItem * VSZ +
1252934Sktlim@umich.edu            se->offset * VSZ +
1262995Ssaidi@eecs.umich.edu            lane * se->size;
1272995Ssaidi@eecs.umich.edu        */
1283025Ssaidi@eecs.umich.edu
1293025Ssaidi@eecs.umich.edu        // addressing strategy: interleave the private spaces of
1303025Ssaidi@eecs.umich.edu        // work-items in a wave-front on 8 byte granularity.
1313025Ssaidi@eecs.umich.edu        // this won't be perfect coalescing like the spill space
1323025Ssaidi@eecs.umich.edu        // strategy, but it's better than nothing. The spill space
1332934Sktlim@umich.edu        // strategy won't work with private because the same address
1342934Sktlim@umich.edu        // may be accessed by different sized loads/stores.
1352934Sktlim@umich.edu
136        // Note: I'm assuming that the largest load/store to private
137        // is 8 bytes. If it is larger, the stride will have to increase
138
139        Addr addr_div8 = addr / 8;
140        Addr addr_mod8 = addr % 8;
141
142        Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
143
144        assert(ret < w->privBase + (w->privSizePerItem * VSZ));
145
146        return ret;
147    }
148
149    template<typename MemDataType, typename DestDataType,
150             typename AddrRegOperandType>
151    void
152    LdInst<MemDataType, DestDataType,
153           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
154    {
155        Wavefront *w = gpuDynInst->wavefront();
156
157        typedef typename MemDataType::CType MemCType;
158        const VectorMask &mask = w->get_pred();
159
160        // Kernarg references are handled uniquely for now (no Memory Request
161        // is used), so special-case them up front.  Someday we should
162        // make this more realistic, at which we should get rid of this
163        // block and fold this case into the switch below.
164        if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
165            MemCType val;
166
167            // I assume no vector ld for kernargs
168            assert(num_dest_operands == 1);
169
170            // assuming for the moment that we'll never do register
171            // offsets into kernarg space... just to make life simpler
172            uint64_t address = this->addr.calcUniform();
173
174            val = *(MemCType*)&w->kernelArgs[address];
175
176            DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
177
178            for (int lane = 0; lane < VSZ; ++lane) {
179                if (mask[lane]) {
180                    this->dest.set(w, lane, val);
181                }
182            }
183
184            return;
185        } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
186            uint64_t address = this->addr.calcUniform();
187            for (int lane = 0; lane < VSZ; ++lane) {
188                if (mask[lane]) {
189                    MemCType val = w->readCallArgMem<MemCType>(lane, address);
190
191                    DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
192                            (unsigned long long)val);
193
194                    this->dest.set(w, lane, val);
195                }
196            }
197
198            return;
199        }
200
201        GPUDynInstPtr m = gpuDynInst;
202
203        this->addr.calcVector(w, m->addr);
204
205        m->m_op = Enums::MO_LD;
206        m->m_type = MemDataType::memType;
207        m->v_type = DestDataType::vgprType;
208
209        m->exec_mask = w->execMask();
210        m->statusBitVector = 0;
211        m->equiv = this->equivClass;
212        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
213
214        m->scope = getGenericMemoryScope(this->memoryScope);
215
216        if (num_dest_operands == 1) {
217            m->dst_reg = this->dest.regIndex();
218            m->n_reg = 1;
219        } else {
220            m->n_reg = num_dest_operands;
221            for (int i = 0; i < num_dest_operands; ++i) {
222                m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
223            }
224        }
225
226        m->simdId = w->simdId;
227        m->wfSlotId = w->wfSlotId;
228        m->wfDynId = w->wfDynId;
229        m->kern_id = w->kern_id;
230        m->cu_id = w->computeUnit->cu_id;
231        m->latency.init(&w->computeUnit->shader->tick_cnt);
232
233        switch (this->segment) {
234          case Brig::BRIG_SEGMENT_GLOBAL:
235            m->s_type = SEG_GLOBAL;
236            m->pipeId = GLBMEM_PIPE;
237            m->latency.set(w->computeUnit->shader->ticks(1));
238
239            // this is a complete hack to get around a compiler bug
240            // (the compiler currently generates global access for private
241            //  addresses (starting from 0). We need to add the private offset)
242            for (int lane = 0; lane < VSZ; ++lane) {
243                if (m->addr[lane] < w->privSizePerItem) {
244                    if (mask[lane]) {
245                        // what is the size of the object we are accessing?
246                        // find base for for this wavefront
247
248                        // calcPrivAddr will fail if accesses are unaligned
249                        assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
250
251                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
252                                                     this);
253
254                        m->addr[lane] = privAddr;
255                    }
256                }
257            }
258
259            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
260            w->outstanding_reqs_rd_gm++;
261            w->rd_gm_reqs_in_pipe--;
262            break;
263
264          case Brig::BRIG_SEGMENT_SPILL:
265            assert(num_dest_operands == 1);
266            m->s_type = SEG_SPILL;
267            m->pipeId = GLBMEM_PIPE;
268            m->latency.set(w->computeUnit->shader->ticks(1));
269            {
270                for (int lane = 0; lane < VSZ; ++lane) {
271                    //  note: this calculation will NOT WORK if the compiler
272                    //  ever generates loads/stores to the same address with
273                    //  different widths (e.g., a ld_u32 addr and a ld_u16 addr)
274                    if (mask[lane]) {
275                        assert(m->addr[lane] < w->spillSizePerItem);
276
277                        m->addr[lane] = m->addr[lane] * w->spillWidth +
278                                        lane * sizeof(MemCType) + w->spillBase;
279
280                        w->last_addr[lane] = m->addr[lane];
281                    }
282                }
283            }
284
285            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
286            w->outstanding_reqs_rd_gm++;
287            w->rd_gm_reqs_in_pipe--;
288            break;
289
290          case Brig::BRIG_SEGMENT_GROUP:
291            m->s_type = SEG_SHARED;
292            m->pipeId = LDSMEM_PIPE;
293            m->latency.set(w->computeUnit->shader->ticks(24));
294            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
295            w->outstanding_reqs_rd_lm++;
296            w->rd_lm_reqs_in_pipe--;
297            break;
298
299          case Brig::BRIG_SEGMENT_READONLY:
300            m->s_type = SEG_READONLY;
301            m->pipeId = GLBMEM_PIPE;
302            m->latency.set(w->computeUnit->shader->ticks(1));
303
304            for (int lane = 0; lane < VSZ; ++lane) {
305                if (mask[lane]) {
306                    assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
307                    m->addr[lane] += w->roBase;
308                }
309            }
310
311            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
312            w->outstanding_reqs_rd_gm++;
313            w->rd_gm_reqs_in_pipe--;
314            break;
315
316          case Brig::BRIG_SEGMENT_PRIVATE:
317            m->s_type = SEG_PRIVATE;
318            m->pipeId = GLBMEM_PIPE;
319            m->latency.set(w->computeUnit->shader->ticks(1));
320            {
321                for (int lane = 0; lane < VSZ; ++lane) {
322                    if (mask[lane]) {
323                        assert(m->addr[lane] < w->privSizePerItem);
324
325                        m->addr[lane] = m->addr[lane] +
326                            lane * sizeof(MemCType) + w->privBase;
327                    }
328                }
329            }
330            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
331            w->outstanding_reqs_rd_gm++;
332            w->rd_gm_reqs_in_pipe--;
333            break;
334
335          default:
336            fatal("Load to unsupported segment %d %llxe\n", this->segment,
337                  m->addr[0]);
338        }
339
340        w->outstanding_reqs++;
341        w->mem_reqs_in_pipe--;
342    }
343
    template<typename OperationType, typename SrcDataType,
             typename AddrRegOperandType>
    void
    StInst<OperationType, SrcDataType,
           AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
    {
        // Issue an HSAIL store: gather per-lane source data into the dynamic
        // instruction's data buffer, translate per-lane addresses for the
        // target segment, and push the request into the appropriate memory
        // pipeline.  Arg-segment stores bypass the memory system entirely.
        Wavefront *w = gpuDynInst->wavefront();

        typedef typename OperationType::CType CType;

        const VectorMask &mask = w->get_pred();

        // arg references are handled uniquely for now (no Memory Request
        // is used), so special-case them up front.  Someday we should
        // make this more realistic, at which we should get rid of this
        // block and fold this case into the switch below.
        if (this->segment == Brig::BRIG_SEGMENT_ARG) {
            uint64_t address = this->addr.calcUniform();

            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    CType data = this->src.template get<CType>(w, lane);
                    DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
                    w->writeCallArgMem<CType>(lane, address, data);
                }
            }

            return;
        }

        GPUDynInstPtr m = gpuDynInst;

        m->exec_mask = w->execMask();

        this->addr.calcVector(w, m->addr);

        // Stage the store data: a single-operand store fills VSZ slots of
        // d_data; a vector store places operand k at offset k * VSZ.
        if (num_src_operands == 1) {
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    ((CType*)m->d_data)[lane] =
                        this->src.template get<CType>(w, lane);
                }
            }
        } else {
            for (int k= 0; k < num_src_operands; ++k) {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        ((CType*)m->d_data)[k * VSZ + lane] =
                            this->src_vect[k].template get<CType>(w, lane);
                    }
                }
            }
        }

        m->m_op = Enums::MO_ST;
        m->m_type = OperationType::memType;
        m->v_type = OperationType::vgprType;

        m->statusBitVector = 0;
        m->equiv = this->equivClass;

        if (num_src_operands == 1) {
            m->n_reg = 1;
        } else {
            m->n_reg = num_src_operands;
        }

        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        // Identify the request for bookkeeping/statistics.
        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        // Per-segment address translation and pipeline selection.
        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));

            // this is a complete hack to get around a compiler bug
            // (the compiler currently generates global access for private
            //  addresses (starting from 0). We need to add the private offset)
            for (int lane = 0; lane < VSZ; ++lane) {
                if (mask[lane]) {
                    if (m->addr[lane] < w->privSizePerItem) {

                        // calcPrivAddr will fail if accesses are unaligned
                        assert(!((sizeof(CType)-1) & m->addr[lane]));

                        Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
                                                     this);

                        m->addr[lane] = privAddr;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_SPILL:
            assert(num_src_operands == 1);
            m->s_type = SEG_SPILL;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                // Spill addresses are interleaved across lanes so that
                // same-sized accesses from one wavefront can coalesce.
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->spillSizePerItem);

                        m->addr[lane] = m->addr[lane] * w->spillWidth +
                                        lane * sizeof(CType) + w->spillBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_wr_lm++;
            w->wr_lm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_PRIVATE:
            m->s_type = SEG_PRIVATE;
            m->pipeId = GLBMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(1));
            {
                for (int lane = 0; lane < VSZ; ++lane) {
                    if (mask[lane]) {
                        assert(m->addr[lane] < w->privSizePerItem);
                        m->addr[lane] = m->addr[lane] + lane *
                            sizeof(CType)+w->privBase;
                    }
                }
            }

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            break;

          default:
            fatal("Store to unsupported segment %d\n", this->segment);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }
507
508    template<typename OperationType, typename SrcDataType,
509             typename AddrRegOperandType>
510    void
511    StInst<OperationType, SrcDataType,
512           AddrRegOperandType>::generateDisassembly()
513    {
514        switch (num_src_operands) {
515          case 1:
516            this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
517                                         segmentNames[this->segment],
518                                         OperationType::label,
519                                         this->src.disassemble(),
520                                         this->addr.disassemble());
521            break;
522          case 2:
523            this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
524                                         segmentNames[this->segment],
525                                         OperationType::label,
526                                         this->src_vect[0].disassemble(),
527                                         this->src_vect[1].disassemble(),
528                                         this->addr.disassemble());
529            break;
530          case 4:
531            this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
532                                         this->opcode,
533                                         segmentNames[this->segment],
534                                         OperationType::label,
535                                         this->src_vect[0].disassemble(),
536                                         this->src_vect[1].disassemble(),
537                                         this->src_vect[2].disassemble(),
538                                         this->src_vect[3].disassemble(),
539                                         this->addr.disassemble());
540            break;
541          default: fatal("Bad ld register src operand, num vector operands: "
542                         "%d \n", num_src_operands);
543            break;
544        }
545    }
546
    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
             bool HasDst>
    void
    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
        HasDst>::execute(GPUDynInstPtr gpuDynInst)
    {
        // Issue an HSAIL atomic: stage the per-lane source operand(s), fill
        // in the request metadata, and push the request to the global or LDS
        // pipeline.  An atomic counts as both a read and a write for the
        // outstanding-request bookkeeping below.
        typedef typename DataType::CType CType;

        Wavefront *w = gpuDynInst->wavefront();

        GPUDynInstPtr m = gpuDynInst;

        this->addr.calcVector(w, m->addr);

        // First source operand goes in a_data (one value per lane).
        for (int lane = 0; lane < VSZ; ++lane) {
            ((CType *)m->a_data)[lane] =
                this->src[0].template get<CType>(w, lane);
        }

        // load second source operand for CAS
        if (NumSrcOperands > 1) {
            for (int lane = 0; lane < VSZ; ++lane) {
                ((CType*)m->x_data)[lane] =
                    this->src[1].template get<CType>(w, lane);
            }
        }

        // At most two source operands are supported (CAS takes two).
        assert(NumSrcOperands <= 2);

        m->m_op = this->opType;
        m->m_type = DataType::memType;
        m->v_type = DataType::vgprType;

        m->exec_mask = w->execMask();
        m->statusBitVector = 0;
        m->equiv = 0;  // atomics don't have an equivalence class operand
        m->n_reg = 1;
        m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);

        m->scope = getGenericMemoryScope(this->memoryScope);

        // Returning atomics record the destination register the response
        // writes back; non-returning variants have no destination.
        if (HasDst) {
            m->dst_reg = this->dest.regIndex();
        }

        // Identify the request for bookkeeping/statistics.
        m->simdId = w->simdId;
        m->wfSlotId = w->wfSlotId;
        m->wfDynId = w->wfDynId;
        m->kern_id = w->kern_id;
        m->cu_id = w->computeUnit->cu_id;
        m->latency.init(&w->computeUnit->shader->tick_cnt);

        // Only global and group (LDS) segments support atomics here; note
        // both rd and wr counters are updated for each.
        switch (this->segment) {
          case Brig::BRIG_SEGMENT_GLOBAL:
            m->s_type = SEG_GLOBAL;
            m->latency.set(w->computeUnit->shader->ticks(64));
            m->pipeId = GLBMEM_PIPE;

            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
            w->outstanding_reqs_wr_gm++;
            w->wr_gm_reqs_in_pipe--;
            w->outstanding_reqs_rd_gm++;
            w->rd_gm_reqs_in_pipe--;
            break;

          case Brig::BRIG_SEGMENT_GROUP:
            m->s_type = SEG_SHARED;
            m->pipeId = LDSMEM_PIPE;
            m->latency.set(w->computeUnit->shader->ticks(24));
            w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
            w->outstanding_reqs_wr_lm++;
            w->wr_lm_reqs_in_pipe--;
            w->outstanding_reqs_rd_lm++;
            w->rd_lm_reqs_in_pipe--;
            break;

          default:
            fatal("Atomic op to unsupported segment %d\n",
                  this->segment);
        }

        w->outstanding_reqs++;
        w->mem_reqs_in_pipe--;
    }
631
632    const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
633
634    template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
635             bool HasDst>
636    void
637    AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
638               HasDst>::generateDisassembly()
639    {
640        if (HasDst) {
641            this->disassembly =
642                csprintf("%s_%s_%s_%s %s,%s", this->opcode,
643                         atomicOpToString(this->atomicOperation),
644                         segmentNames[this->segment],
645                         DataType::label, this->dest.disassemble(),
646                         this->addr.disassemble());
647        } else {
648            this->disassembly =
649                csprintf("%s_%s_%s_%s %s", this->opcode,
650                         atomicOpToString(this->atomicOperation),
651                         segmentNames[this->segment],
652                         DataType::label, this->addr.disassemble());
653        }
654
655        for (int i = 0; i < NumSrcOperands; ++i) {
656            this->disassembly += ",";
657            this->disassembly += this->src[i].disassemble();
658        }
659    }
660} // namespace HsailISA
661